## Project01

In [84]:
from collections import Counter
import re
import pandas as pd

### 1. Text processing

In [None]:
def word_counter(txtList: list[str]) -> dict[str, int]:
    """
    Tally how often each word occurs across all of the given texts.

    Every text is reduced to ASCII letters and whitespace, lowercased and
    split on whitespace; the per-text tallies are merged into one result.

    Args:
        txtList: Raw text strings to analyse.

    Returns:
        A plain dictionary mapping each word to its total occurrence count.
    """
    frequencies = Counter()

    for raw in txtList:
        # Anything that is neither an ASCII letter nor whitespace becomes a space
        letters_only = re.sub(r'[^a-zA-Z\s]', ' ', raw)
        # Collapse whitespace runs so the text is single-space separated
        collapsed = re.sub(r'\s+', ' ', letters_only)
        # Normalise case, split into words and merge into the running tally
        frequencies.update(collapsed.strip().lower().split())

    return dict(frequencies)



In [None]:
# Load the three raw book texts from disk, one file at a time.

raw_texts = []
for book_path in ("raw/book1.txt", "raw/book2.txt", "raw/book3.txt"):
    with open(book_path, "r", encoding="utf-8") as f:
        raw_texts.append(f.read())

# Keep the individual names the later cells refer to
raw_book1, raw_book2, raw_book3 = raw_texts


In [74]:
# Count every word across the three books, then order the (word, count)
# pairs from most to least frequent (ties keep their first-seen order).
words = word_counter([raw_book1, raw_book2, raw_book3])
sorted_words = list(words.items())
sorted_words.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Print the total number of word occurrences, a blank line,
# and then the 20 most frequent words with their counts.

word_number = sum(count for _, count in sorted_words)
print("number of words:", word_number, end="\n\n")

top_twenty = sorted_words[:20]
for key, value in top_twenty:
    print(key, value)

number of words: 165496

the 13923
of 6633
a 4977
and 4599
to 4234
in 3192
is 2501
it 1690
that 1562
be 1509
with 1316
as 1277
on 1193
at 1098
or 1072
by 1055
for 1004
which 987
are 865
from 845


### 2. Showing the 100 most common words with their Zipf scores

In [85]:
def build_zipf_df(sorted_words: list[tuple[str, int]]) -> pd.DataFrame:
    """
    Turn a list of (word, count) pairs into a Zipf table.

    The rank of a word is its 1-based position in `sorted_words`, and its
    zipf_score is counts * rank.

    Returns a DataFrame with the columns: word, counts, rank, zipf_score.
    """
    column_names = ["word", "counts", "rank", "zipf_score"]
    # One record per word; enumerate supplies the 1-based rank
    records = [
        (word, count, position, count * position)
        for position, (word, count) in enumerate(sorted_words, start=1)
    ]
    return pd.DataFrame(records, columns=column_names)

In [86]:
# Build the Zipf table (word, counts, rank, zipf_score) from the word list
# ordered by descending frequency.
df = build_zipf_df(sorted_words)

In [91]:
# Set pandas' row-display limit to 100 so the 100-row slice below is not truncated.
pd.set_option('display.max_rows', 100)
# Last expression of the cell: the first 100 rows of the Zipf table.
df.head(100)

Unnamed: 0,word,counts,rank,zipf_score
0,the,13923,1,13923
1,of,6633,2,13266
2,a,4977,3,14931
3,and,4599,4,18396
4,to,4234,5,21170
5,in,3192,6,19152
6,is,2501,7,17507
7,it,1690,8,13520
8,that,1562,9,14058
9,be,1509,10,15090


### 3. Showing how many words you need to know to read a given percentage of the text

In [103]:
def words_needed_for_coverage(percent: float, sorted_words: list[tuple[str, int]]) -> int:
    """
    Return how many leading entries of `sorted_words` are needed so that
    their counts add up to at least `percent` of all word occurrences.

    Entries are consumed in the order given, so the result is the smallest
    possible vocabulary only when `sorted_words` is ordered from the most
    to the least frequent word.

    Args:
        percent: Fraction of the text to cover, between 0 and 1 inclusive.
        sorted_words: (word, count) pairs.

    Returns:
        The number of leading entries required. This is 0 when `percent`
        is 0 or when `sorted_words` is empty.

    Raises:
        ValueError: If `percent` is outside [0, 1] (NaN included).
    """
    # Negated chained comparison: unlike `percent < 0 or percent > 1`,
    # this also rejects NaN, for which every comparison is False.
    if not 0 <= percent <= 1:
        raise ValueError("percent should be between 0 and 1!")

    total_needed = sum(freq for _, freq in sorted_words) * percent

    covered = 0
    needed_words = 0
    # Iterating the list directly (instead of indexing in a while loop)
    # guarantees the scan can never run past the last entry.
    for _, freq in sorted_words:
        if covered >= total_needed:
            break
        covered += freq
        needed_words += 1

    return needed_words

In [119]:
# Report the vocabulary size, then the number of words needed to cover
# 10%, 20%, ..., 90% of the text.
print(f"Number of different words in texts: {len(sorted_words)}")
print()
print("Number of words you should know to be able to read specific percent of text:")
for pct in range(10, 100, 10):
    # pct / 100 yields the same float as the literals 0.1, 0.2, ..., 0.9
    print(f"{pct}%: {words_needed_for_coverage(pct / 100, sorted_words)}")

Number of different words in texts: 12029

Number of words you should know to be able to read specific percent of text:
10%: 2
20%: 5
30%: 14
40%: 37
50%: 92
60%: 236
70%: 556
80%: 1282
90%: 3113


### 4. showing adjacent pairs of words

In [None]:
def tokenize_texts(texts: list[str]) -> list[str]:
    """
    Flatten several text documents into one list of word tokens.

    Each document is reduced to ASCII letters and whitespace, lowercased
    and split on whitespace. Tokens keep their order within a document,
    and documents are processed in the order given.

    Args:
        texts: Raw text strings.

    Returns:
        All tokens from all texts, in a single flat list.
    """
    # Lazily clean each document: non-letters -> spaces, whitespace runs
    # collapsed to one space, ends trimmed, everything lowercased
    cleaned_documents = (
        re.sub(r'\s+', ' ', re.sub(r'[^a-zA-Z\s]', ' ', raw)).strip().lower()
        for raw in texts
    )

    # Split every cleaned document and chain the tokens together
    return [
        token
        for document in cleaned_documents
        for token in document.split()
    ]


In [125]:
# Tokenize the three books into one flat token list (book1, then book2, then book3).
tokens = tokenize_texts([raw_book1, raw_book2, raw_book3])

In [None]:
def get_adjacent_pairs(tokens: list[str]) -> list[tuple[str, str]]:
    """
    Return all adjacent word pairs from a list of tokens.

    Args:
        tokens: Word tokens in their original order.

    Returns:
        A list of (token, next_token) tuples, one for each position except
        the last. The list is empty when there are fewer than two tokens.
    """
    # Pair each token with its successor; zip stops at the shorter
    # (shifted) sequence, so the last token never starts a pair.
    return list(zip(tokens, tokens[1:]))



In [134]:
# Build the adjacent-pair list once and reuse it for counting,
# instead of calling get_adjacent_pairs a second time.
pairs = get_adjacent_pairs(tokens)
pair_counts = Counter(pairs)
# Print the 20 most frequent adjacent pairs with their counts
for key, value in pair_counts.most_common(20):
    print(key, value)

('of', 'the') 1969
('in', 'the') 902
('to', 'the') 756
('of', 'a') 532
('and', 'the') 479
('on', 'the') 468
('at', 'the') 362
('with', 'the') 335
('from', 'the') 328
('in', 'a') 326
('by', 'the') 323
('to', 'be') 283
('it', 'is') 269
('project', 'gutenberg') 264
('that', 'the') 234
('illustration', 'fig') 232
('with', 'a') 214
('for', 'the') 204
('the', 'other') 197
('may', 'be') 197
