## Lady Montagu's Letters Part 3

In [1]:
import pandas as pd
import numpy as np

from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mervetekgurler/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mervetekgurler/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mervetekgurler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Let's read our dataset in

montagu = pd.read_csv('../Data/montagu/montagu_letters_v3.csv')
montagu.head()

Unnamed: 0,filename,title,addressee,location_original,location_edited,location_wikidata,latitude,longitude,date_original,date_edited,body,body_cleaned,word_count,sentence_count,adjectives,pos_tags
0,letter_1.txt,LETTER I.,TO THE COUNTESS OF ——.,Rotterdam,Rotterdam,Q34370,51.92,4.48,Aug. 3. O. S. 1716.,1716-08-03,"I FLATTER, myself, dear sister, that I shall g...","I FLATTER, myself, dear sister, that I shall g...",633,18,"['dear', 'ill', 'sunday', 'i', 'dry', 'little'...","[('I', 'PRP'), ('FLATTER', 'NNP'), (',', ','),..."
1,letter_2.txt,LET. II,TO MRS. S——.,Hague,Hague,Q36600,52.08,4.31,Aug. 5. O. S. 1716.,1716-08-05,"I MAKE haste to tell you, dear Madam, that, af...","I MAKE haste to tell you, dear Madam, that, af...",344,12,"['dreadful', 'short', 'sure', 'whole', 'large'...","[('I', 'PRP'), ('MAKE', 'VBP'), ('haste', 'NN'..."
2,letter_3.txt,LET. III,TO MRS. S. C.,Nimeguen,Nimeguen,Q47887,51.8475,5.8625,Aug.13. O. S. 1716.,1716-08-13,"I AM extremely sorry, my dear S. that your fea...","I AM extremely sorry, my dear S. that your fea...",573,18,"['dear', 'agreeable', 'nottingham', 'more', 's...","[('I', 'PRP'), ('AM', 'VBP'), ('extremely', 'R..."
3,letter_4.txt,LET. IV.,TO THE LADY ——.,Cologn (sic),Cologne,Q365,50.94222222,6.957778,"Aug, 16. O. S. 1716.",1716-08-16,IF my lady —— could have any notion of the fat...,IF my lady —— could have any notion of the fat...,576,12,"['last', 'great', 'nimeguen', 'indifferent', '...","[('IF', 'NNP'), ('my', 'PRP$'), ('lady', 'NN')..."
4,letter_5.txt,LET. V.,TO THE COUNTESS OF B——.,Nuremberg,Nuremberg,Q2090,49.45388889,11.0775,Aug. 22. O. S. 1716.,1716-08-22,"AFTER five days travelling post, I could not s...","AFTER five days travelling post, I could not s...",743,18,"['other', 'dear', 'large', 'free', 'little', '...","[('AFTER', 'NNP'), ('five', 'CD'), ('days', 'N..."


In [14]:
# Before we can use these letters, we need to clean them up a bit.
# Let's call this preprocessing
def preprocess_text(text):
    """Lowercase and tokenize ``text``, then return the kept tokens as one string.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token is
    kept only when it is not in ``stop_words`` and consists solely of
    alphabetic characters, which drops punctuation and numbers. The kept
    tokens are joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # Skip stopwords as well as anything that is not purely alphabetic
        if token not in stop_words and token.isalpha():
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [15]:
# Let's see what our code does

# Use the cleaned body of the first letter as a sample input
example_text = montagu['body_cleaned'].iloc[0]
print(preprocess_text(example_text))

flatter dear sister shall give pleasure letting know safely passed sea though ill fortune storm persuaded captain yacht set calm pretended nothing easy tide two days slowly moving wind blew hard none sailors could keep feet sunday night tossed handsomely never saw man frighted sic captain part lucky neither suffer fear seasickness though confess impatient see upon dry land would stay till yacht could get rotterdam went helvoetsluys voitures carry us briel charmed neatness little town arrival rotterdam presented new scene pleasure streets paved broad stones many meanest artificers doors placed seats various coloured marbles neatly kept assure walked almost town yesterday incognito slippers without receiving one spot dirt may see dutch maids washing pavement street application town seems full people busy faces motion hardly fancy celebrated fair see every day certain town advantageously situated commerce seven large canals merchants ships come doors houses shops warehouses surprising nea

In [16]:
# Let's apply this to our dataset and save it
montagu['body_preprocessed'] = montagu['body_cleaned'].apply(preprocess_text)
montagu.head()

Unnamed: 0,filename,title,addressee,location_original,location_edited,location_wikidata,latitude,longitude,date_original,date_edited,body,body_cleaned,word_count,sentence_count,adjectives,pos_tags,body_preprocessed
0,letter_1.txt,LETTER I.,TO THE COUNTESS OF ——.,Rotterdam,Rotterdam,Q34370,51.92,4.48,Aug. 3. O. S. 1716.,1716-08-03,"I FLATTER, myself, dear sister, that I shall g...","I FLATTER, myself, dear sister, that I shall g...",633,18,"['dear', 'ill', 'sunday', 'i', 'dry', 'little'...","[('I', 'PRP'), ('FLATTER', 'NNP'), (',', ','),...",flatter dear sister shall give pleasure lettin...
1,letter_2.txt,LET. II,TO MRS. S——.,Hague,Hague,Q36600,52.08,4.31,Aug. 5. O. S. 1716.,1716-08-05,"I MAKE haste to tell you, dear Madam, that, af...","I MAKE haste to tell you, dear Madam, that, af...",344,12,"['dreadful', 'short', 'sure', 'whole', 'large'...","[('I', 'PRP'), ('MAKE', 'VBP'), ('haste', 'NN'...",make haste tell dear madam dreadful fatigues t...
2,letter_3.txt,LET. III,TO MRS. S. C.,Nimeguen,Nimeguen,Q47887,51.8475,5.8625,Aug.13. O. S. 1716.,1716-08-13,"I AM extremely sorry, my dear S. that your fea...","I AM extremely sorry, my dear S. that your fea...",573,18,"['dear', 'agreeable', 'nottingham', 'more', 's...","[('I', 'PRP'), ('AM', 'VBP'), ('extremely', 'R...",extremely sorry dear fears disobliging relatio...
3,letter_4.txt,LET. IV.,TO THE LADY ——.,Cologn (sic),Cologne,Q365,50.94222222,6.957778,"Aug, 16. O. S. 1716.",1716-08-16,IF my lady —— could have any notion of the fat...,IF my lady —— could have any notion of the fat...,576,12,"['last', 'great', 'nimeguen', 'indifferent', '...","[('IF', 'NNP'), ('my', 'PRP$'), ('lady', 'NN')...",lady could notion fatigues suffered two last d...
4,letter_5.txt,LET. V.,TO THE COUNTESS OF B——.,Nuremberg,Nuremberg,Q2090,49.45388889,11.0775,Aug. 22. O. S. 1716.,1716-08-22,"AFTER five days travelling post, I could not s...","AFTER five days travelling post, I could not s...",743,18,"['other', 'dear', 'large', 'free', 'little', '...","[('AFTER', 'NNP'), ('five', 'CD'), ('days', 'N...",five days travelling post could sit write occa...


## Vectors



In this first example, we will repeat something that we did in the last notebook and get word counts but in a matrix.


`CountVectorizer` is used to convert a collection of text documents into a matrix of token counts. Each row corresponds to a document (in your case, each letter), and each column corresponds to a word in the entire corpus.

`fit_transform()` fits the vectorizer on the text and transforms the text into a **document-term matrix (DTM)**, where rows are documents (letters), and columns are words (terms) from the text. The matrix entries are the counts of each word in the respective document.

In [17]:
# First let's see it in a small example
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
# fit_transform learns the vocabulary from the corpus and returns the document-term matrix
X = vectorizer.fit_transform(corpus)

# The feature names are the learned vocabulary, one entry per column of X
print(vectorizer.get_feature_names_out())


['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


As you see, feature names are basically all the unique words in this corpus

In [18]:
# Let's see what our matrix looks like
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


Below is a representation of what happened in a conceptual way

This is our corpus:

```python
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?'
```

This is our vocabulary

`['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']`

Each row of the X matrix corresponds to one sentence in our corpus and each column to one word in our vocabulary. The resulting matrix shows us how many times a given word in the vocabulary appears in a given sentence (0 means the word does not appear in that sentence).

Below I left the actual word where it exists, which is a 1 or 2 in X depending on how many times they appear in the sentence and put a 0 for where the word does not exist

```python
[0 'document' 'first' 'is' 0 0 'the' 0 'this']
[0 'DOCUMENT' 0 'is' 0 'second' 'the' 0 'this']
['and' 0 0 'is' 'one' 0 'the' 'third' 'this']
[0 'document' 'first' 'is' 0 0 'the' 0 'this']
```

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

Document-term matrix

            term 1 | term 2 | term 3 | .....
document 1    1    |     0
document 2    2    |
document 3    5    |
document 4

terms: [this, is, the, first, document, second, and, third, one]

alphabetical order

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']



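|            | 'and' | 'document' | 'first' | 'is' | 'one' | 'second' | 'the' | 'third' | 'this' |
|------------|-------|------------|---------|------|-------|----------|-------|---------|--------|
| document 1 | 0     | 1          | 1       | 1    | 0     | 0        | 1     | 0       | 1      |
| document 2 | 0     | 2          | 0       | 1    | 0     | 1        | 1     | 0       | 1      |
| document 3 | 1     | 0          | 0       | 1    | 1     | 0        | 1     | 1       | 1      |
| document 4 | 0     | 1          | 1       | 1    | 0     | 0        | 1     | 0       | 1      |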



Now, let's do this with our letters. Beware: this will be much bigger.

Remember we had over 6k unique words? One way to address this could be to limit how many features, in this case words, we want to include.

In [19]:
# Initialize the vectorizer with max_features set to 20
vectorizer = CountVectorizer(max_features=20)

In [20]:
# Fit the vectorizer on the 'body_preprocessed' column
X = vectorizer.fit_transform(montagu['body_preprocessed'])

In [21]:
# Let's see what our feature names are
print(vectorizer.get_feature_names_out())

# Make sure we have the right number of features
print(len(vectorizer.get_feature_names_out()))


['could' 'every' 'great' 'large' 'little' 'make' 'many' 'may' 'much'
 'never' 'one' 'people' 'see' 'sic' 'though' 'two' 'upon' 'well' 'without'
 'would']
20


In [23]:
# Let's see what our matrix looks like
print(X.toarray())
print(X.shape)

[[ 2  2  2 ...  0  1  1]
 [ 0  3  2 ...  2  1  1]
 [ 0  1  3 ...  1  0  2]
 ...
 [ 4  1  4 ...  4  2 11]
 [ 2  2  2 ...  0  1  3]
 [ 1  1  2 ...  2  2  1]]
(58, 20)


In [24]:
# We could also convert this to a DataFrame
word_frequencies = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
word_frequencies


Unnamed: 0,could,every,great,large,little,make,many,may,much,never,one,people,see,sic,though,two,upon,well,without,would
0,2,2,2,1,2,2,1,2,3,1,3,1,6,3,2,1,1,0,1,1
1,0,3,2,4,0,2,0,1,0,0,1,1,1,1,0,0,3,2,1,1
2,0,1,3,1,0,1,0,0,3,1,4,1,1,0,0,1,1,1,0,2
3,3,0,2,1,0,0,0,0,1,1,1,0,3,5,4,1,0,3,0,2
4,3,0,0,2,2,1,1,0,1,1,3,4,2,3,0,0,1,1,0,0
5,2,4,3,1,1,0,2,1,2,1,4,1,2,2,2,0,1,1,1,2
6,0,0,3,5,2,0,2,1,2,1,7,2,1,3,2,1,1,5,2,1
7,2,1,3,3,1,1,0,2,3,2,3,2,1,2,0,3,1,4,0,3
8,2,3,3,3,3,4,1,1,4,4,0,0,1,2,1,4,3,3,4,1
9,3,2,4,0,3,2,2,2,4,1,3,1,4,4,0,4,6,4,2,4


In [25]:
# Sum the word counts across all letters
word_counts = word_frequencies.sum(axis=0).sort_values(ascending=False)
word_counts

one        225
much       177
great      172
would      153
see        147
upon       135
little     129
two        119
sic        116
well       109
though     107
without    103
make       102
may         99
many        98
never       97
could       92
large       90
people      90
every       89
dtype: int64

### Remember this plot from yesterday?

<img src="../img/montagu_word_frequency.png" alt="Montagu Word Frequency" width="700"/>

In [26]:
# Let's do this for the whole corpus

# Create a new CountVectorizer instance
vectorizer = CountVectorizer()
# Fit the vectorizer on the 'body_preprocessed' column
X = vectorizer.fit_transform(montagu['body_preprocessed'])
# Get the feature names
feature_names_montagu = vectorizer.get_feature_names_out()
print(feature_names_montagu[:50])

['ab' 'abandon' 'abandoned' 'abate' 'abated' 'abbe' 'abbess' 'abbot'
 'abhorrence' 'abilities' 'ability' 'abject' 'able' 'abolish' 'abominable'
 'abominably' 'abound' 'abridgement' 'abroad' 'abrupt' 'abruptly'
 'absence' 'absent' 'absolute' 'absolutely' 'absolution' 'absurd'
 'absurdities' 'absurdity' 'abundance' 'abydos' 'ac' 'accableed' 'accept'
 'acceptable' 'accepted' 'accepting' 'accident' 'accidentally'
 'accommodated' 'accommodation' 'accommodations' 'accompanied'
 'accompanies' 'accompany' 'accomplish' 'accomplished' 'accomplishment'
 'accomplishments' 'according']


In [27]:
len(feature_names_montagu)

6941

The feature names (in this case, the vocabulary) are in alphabetical order, not in order of frequency.

In [28]:
# Convert the sparse matrix to a numpy array
dtm_montagu = X.toarray()
# Can we like actually see this?
dtm_montagu

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], shape=(58, 6941))

This is kinda empty? Well yes and no, the issue here is that we have a pretty big vocabulary 6941 words to be exact and 58 letters. A lot of the letters won't have a lot of the words.

In [30]:
# Let's sum the elements in each column and get the top 50 words
word_counts_montagu = dtm_montagu.sum(axis=0)
# Create a DataFrame with the word counts
word_counts_df_montagu = pd.DataFrame({'word': feature_names_montagu, 'count': word_counts_montagu})
# Sort the DataFrame by count in descending order
word_counts_df_montagu = word_counts_df_montagu.sort_values(by='count', ascending=False)
# Get the top 50 words
print('Top 50 Words: \n', word_counts_df_montagu.head(50))

# here top 20 will be the same as the previous CountVectorizer example

Top 50 Words: 
            word  count
4258        one    225
3983       much    177
2769      great    172
6886      would    153
5506        see    147
6548       upon    135
3596     little    129
6472        two    119
5649        sic    116
6781       well    109
6254     though    107
6841    without    103
3708       make    102
3782        may     99
3731       many     98
4087      never     97
1393      could     92
3470      large     90
4484     people     90
2163      every     89
2409       fine     89
6289       time     88
3425       know     87
6335       town     83
6240      think     82
3681       made     80
4145    nothing     78
4386       part     77
2725       good     76
2422      first     76
593     believe     73
2694       give     72
6927        yet     70
3441     ladies     69
5514       seen     68
6556         us     68
3443       lady     67
6178       tell     66
6877      world     66
3725     manner     65
1411      court     65
168   agreeable   

### Solution 1: Text Normalization

1. Lemmatization

With lemmatization we can reduce the size of our vocabulary and recalculate our matrix.

In the previous notebook we visualized all the nouns in this corpus.

<img src="../img/montagu_top_50_nouns.png" alt="Montagu Top 50 Nouns" width="700"/>


When we take a look at the top 50 words, what we see is that there are some plural and singular nouns. For example men and man both appear in this data. 

In some cases, this kind of information is super super useful. In other cases however we might want to turn all nouns into their singular forms to count them.

More generally, this is a text normalization task called lemmatization. *Lemmatization is the task of turning inflected words into their dictionary forms.* Lemmatization algorithms can be complex. For this reason we sometimes make use of a simpler but cruder method, which mainly consists of chopping off word-final affixes. This naive version of morphological analysis is called *stemming.*

You can read more [here](https://web.stanford.edu/~jurafsky/slp3/ed3book_Jan25.pdf)



[What's the difference between stemming and lemmatizing?](https://stackoverflow.com/questions/1787110/what-is-the-difference-between-lemmatization-vs-stemming)

"**Stemming** usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. **Lemmatization** usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma. 

If confronted with the token "saw", **stemming** might return just "s", whereas **lemmatization** would attempt to return either "see" or "saw" depending on whether the use of the token was as a verb or a noun. The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different inflectional forms of a lemma." See [this post](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html) for more information.



In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [None]:
# We will do this with spaCy https://spacy.io/ and https://spacy.io/api/lemmatizer

import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm", disable=["ner", "textcat"])  # Disable NER and text classification for speed

In [None]:
# Let's get the first letter from the dataset to test our spaCy implementation
letter_1 = montagu['body_cleaned'].iloc[0]

In [None]:
doc = nlp(letter_1)  # Process the letter with spaCy

In [None]:
doc[:20]  # Display the first 20 tokens in the document

In [None]:
# Let's check the word at index 16 and its lemma.
# Both values are read from the same spaCy token: NLTK's word_tokenize and spaCy
# tokenize independently, so the same index is not guaranteed to point at the
# same word in both token lists.

print(doc[16].text)
print(doc[16].lemma_)

In [None]:
# Let's get all our lemmas
lemmatized_tokens = [token.lemma_ for token in doc]

In [None]:
# Let's create a list of (original word, lemma) tuples, keeping only alphabetic tokens
lemmatized_pairs = [(token.text, token.lemma_) for token in doc if token.is_alpha]
# Display the first 10 lemmatized pairs
print(lemmatized_pairs[:10])

In [None]:
# Let's print the ones that have changed, side by side, for better readability
for original, lemma in lemmatized_pairs:
    if original != lemma:  # Only print pairs where the original and lemma are different
        print(f"{original} -> {lemma}")

In [None]:
# Let's turn this into a function that we can apply to our dataset
def lemmatize_text(text):
    """Return the lemmas of ``text`` as one lowercased, space-separated string.

    The text is processed with the spaCy pipeline ``nlp``. The lemmas of all
    tokens are joined with spaces, lowercased and split on whitespace again;
    only words that are purely alphabetic and not in ``stop_words`` are kept.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    lowered = ' '.join(lemmas).lower()
    kept_words = []
    for word in lowered.split():
        # Filter after lowercasing so the lowercase stopword list matches
        if word.isalpha() and word not in stop_words:
            kept_words.append(word)
    return ' '.join(kept_words)

In [None]:
# testing our lemmatization function on the first letter

letter_1_lemmatized = lemmatize_text(letter_1)
print(letter_1[:1000])
print(letter_1_lemmatized[:1000])

In [None]:
# Let's lemmatize the entire dataset
montagu['body_lemmatized'] = montagu['body_cleaned'].apply(lemmatize_text)
montagu.head()

In [None]:
# What did we change?
# Let's see the number of unique words in the preprocessed and lemmatized text

# Concatenate all the texts in the 'body_preprocessed' and 'body_lemmatized' columns
all_preprocessed_text = ' '.join(montagu['body_preprocessed'].astype(str))
all_lemmatized_text = ' '.join(montagu['body_lemmatized'].astype(str))

In [None]:
# Split the concatenated text into words and count unique words
unique_preprocessed_words = len(set(all_preprocessed_text.split()))
unique_lemmatized_words = len(set(all_lemmatized_text.split()))

print("Unique words in preprocessed text:", unique_preprocessed_words)
print("Unique words in lemmatized text:", unique_lemmatized_words)

Our vocabulary is down by 1392 words.

In [None]:
# Let's get another round of vectorization with the lemmatized text
vectorizer_lemmatized = CountVectorizer()

# Fit the vectorizer on the 'body_lemmatized' column
X_lemmatized = vectorizer_lemmatized.fit_transform(montagu['body_lemmatized'])

# Get the feature names
feature_names_lemmatized = vectorizer_lemmatized.get_feature_names_out()

# Convert the sparse matrix to a numpy array
dtm_lemmatized = X_lemmatized.toarray()

dtm_lemmatized


In [None]:
# Let's sum the elements in each column and get the top 50 words
word_counts_lemmatized = dtm_lemmatized.sum(axis=0)
# Create a DataFrame with the word counts
word_counts_df_lemmatized = pd.DataFrame({'word': feature_names_lemmatized, 'count': word_counts_lemmatized})
# Sort the DataFrame by count in descending order
word_counts_df_lemmatized = word_counts_df_lemmatized.sort_values(by='count', ascending=False)
# Get the top 50 words
print('Top 50 Words (Lemmatized): \n', word_counts_df_lemmatized.head(50))

### Solution 1: Text Normalization

2. Remove Rare Words

Now, we want to remove words that appear only once across all documents in the DTM.

In [None]:
# Let's do this with numpy, don't forget X_lemmatized is a sparse matrix

# Sum each column (word) over all rows (documents) to get the total count of each word
word_counts = np.asarray(X_lemmatized.sum(axis=0)).flatten()

# Identify columns (words) that appear only once in the whole corpus
words_to_remove = np.where(word_counts == 1)[0]

# Remove these words (columns). The column indices were computed from X_lemmatized,
# so they must be applied to X_lemmatized as well: X (the un-lemmatized matrix) has a
# different vocabulary, and the same index refers to a different word there.
# np.delete works on dense arrays, hence the .toarray() conversion.
X_cleaned = np.delete(X_lemmatized.toarray(), words_to_remove, axis=1)

# Verify the shape of the cleaned (dense) matrix against the matrix we started from
print(f"Lemmatized matrix shape: {X_lemmatized.shape}")
print(f"Cleaned matrix shape: {X_cleaned.shape}")



We got rid of 2685 words that only appear once.

### Solution 1: Text Normalization

3. Remove Very Frequent Words

We can filter out very frequent words. Let's remove words that appear in more than 50% of this corpus.

For this we will need to calculate the *document frequency* of each term.

In [None]:
# Count, for each word (column), the number of documents (rows) it appears in.
# Summing the raw counts would give the total term frequency instead, so we first
# turn the counts into True/False ("does this word occur in this document?")
# and sum those.
doc_frequency = np.asarray((X_cleaned > 0).sum(axis=0)).flatten()
doc_frequency

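`X_cleaned.sum(axis=0)` adds up the counts in each column, so on its own it gives the total number of times each word occurs in the corpus (its *term frequency*). The *document frequency* — the number of documents (rows) in which a word (column) appears at least once — is obtained by counting the non-zero entries instead: `(X_cleaned > 0).sum(axis=0)`.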

In [None]:
# Set the threshold for document frequency (words appearing in more than 50% of documents)
df_threshold = 0.5 * X.shape[0]  # 50% of total documents
df_threshold

`df_threshold = 0.5 * X.shape[0]`: This sets the threshold to 50% of the total number of documents.

We know that we have 58 letters, so here 50% is 29

In [None]:
# Find words to remove: those with document frequency greater than the threshold
words_to_remove = np.where(doc_frequency > df_threshold)[0]
words_to_remove

`np.where(doc_frequency > df_threshold)[0]:` This identifies the indices of words that exceed the 50% threshold (i.e., words that appear in more than 50% of the documents).

In [None]:
# Remove these words (columns) from the matrix (X_cleaned is a dense NumPy array at this point)
X_cleaned_50 = np.delete(X_cleaned, words_to_remove, axis=1)

`np.delete(X_cleaned, words_to_remove, axis=1)`: This removes those columns (words) from the matrix

In [None]:
# Verify the shape of the cleaned matrix
print(f"Original matrix shape: {X.shape}")
print(f"Matrix shape after lemmatization and removing words that appear once: {X_cleaned.shape}")
print(f"Matrix shape after removing words with 50% document frequency: {X_cleaned_50.shape}")


We did not remove that many words when we removed the words that appear in more than 50% of the corpus.
What else can we do?

Rather than using a fixed cutoff (like a document frequency above 50%), a better approach is to use a more data-driven method to find an appropriate cutoff for frequent words.