Description:
Implement TF-IDF from scratch using the equations presented in the lecture then apply it to
a 4 different pdf documents from your choice. P.S. you can generate the documents.

Requirements:
1. Preprocess texts (Tokenization & Case normalization “Lowering case” & Punctuation
removal)
2. Compute Term Frequency (TF) and Inverse Document Frequency (IDF).  
3. Display the words with the highest TF-IDF value.
4. Use TF-IDF to encode/embed your documents.



In [None]:
!pip install nltk



In [None]:
import re
import math
from collections import Counter
from sklearn.preprocessing import normalize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

In [None]:
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{"should've", "that'll", 'then', 'from', 'to', 'i', 'is', 'what', 'by', 'why', 'y', 'themselves', 'll', 'wasn', "weren't", 'down', 'too', 'more', 'be', 'himself', 'both', 'they', 'couldn', 'for', 'now', "she'd", "i've", 'there', 'doesn', 'did', 'these', 'aren', 'because', "shouldn't", 'you', 'as', 'can', 'when', 'am', 'against', 'in', 'through', 'was', "won't", "he's", 'itself', 'about', 'does', 'until', "wouldn't", "you're", 'your', 'have', 'over', 'ain', "it'll", "aren't", 'been', 'where', 'no', 'needn', 'd', 'we', 'didn', "she's", "they'll", 'the', 'her', 'he', 'hers', "doesn't", "mustn't", 'shan', 'mightn', "they're", 'above', 'ma', 'an', "i'd", 'than', 'had', 'were', 'out', "it's", 'who', 'few', "she'll", "i'll", 'so', 'weren', 'all', 'it', 'own', 'between', 'some', 'how', 'just', 'shouldn', 'most', 'm', 'ourselves', 'yours', 'she', 'don', 'each', "didn't", 'not', 'do', 'once', 'at', 'o', "you'd", "we've", 'his', 'doing', "don't", 'nor', "we're", 'such', 'theirs', 'yourselves', 'u

In [None]:
class TextPreprocessing:
  def __init__(self, text):
    self.text = text
    self.stop_words = set(stopwords.words('english'))

  def _clean_text(self, text_to_clean):
    # if there is no string in the sentece
    if not isinstance(text_to_clean, str):  # Ensure the second argument is `str`
      return ""
    # Keep only letters and spaces
    cleaned_text = re.sub(r"[^a-zA-Z\s]", "", text_to_clean)
    return cleaned_text

  def fit(self):
    cleaned_text = self._clean_text(self.text)
    Lsent = cleaned_text.lower()
    sent_tok = word_tokenize(Lsent)
    sent_tok_filtered = set([word for word in sent_tok if word not in self.stop_words])
    return sent_tok_filtered

In [None]:
sentence = "Mina, Filopater, Martina, Youssef and (Mina & Martina & Filopater & Youssef) again for the assignment.!!!"
txtprepros = TextPreprocessing(sentence)
finaltxt = txtprepros.fit()
print(finaltxt)

{'mina', 'martina', 'filopater', 'assignment', 'youssef'}


In [None]:
documents = [
"Hany love going to school",
"The school is far from Sara home",
"Hany likes apple more than banana",
"Sara likes apple too"
]

preprocessed_documents = []
for doc in documents:
    preprocessor = TextPreprocessing(doc)
    preprocessed_documents.append(preprocessor.fit())

# Output the preprocessed documents
for i, doc in enumerate(preprocessed_documents):
    print(f"Document {i+1}: {doc}")

Document 1: {'love', 'school', 'going', 'hany'}
Document 2: {'school', 'sara', 'home', 'far'}
Document 3: {'banana', 'apple', 'hany', 'likes'}
Document 4: {'sara', 'apple', 'likes'}


In [None]:
# 2. Compute Term Frequency (TF)
def compute_tf(doc):
    term_count = Counter(doc)
    total_terms = len(doc)
    tf = {term: count / total_terms for term, count in term_count.items()}
    return tf

tf_list = [compute_tf(doc) for doc in preprocessed_documents]
print(tf_list)

[{'love': 0.25, 'school': 0.25, 'going': 0.25, 'hany': 0.25}, {'school': 0.25, 'sara': 0.25, 'home': 0.25, 'far': 0.25}, {'banana': 0.25, 'apple': 0.25, 'hany': 0.25, 'likes': 0.25}, {'sara': 0.3333333333333333, 'apple': 0.3333333333333333, 'likes': 0.3333333333333333}]


In [None]:
# 3. Compute Inverse Document Frequency (IDF)
def compute_idf(all_docs):
    total_docs = len(all_docs)
    idf = {}
    all_terms = set(term for doc in all_docs for term in doc)
    for term in all_terms:
        doc_count = sum(1 for doc in all_docs if term in doc)
        idf[term] = math.log((1 + total_docs) / (1 + doc_count)) + 1
    return idf

idf = compute_idf(preprocessed_documents)
print(idf)

{'sara': 1.5108256237659907, 'far': 1.916290731874155, 'love': 1.916290731874155, 'apple': 1.5108256237659907, 'banana': 1.916290731874155, 'home': 1.916290731874155, 'likes': 1.5108256237659907, 'school': 1.5108256237659907, 'going': 1.916290731874155, 'hany': 1.5108256237659907}


In [None]:
# 4. Compute TF-IDF
def compute_tfidf(tf, idf):
    tfidf = {term: tf_val * idf[term] for term, tf_val in tf.items()}
    return tfidf

tfidf_list = [compute_tfidf(tf, idf) for tf in tf_list]
print(tfidf_list)

[{'love': 0.4790726829685388, 'school': 0.3777064059414977, 'going': 0.4790726829685388, 'hany': 0.3777064059414977}, {'school': 0.3777064059414977, 'sara': 0.3777064059414977, 'home': 0.4790726829685388, 'far': 0.4790726829685388}, {'banana': 0.4790726829685388, 'apple': 0.3777064059414977, 'hany': 0.3777064059414977, 'likes': 0.3777064059414977}, {'sara': 0.5036085412553302, 'apple': 0.5036085412553302, 'likes': 0.5036085412553302}]


In [None]:
# Normalize TF-IDF vectors
def encode_documents(tfidf_list):
    terms = sorted(idf.keys())
    doc_vectors = []
    for tfidf in tfidf_list:
        vector = [tfidf.get(term, 0) for term in terms]
        doc_vectors.append(vector)
    return normalize(doc_vectors)

encoded_docs = encode_documents(tfidf_list)

# Display Words with Highest TF-IDF Value in Each Document
for idx, tfidf in enumerate(tfidf_list):
    sorted_terms = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)[:3]
    print(f"Top words in Document {idx + 1}: {sorted_terms}")

# Display Encoded Document Vectors
print("\nEncoded Document Vectors:")
for idx, vec in enumerate(encoded_docs):
    print(f"Document {idx + 1}: {vec}")


Top words in Document 1: [('love', 0.4790726829685388), ('going', 0.4790726829685388), ('school', 0.3777064059414977)]
Top words in Document 2: [('home', 0.4790726829685388), ('far', 0.4790726829685388), ('school', 0.3777064059414977)]
Top words in Document 3: [('banana', 0.4790726829685388), ('apple', 0.3777064059414977), ('hany', 0.3777064059414977)]
Top words in Document 4: [('sara', 0.5036085412553302), ('apple', 0.5036085412553302), ('likes', 0.5036085412553302)]

Encoded Document Vectors:
Document 1: [0.         0.         0.         0.55528266 0.43779123 0.
 0.         0.55528266 0.         0.43779123]
Document 2: [0.         0.         0.55528266 0.         0.         0.55528266
 0.         0.         0.43779123 0.43779123]
Document 3: [0.46580855 0.59081908 0.         0.         0.46580855 0.
 0.46580855 0.         0.         0.        ]
Document 4: [0.57735027 0.         0.         0.         0.         0.
 0.57735027 0.         0.57735027 0.        ]


In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.4


In [None]:
import fitz

Each line element in list

In [None]:
Pdfs = ["Football_Basics.pdf", "Tennis SG.pdf", "NLPTrends.pdf", "Relationships.pdf"]

list_of_docs = []

for pdf_path in Pdfs:
    doc = fitz.open(pdf_path)

    Doc = []  # List to store all lines from the PDF

    for page in doc:
        page_text = page.get_text()
        lines = page_text.split('\n')  # Split the text into lines
        Doc.extend(lines)  # Add the lines to the list

    list_of_docs.append(Doc)

In [None]:
for docs in list_of_docs:
  fpreprocessed_documents = []
  for doc in docs:
    preprocessor = TextPreprocessing(doc)
    fpreprocessed_documents.append(preprocessor.fit())

  # Output the preprocessed documents
  for i, doc in enumerate(fpreprocessed_documents):
      print(f"Document {i+1}: {doc}")

  print(100*"#")

Document 1: {'football', 'basics'}
Document 2: {'check', 'new', 'need', 'games', 'football', 'youre', 'time', 'following', 'difficult', 'look'}
Document 3: {'terms'}
Document 4: {'game', 'basics'}
Document 5: {'advance', 'offense', 'football', 'possession', 'tries', 'called', 'man', 'ball', 'team', 'one'}
Document 6: {'points', 'score', 'running', 'line', 'ball', 'throwing', 'goal', 'crossing', 'fieldby'}
Document 7: {'area', 'defense', 'players', 'also', 'called', 'zone', 'team', 'end', 'getting'}
Document 8: {'make', 'offensive', 'possession', 'tries', 'ball', 'stop', 'team', 'give'}
Document 9: {'teams', 'offensive', 'possession', 'score', 'roles', 'give', 'forced', 'defensive', 'switch'}
Document 10: {'forth', 'defense', 'offense', 'four', 'back', 'team', 'defensive', 'goes'}
Document 11: {'played', 'game', 'quarters'}
Document 12: {'playing', 'field'}
Document 13: {'little', 'markings', 'field', 'yards', 'white', 'called', 'yard', 'measures', 'long', 'wide', 'markers'}
Document 14

MIna

In [None]:
Pdfs = ["Football_Basics.pdf", "Tennis SG.pdf", "NLPTrends.pdf", "Relationships.pdf"]

mdocs = []

for pdf_path in Pdfs:
    doc = fitz.open(pdf_path)

    page_text = ""
    for page in doc:
        page_text += page.get_text()

    mdocs.append(page_text)
    print(f"Content of {pdf_path}:")
    print(page_text)
    print(100*"#")
    print("\n")


Content of Football_Basics.pdf:
Football Basics 
Check here if you're new to football, having a difficult time following the games or if you just need to look 
up some terms.  
The Basics of the Game  
One 11-man team has possession of the football. It is called the offense and it tries to advance the ball 
down the field-by running with the ball or throwing it - and score points by crossing the goal line and 
getting into an area called the end zone. The other team (also with 11 players) is called the defense. It 
tries to stop the offensive team and make it give up possession of the ball. If the team with the ball does 
score or is forced to give up possession, the offensive and defensive teams switch roles (the offensive 
team goes on defense and the defensive team goes on offense). And so on, back and forth, until all four 
quarters of the game have been played.  
The Playing Field  
The field measures 100 yards long and 53 yards wide. Little white markings on the field called yard

In [None]:
mpreprocessed_documents = []
for doc in mdocs:
    preprocessor = TextPreprocessing(doc)
    mpreprocessed_documents.append(preprocessor.fit())

# Output the preprocessed documents
for i, doc in enumerate(mpreprocessed_documents):
    print(f"Document {i+1}: {doc}")

Document 1: {'prevents', 'closer', 'advancing', 'roles', 'sack', 'whether', 'thats', 'passing', 'next', 'safety', 'come', 'alternative', 'break', 'accidentally', 'hit', 'another', 'block', 'certain', 'time', 'th', 'keep', 'main', 'use', 'getsor', 'team', 'give', 'typically', 'interception', 'need', 'officials', 'spotted', 'bounds', 'free', 'grab', 'rushes', 'lateral', 'official', 'first', 'hike', 'blocking', 'holds', 'audible', 'changes', 'recover', 'make', 'umpire', 'someone', 'thrown', 'overtime', 'touchdown', 'position', 'legally', 'decide', 'runs', 'linemen', 'receiver', 'snap', 'encroachment', 'snaps', 'crossbar', 'kicker', 'blitz', 'air', 'upon', 'opponent', 'area', 'check', 'prevent', 'halfback', 'one', 'timing', 'rush', 'fumble', 'retainspossession', 'tackles', 'three', 'blockers', 'catching', 'goes', 'cornerbacks', 'whoever', 'four', 'usual', 'half', 'stop', 'wide', 'hold', 'standing', 'diving', 'incomplete', 'fair', 'part', 'measured', 'way', 'purposely', 'grasping', 'fourth'

In [None]:
tf_list = [compute_tf(doc) for doc in mpreprocessed_documents]
print(tf_list)

[{'prevents': 0.002457002457002457, 'closer': 0.002457002457002457, 'advancing': 0.002457002457002457, 'roles': 0.002457002457002457, 'sack': 0.002457002457002457, 'whether': 0.002457002457002457, 'thats': 0.002457002457002457, 'passing': 0.002457002457002457, 'next': 0.002457002457002457, 'safety': 0.002457002457002457, 'come': 0.002457002457002457, 'alternative': 0.002457002457002457, 'break': 0.002457002457002457, 'accidentally': 0.002457002457002457, 'hit': 0.002457002457002457, 'another': 0.002457002457002457, 'block': 0.002457002457002457, 'certain': 0.002457002457002457, 'time': 0.002457002457002457, 'th': 0.002457002457002457, 'keep': 0.002457002457002457, 'main': 0.002457002457002457, 'use': 0.002457002457002457, 'getsor': 0.002457002457002457, 'team': 0.002457002457002457, 'give': 0.002457002457002457, 'typically': 0.002457002457002457, 'interception': 0.002457002457002457, 'need': 0.002457002457002457, 'officials': 0.002457002457002457, 'spotted': 0.002457002457002457, 'boun

In [None]:
idf = compute_idf(mpreprocessed_documents)
print(idf)

{'works': 1.916290731874155, 'sexual': 1.916290731874155, 'european': 1.916290731874155, 'aspects': 1.5108256237659907, 'closer': 1.916290731874155, 'intentionality': 1.916290731874155, 'scarcity': 1.916290731874155, 'scene': 1.916290731874155, 'endeavors': 1.916290731874155, 'evolution': 1.916290731874155, 'skynner': 1.916290731874155, 'acceptance': 1.916290731874155, 'graziotin': 1.916290731874155, 'whether': 1.2231435513142097, 'thats': 1.5108256237659907, 'jective': 1.916290731874155, 'tenggan': 1.916290731874155, 'safety': 1.916290731874155, 'come': 1.5108256237659907, 'pluralistic': 1.916290731874155, 'alimuddin': 1.916290731874155, 'jonathan': 1.916290731874155, 'alternative': 1.5108256237659907, 'selfcitations': 1.916290731874155, 'haque': 1.916290731874155, 'accidentally': 1.916290731874155, 'bowlby': 1.916290731874155, 'therefore': 1.916290731874155, 'hit': 1.5108256237659907, 'initiative': 1.916290731874155, 'giuseppe': 1.916290731874155, 'focus': 1.5108256237659907, 'divers

In [None]:
tfidf_list = [compute_tfidf(tf, idf) for tf in tf_list]
print(tfidf_list)

[{'prevents': 0.004708331036545835, 'closer': 0.004708331036545835, 'advancing': 0.004708331036545835, 'roles': 0.0037121022696953087, 'sack': 0.004708331036545835, 'whether': 0.0030052667108457242, 'thats': 0.0037121022696953087, 'passing': 0.004708331036545835, 'next': 0.0030052667108457242, 'safety': 0.004708331036545835, 'come': 0.0037121022696953087, 'alternative': 0.0037121022696953087, 'break': 0.004708331036545835, 'accidentally': 0.004708331036545835, 'hit': 0.0037121022696953087, 'another': 0.0037121022696953087, 'block': 0.004708331036545835, 'certain': 0.0037121022696953087, 'time': 0.002457002457002457, 'th': 0.0030052667108457242, 'keep': 0.0030052667108457242, 'main': 0.0030052667108457242, 'use': 0.0037121022696953087, 'getsor': 0.004708331036545835, 'team': 0.0037121022696953087, 'give': 0.0037121022696953087, 'typically': 0.004708331036545835, 'interception': 0.004708331036545835, 'need': 0.0030052667108457242, 'officials': 0.004708331036545835, 'spotted': 0.004708331

In [None]:
encoded_docs = encode_documents(tfidf_list)

# Display Words with Highest TF-IDF Value in Each Document
for idx, tfidf in enumerate(tfidf_list):
    sorted_terms = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)[:3]
    print(f"Top words in Document {idx + 1}: {sorted_terms}")

# Display Encoded Document Vectors
print("\nEncoded Document Vectors:")
for idx, vec in enumerate(encoded_docs):
    print(f"Document {idx + 1}: {vec}")

Top words in Document 1: [('prevents', 0.004708331036545835), ('closer', 0.004708331036545835), ('advancing', 0.004708331036545835)]
Top words in Document 2: [('leather', 0.007758262072365), ('choose', 0.007758262072365), ('always', 0.007758262072365)]
Top words in Document 3: [('works', 0.0008259873844285151), ('european', 0.0008259873844285151), ('intentionality', 0.0008259873844285151)]
Top words in Document 4: [('sexual', 0.004085907743868134), ('asking', 0.004085907743868134), ('whole', 0.004085907743868134)]

Encoded Document Vectors:
Document 1: [0.        0.        0.        ... 0.        0.        0.0543671]
Document 2: [0. 0. 0. ... 0. 0. 0.]
Document 3: [0.02122141 0.02122141 0.02122141 ... 0.02122141 0.02122141 0.        ]
Document 4: [0. 0. 0. ... 0. 0. 0.]
