In [1]:
import math
from collections import Counter
import pandas as pd
from nltk.tokenize import word_tokenize

In [2]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
documents = [
    "This movie was fantastic and I loved every minute of it",
    "The acting was terrible and the plot made no sense",
    "Great special effects but the story was predictable",
    "I fell asleep during this boring movie",
    "The soundtrack was amazing and the cinematography stunning"
]


tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

In [4]:
# Calculate Term Frequency (TF)
term_freq = []
for doc in tokenized_docs:
    total_words = len(doc)
    word_counts = Counter(doc)
    tf = {word: count / total_words for word, count in word_counts.items()}
    term_freq.append(tf)

In [5]:
term_freq

[{'this': 0.09090909090909091,
  'movie': 0.09090909090909091,
  'was': 0.09090909090909091,
  'fantastic': 0.09090909090909091,
  'and': 0.09090909090909091,
  'i': 0.09090909090909091,
  'loved': 0.09090909090909091,
  'every': 0.09090909090909091,
  'minute': 0.09090909090909091,
  'of': 0.09090909090909091,
  'it': 0.09090909090909091},
 {'the': 0.2,
  'acting': 0.1,
  'was': 0.1,
  'terrible': 0.1,
  'and': 0.1,
  'plot': 0.1,
  'made': 0.1,
  'no': 0.1,
  'sense': 0.1},
 {'great': 0.125,
  'special': 0.125,
  'effects': 0.125,
  'but': 0.125,
  'the': 0.125,
  'story': 0.125,
  'was': 0.125,
  'predictable': 0.125},
 {'i': 0.14285714285714285,
  'fell': 0.14285714285714285,
  'asleep': 0.14285714285714285,
  'during': 0.14285714285714285,
  'this': 0.14285714285714285,
  'boring': 0.14285714285714285,
  'movie': 0.14285714285714285},
 {'the': 0.25,
  'soundtrack': 0.125,
  'was': 0.125,
  'amazing': 0.125,
  'and': 0.125,
  'cinematography': 0.125,
  'stunning': 0.125}]

In [6]:
# Print Term Frequency
print("Term Frequency (TF):")
for i, tf in enumerate(term_freq):
    print(f"Document {i+1}: {tf}")

Term Frequency (TF):
Document 1: {'this': 0.09090909090909091, 'movie': 0.09090909090909091, 'was': 0.09090909090909091, 'fantastic': 0.09090909090909091, 'and': 0.09090909090909091, 'i': 0.09090909090909091, 'loved': 0.09090909090909091, 'every': 0.09090909090909091, 'minute': 0.09090909090909091, 'of': 0.09090909090909091, 'it': 0.09090909090909091}
Document 2: {'the': 0.2, 'acting': 0.1, 'was': 0.1, 'terrible': 0.1, 'and': 0.1, 'plot': 0.1, 'made': 0.1, 'no': 0.1, 'sense': 0.1}
Document 3: {'great': 0.125, 'special': 0.125, 'effects': 0.125, 'but': 0.125, 'the': 0.125, 'story': 0.125, 'was': 0.125, 'predictable': 0.125}
Document 4: {'i': 0.14285714285714285, 'fell': 0.14285714285714285, 'asleep': 0.14285714285714285, 'during': 0.14285714285714285, 'this': 0.14285714285714285, 'boring': 0.14285714285714285, 'movie': 0.14285714285714285}
Document 5: {'the': 0.25, 'soundtrack': 0.125, 'was': 0.125, 'amazing': 0.125, 'and': 0.125, 'cinematography': 0.125, 'stunning': 0.125}


In [7]:
# Calculate Document Frequency (DF)
document_freq = {}
total_docs = len(tokenized_docs)

In [8]:
for doc in tokenized_docs:
    unique_words = set(doc)
    for word in unique_words:
        if word in document_freq:
            document_freq[word] += 1
        else:
            document_freq[word] = 1

In [9]:
total_docs

5

In [10]:
# Calculate Inverse Document Frequency (IDF)
idf = {word: math.log(total_docs / freq) for word, freq in document_freq.items()}

In [11]:
# Print Document Frequency and IDF
print("\nDocument Frequency (DF):")
print(document_freq)
print("\nInverse Document Frequency (IDF):")
print(idf)


Document Frequency (DF):
{'it': 1, 'loved': 1, 'minute': 1, 'of': 1, 'was': 4, 'every': 1, 'movie': 2, 'and': 3, 'i': 2, 'this': 2, 'fantastic': 1, 'plot': 1, 'no': 1, 'sense': 1, 'terrible': 1, 'acting': 1, 'made': 1, 'the': 3, 'great': 1, 'special': 1, 'effects': 1, 'but': 1, 'predictable': 1, 'story': 1, 'boring': 1, 'fell': 1, 'asleep': 1, 'during': 1, 'stunning': 1, 'soundtrack': 1, 'amazing': 1, 'cinematography': 1}

Inverse Document Frequency (IDF):
{'it': 1.6094379124341003, 'loved': 1.6094379124341003, 'minute': 1.6094379124341003, 'of': 1.6094379124341003, 'was': 0.22314355131420976, 'every': 1.6094379124341003, 'movie': 0.9162907318741551, 'and': 0.5108256237659907, 'i': 0.9162907318741551, 'this': 0.9162907318741551, 'fantastic': 1.6094379124341003, 'plot': 1.6094379124341003, 'no': 1.6094379124341003, 'sense': 1.6094379124341003, 'terrible': 1.6094379124341003, 'acting': 1.6094379124341003, 'made': 1.6094379124341003, 'the': 0.5108256237659907, 'great': 1.6094379124341003

In [12]:
# Calculate TF-IDF
tfidf_docs = []
for i, tf in enumerate(term_freq):
    print(tf)
    tfidf = {word: tf_val * idf[word] for word, tf_val in tf.items()}
    tfidf_docs.append(tfidf)

{'this': 0.09090909090909091, 'movie': 0.09090909090909091, 'was': 0.09090909090909091, 'fantastic': 0.09090909090909091, 'and': 0.09090909090909091, 'i': 0.09090909090909091, 'loved': 0.09090909090909091, 'every': 0.09090909090909091, 'minute': 0.09090909090909091, 'of': 0.09090909090909091, 'it': 0.09090909090909091}
{'the': 0.2, 'acting': 0.1, 'was': 0.1, 'terrible': 0.1, 'and': 0.1, 'plot': 0.1, 'made': 0.1, 'no': 0.1, 'sense': 0.1}
{'great': 0.125, 'special': 0.125, 'effects': 0.125, 'but': 0.125, 'the': 0.125, 'story': 0.125, 'was': 0.125, 'predictable': 0.125}
{'i': 0.14285714285714285, 'fell': 0.14285714285714285, 'asleep': 0.14285714285714285, 'during': 0.14285714285714285, 'this': 0.14285714285714285, 'boring': 0.14285714285714285, 'movie': 0.14285714285714285}
{'the': 0.25, 'soundtrack': 0.125, 'was': 0.125, 'amazing': 0.125, 'and': 0.125, 'cinematography': 0.125, 'stunning': 0.125}


In [13]:
# Print TF-IDF
print("\nTF-IDF Scores:")
for i, tfidf in enumerate(tfidf_docs):
    print(f"Document {i+1}: {tfidf}")


TF-IDF Scores:
Document 1: {'this': 0.08329915744310501, 'movie': 0.08329915744310501, 'was': 0.020285777392200888, 'fantastic': 0.14631253749400913, 'and': 0.04643869306963552, 'i': 0.08329915744310501, 'loved': 0.14631253749400913, 'every': 0.14631253749400913, 'minute': 0.14631253749400913, 'of': 0.14631253749400913, 'it': 0.14631253749400913}
Document 2: {'the': 0.10216512475319815, 'acting': 0.16094379124341004, 'was': 0.02231435513142098, 'terrible': 0.16094379124341004, 'and': 0.051082562376599076, 'plot': 0.16094379124341004, 'made': 0.16094379124341004, 'no': 0.16094379124341004, 'sense': 0.16094379124341004}
Document 3: {'great': 0.20117973905426254, 'special': 0.20117973905426254, 'effects': 0.20117973905426254, 'but': 0.20117973905426254, 'the': 0.06385320297074884, 'story': 0.20117973905426254, 'was': 0.02789294391427622, 'predictable': 0.20117973905426254}
Document 4: {'i': 0.13089867598202215, 'fell': 0.22991970177630003, 'asleep': 0.22991970177630003, 'during': 0.22991

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

In [15]:
feature_names

array(['acting', 'amazing', 'and', 'asleep', 'boring', 'but',
       'cinematography', 'during', 'effects', 'every', 'fantastic',
       'fell', 'great', 'it', 'loved', 'made', 'minute', 'movie', 'no',
       'of', 'plot', 'predictable', 'sense', 'soundtrack', 'special',
       'story', 'stunning', 'terrible', 'the', 'this', 'was'],
      dtype=object)

In [16]:
df_sklearn = pd.DataFrame(X.toarray(), columns=feature_names)
df_sklearn.index = [f"Doc {i+1}" for i in range(len(documents))]

for i in range(len(documents)):
    doc_name = f"Doc {i+1}"
    doc_features = df_sklearn.loc[doc_name]

    present_words = doc_features[doc_features > 0]

    print(f"\n{doc_name} - Words present with TF-IDF scores:")
    print(present_words.sort_values(ascending=False))


Doc 1 - Words present with TF-IDF scores:
every        0.352066
fantastic    0.352066
of           0.352066
it           0.352066
loved        0.352066
minute       0.352066
this         0.284045
movie        0.284045
and          0.235783
was          0.198348
Name: Doc 1, dtype: float64

Doc 2 - Words present with TF-IDF scores:
the         0.457806
acting      0.341794
made        0.341794
plot        0.341794
no          0.341794
terrible    0.341794
sense       0.341794
and         0.228903
was         0.192561
Name: Doc 2, dtype: float64

Doc 3 - Words present with TF-IDF scores:
but            0.384447
effects        0.384447
great          0.384447
predictable    0.384447
special        0.384447
story          0.384447
the            0.257469
was            0.216591
Name: Doc 3, dtype: float64

Doc 4 - Words present with TF-IDF scores:
asleep    0.434297
boring    0.434297
during    0.434297
fell      0.434297
movie     0.350388
this      0.350388
Name: Doc 4, dtype: float64

