In [9]:
import math
from collections import Counter
import pandas

In [2]:
def calculate_tf(document):
    """Return the relative frequency of each distinct token in ``document``.

    Args:
        document: Sequence of hashable tokens (the notebook passes lists of
            word strings).

    Returns:
        dict mapping every distinct token to its occurrence count divided by
        ``len(document)``. An empty document produces an empty dict.
    """
    n_tokens = len(document)
    frequencies = {}
    for token, occurrences in Counter(document).items():
        frequencies[token] = occurrences / n_tokens
    return frequencies

In [3]:
def calculate_idf(documents, term):
    """Compute a smoothed inverse document frequency for ``term``.

    The value is ``log(N / (1 + df))`` where ``N`` is ``len(documents)`` and
    ``df`` is the number of documents that contain ``term``.

    Args:
        documents: Sized collection of documents; each document must support
            ``term in doc`` (the notebook passes lists of tokens).
        term: Token whose IDF is wanted.

    Returns:
        The IDF as a float. An empty corpus yields ``0.0``. Because of the
        ``1 +`` in the denominator the result is ``0`` when the term occurs in
        all but one document and negative when it occurs in every document.
    """
    total_documents = len(documents)
    if total_documents == 0:
        # log(0 / 1) would raise "ValueError: math domain error"; with no
        # corpus a term carries no weight, so report a neutral 0.0 instead.
        return 0.0

    document_count = sum(1 for doc in documents if term in doc)
    return math.log(total_documents / (1 + document_count))

In [4]:
def calculate_tfidf_score(query, document, documents):
    """Score ``document`` against ``query`` by summing TF-IDF over query terms.

    Args:
        query: Iterable of query tokens. A token that appears several times in
            the query contributes once per occurrence.
        document: Token sequence being scored.
        documents: Corpus (collection of token sequences) used for the IDF.

    Returns:
        Sum of ``tf[word] * idf(word)`` over the query tokens that occur in
        ``document``; ``0`` when none of them do.
    """
    tf = calculate_tf(document)

    # IDF needs a scan of the whole corpus, so it is computed only for terms
    # that actually contribute to the score, and only once per distinct term.
    idf_cache = {}

    score = 0
    for word in query:
        if word not in tf:
            continue
        if word not in idf_cache:
            idf_cache[word] = calculate_idf(documents, word)
        score += tf[word] * idf_cache[word]
    return score

In [50]:
import string
def preprocess_document(document):
    """Normalise raw text into a list of content-word tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenise with ``word_tokenize``, then drop tokens
    found in NLTK's English stopword list.

    Args:
        document: Raw text as a ``str``.

    Returns:
        list of the remaining tokens, in their original order.
    """
    lowered = document.lower()

    # Passing the punctuation characters as the third maketrans argument
    # makes translate() delete them.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = lowered.translate(strip_punctuation)

    english_stopwords = set(stopwords.words('english'))
    return [token for token in word_tokenize(cleaned) if token not in english_stopwords]

In [6]:
# Fetch the MovieSummaries archive and unpack it. Both commands work relative
# to the current working directory: wget saves MovieSummaries.tar.gz there and
# tar extracts the archive's contents there.
!wget http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
!tar -xf MovieSummaries.tar.gz



--2023-07-14 10:54:05--  http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 48002242 (46M) [application/x-gzip]
Saving to: ‘MovieSummaries.tar.gz’


2023-07-14 10:54:49 (1.06 MB/s) - ‘MovieSummaries.tar.gz’ saved [48002242/48002242]



In [10]:
# Load the plot summaries: a tab-separated file without a header row, read
# into the columns "id" and "text".
# The path is relative to the working directory because that is where the
# `tar -xf` step unpacks the archive; an absolute "/content/..." path only
# works when the notebook happens to run from /content.
df = pandas.read_csv(
    "MovieSummaries/plot_summaries.txt",
    sep="\t",
    names=["id", "text"],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      42303 non-null  int64 
 1   text    42303 non-null  object
dtypes: int64(1), object(1)
memory usage: 661.1+ KB


In [15]:
# Preview the first rows to confirm the id/text columns were parsed as expected.
df.head()

Unnamed: 0,id,text
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [21]:
# Use the %pip magic rather than a "!pip" shell call so the package is
# installed into the environment of the running kernel.
%pip install nltk



In [34]:
import nltk
from nltk import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


# Lowercased copy of every plot summary, in the same order as the rows of df.
all_contents = [summary.lower() for summary in df['text']]

In [35]:
# Inspect the first summary to check that lowercasing was applied.
all_contents[0]


"shlykov, a hard-working taxi driver and lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all."

In [42]:
import nltk
# Tokenizer models required by word_tokenize.
# NOTE(review): recent NLTK releases reportedly look for 'punkt_tab' instead
# of 'punkt' — confirm against the installed version.
nltk.download('punkt')
# Stopword lists required by stopwords.words('english') in
# preprocess_document; without this download that call raises LookupError
# on a fresh environment.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [44]:
# from nltk.tokenize import word_tokenize


# all_tokens= []
# tokenizer = WordTokenizer()
# for item in all_contents:
#   all_tokens.append(tokenizer.tokenize(item))

import nltk
from nltk.tokenize import word_tokenize

# One token list per lowercased summary, in the same order as all_contents.
all_tokens = [word_tokenize(text) for text in all_contents]

In [53]:
# Sanity check: there should be one token list per summary in all_contents.
len(all_tokens)

42303

In [None]:
query = input("Enter your query: ")

preprocessed_query = preprocess_document(query)

# Rank only the first 100 summaries to keep the run fast. They go through
# the same preprocessing as the query (punctuation and stopwords removed);
# otherwise a query term such as "lovehate" (from "love-hate") could never
# match the raw document token "love-hate", and stopwords/punctuation would
# inflate every document's length in the TF denominator.
sample_of_doc = [preprocess_document(text) for text in all_contents[:100]]

scores = [
    calculate_tfidf_score(preprocessed_query, doc, sample_of_doc)
    for doc in sample_of_doc
]

# Positions in sample_of_doc line up with all_contents because the sample is
# taken from the start of the list.
top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:5]

print("Top 5 related indices:")
for index in top_indices:
    # print(index)
    print(all_contents[index])
