# **Import Libraries and NLTK Setups**

In [61]:
import os
import logging
import math
import numpy as np
import nltk
nltk . download ('stopwords')
nltk . download ('punkt')
nltk . download ('wordnet')
import string
import logging
import re
from collections import defaultdict , Counter
from nltk . corpus import stopwords
from nltk . tokenize import word_tokenize
from nltk . stem import WordNetLemmatizer
from numpy.linalg import norm

STOPWORDS = set( stopwords . words ('english') )
LEMMATIZER = WordNetLemmatizer ()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Loading Movies Text Files**

In [62]:
def load_movies(folder_path):
    data = {}
    doc_id_to_filename = {}
    doc_id = 0
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                data[doc_id] = file.read()
                doc_id_to_filename[doc_id] = filename
                logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")
                doc_id += 1
    return data, doc_id_to_filename

# **Tokenization**

In [63]:
def tokenize(text):
    return text.lower().split()

# **Text Cleaning Process**

In [64]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    tokens = tokenize(text)
    cleaned_tokens = [LEMMATIZER.lemmatize(word) for word in tokens if word not in STOPWORDS]
    return cleaned_tokens

# **Term Frequency**

In [65]:
def term_frequency(term, document):
    return document.count(term) / len(document)

# **Inverse Document Frequency**

In [66]:
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

# **Computing TF-IDF**

In [67]:
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

# **Cosine Similarity**

In [68]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2) if norm_vec1 * norm_vec2 != 0 else 0

# **Processing Movie Search**

In [69]:
import os
import logging

def process_queries(query, all_documents, doc_tfidf_vectors, vocab, top_k=5):
    tokenized_query = clean_text(query)
    query_vector = compute_tfidf(tokenized_query, all_documents, vocab)

    similarities = []
    for doc_id, doc_vector in enumerate(doc_tfidf_vectors):
        similarity = cosine_similarity(query_vector, doc_vector)
        similarities.append((doc_id, similarity))

    similarities.sort(key=lambda x: x[1], reverse=True)

    return similarities[:top_k]

# **Convert doc_ids to Filenames**

In [70]:
def convert_doc_ids_to_filenames(doc_ids, doc_id_to_filename):
    return [doc_id_to_filename[doc_id] for doc_id in doc_ids]

# **Main Function**

In [71]:
def main():
    folder_path = "/content/drive/MyDrive/FinalIRProject"


    print("Movies Loading.........")
    movies_data, doc_id_to_filename = load_movies(folder_path)

    queries = input("Enter the queries: ")


    tokenized_docs = [clean_text(doc) for doc in movies_data.values()]
    vocab = sorted(set(word for doc in tokenized_docs for word in doc))


    doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]

    print(f"Searching top results for '{queries}': ")
    similarities = process_queries(queries, tokenized_docs, doc_tfidf_vectors, vocab)

    results = [(queries, similarities)]

    if results:
        print("\nTop Five Movies: ")
        for idx, (doc_id, score) in enumerate(results[0][1], 1):
            print(f"Movie {idx}: {doc_id_to_filename[doc_id]}, Score: {score:.4f}")
    else:
        print("No results found.")


if __name__ == "__main__":
    main()

Movies Loading.........
Enter the queries: love romance
Searching top results for 'love romance': 

Top Five Movies: 
Movie 1: Titanic.txt, Score: 0.1494
Movie 2: The Shape of Water.txt, Score: 0.1400
Movie 3: Forrest Gump.txt, Score: 0.0890
Movie 4: La La Land .txt, Score: 0.0748
Movie 5: Slumdog Millionaire .txt, Score: 0.0651
