In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
import pandas as pd

nltk.download('stopwords')
nltk.download('punkt')

# Read the file and store the text in a variable
with open('LabE6.txt', 'r') as file:
    text = file.read()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\loviy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\loviy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


a. Apply preprocessing.

In [6]:
# Convert all text to lowercase
text = text.lower()

# Remove all punctuation marks, digits and other special characters
text = re.sub('[^a-zA-Z]', ' ', text)

# Tokenize the text into words
words = nltk.word_tokenize(text)

# Remove stop words
stop_words = set(stopwords.words('english'))
words = [word for word in words if not word in stop_words]

# Stem the remaining words
stemmer = PorterStemmer()
words = [stemmer.stem(word) for word in words]

# Join the stemmed words back into a single string
processed_text = ' '.join(words)

b. Create one-hot encoded vectors for the each tokens in the vocabulary

In [4]:
# Create a set of all unique words in the preprocessed text
vocab = set(words)

# Assign a unique index to each word in the vocabulary
word_to_idx = {word: i for i, word in enumerate(vocab)}

# Create a one-hot encoded vector for each word in the text
one_hot_vectors = []
for word in words:
    vector = [0] * len(vocab)
    vector[word_to_idx[word]] = 1
    one_hot_vectors.append(vector)


Apply newline tokenization to the text (use split(“\n”). Consider each element in the list as a document.
a. Apply preprocessing.

In [7]:
# Read the file and split the text into documents
with open('LabE6.txt', 'r') as file:
    text = file.read()
    documents = text.split('\n')

# Preprocess each document in the list of documents
processed_documents = []
for doc in documents:
    # Convert all text to lowercase
    doc = doc.lower()

    # Remove all punctuation marks, digits and other special characters
    doc = re.sub('[^a-zA-Z]', ' ', doc)

    # Tokenize the text into words
    words = nltk.word_tokenize(doc)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if not word in stop_words]

    # Stem the remaining words
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Join the stemmed words back into a single string
    processed_doc = ' '.join(words)

    # Append the processed document to the list of processed documents
    processed_documents.append(processed_doc)

    


b. Create BoWs vectors for each of the documents

In [8]:
# Create a set of all unique words in the preprocessed text
vocab = set()
for doc in processed_documents:
    vocab.update(doc.split())

# Assign a unique index to each word in the vocabulary
word_to_idx = {word: i for i, word in enumerate(vocab)}

# Create a BoW vector for each document
bow_vectors = []
for doc in processed_documents:
    vector = [0] * len(vocab)
    for word in doc.split():
        vector[word_to_idx[word]] += 1
    bow_vectors.append(vector)


Read a search text from the user
a. Using cosine similarity : List the top five similar documents based on the 
search text

In [11]:
# Create a set of all unique words in the preprocessed text
vocab = set()
for doc in processed_documents:
    vocab.update(doc.split())

# Assign a unique index to each word in the vocabulary
word_to_idx = {word: i for i, word in enumerate(vocab)}

# Create a BoW vector for each document
bow_vectors = []
for doc in processed_documents:
    vector = [0] * len(vocab)
    for word in doc.split():
        vector[word_to_idx[word]] += 1
    bow_vectors.append(vector)

search_text = input("Enter search text: ").lower()
search_words = nltk.word_tokenize(search_text)
search_words = [word for word in search_words if not word in stop_words]
search_words = [stemmer.stem(word) for word in search_words]
search_text = ' '.join(search_words)

search_vector = [0] * len(vocab)
for word in search_text.split():
    if word in word_to_idx:
        search_vector[word_to_idx[word]] += 1

similarities = []
for i, doc_vector in enumerate(bow_vectors):
    sim = np.dot(search_vector, doc_vector) / (np.linalg.norm(search_vector) * np.linalg.norm(doc_vector))
    similarities.append((i, sim))

similarities.sort(key=lambda x: x[1], reverse=True)

print("Top five similar documents:")
for i in range(5):
    doc_idx = similarities[i][0]
    doc_sim = similarities[i][1]
    print(f"Document {doc_idx+1} - Similarity: {doc_sim:.4f}")
    print(documents[doc_idx])
    print()    


Enter search text: hello
Top five similar documents:
Document 215 - Similarity: 0.0535
holy Sh*t this was god awful. i sat in the theater for for an hour and ten minutes and i thought i was going to gouge out my eyes much in the manor Oedipus Rex. dear god. this movie deserves no more credit than anything done by a middle school film buff. please save your money, this movie can offer you nothing. unless you enjoy sideshows and sleeping in movie theaters. you know, h3ll, bring your girlfriend and make things interesting. you will be the only ones there anyway. F@ck this slide show. <br /><br />Ye Be Warned.<br /><br />I recommend not watching this.<br /><br />hello.<br /><br />how are you?<br /><br />I'm pretty good.<br /><br />enjoying this day?<br /><br />I am.<br /><br />this comment was one-hundred times more fun than pretending to watch this daym movie. this is sad.

Document 18 - Similarity: 0.0442
This movie made it into one of my top 10 most awful movies. Horrible. <br /><br />T

  sim = np.dot(search_vector, doc_vector) / (np.linalg.norm(search_vector) * np.linalg.norm(doc_vector))
