# Task 1b: Implement Code to Build the Term-Document Matrix

In [47]:
import PyPDF2

def extract_text_from_pdf(pdf_file):
    text = ""
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)  # Use PdfReader instead of PdfFileReader
        for page_num in range(len(reader.pages)):  # reader.pages gives a list of pages
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

book1 = extract_text_from_pdf(r"D:\Downloads\NEU\NAM 4 KI 1\NLP\Assignment group 4\1. Harry Potter and the Philosopher's Stone.pdf")
book2 = extract_text_from_pdf(r"D:\Downloads\NEU\NAM 4 KI 1\NLP\Assignment group 4\2. Harry Potter and the Chamber of Secrets.pdf")
book3 = extract_text_from_pdf(r"D:\Downloads\NEU\NAM 4 KI 1\NLP\Assignment group 4\3. Harry Potter and the Prisoner of Azkaban.pdf")
book4 = extract_text_from_pdf(r"D:\Downloads\NEU\NAM 4 KI 1\NLP\Assignment group 4\4. Harry Potter and the Goblet.pdf")

In [48]:
# Text Preprocessing:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

processed_text1 = preprocess_text(book1)
processed_text2 = preprocess_text(book2)
processed_text3 = preprocess_text(book3)
processed_text4 = preprocess_text(book4)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nghaphg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nghaphg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
import time
from sklearn.feature_extraction.text import CountVectorizer

def ensure_string(text):
    if isinstance(text, list):
        return ' '.join(text)  

def build_term_document_matrix(documents):
    vectorizer = CountVectorizer() 
    X = vectorizer.fit_transform(documents)  
    return X, vectorizer.get_feature_names_out()  

documents = [
    ensure_string(processed_text1),
    ensure_string(processed_text2),
    ensure_string(processed_text3),
    ensure_string(processed_text4)
]

start_time = time.time()

term_document_matrix, terms = build_term_document_matrix(documents)

execution_time = time.time() - start_time


print("Term-Document Matrix (Raw Frequency):")
print(term_document_matrix.toarray()) 

print("\nTerms (Features):")
print(terms)  

print(f"\nExecution Time: {execution_time} seconds")

Term-Document Matrix (Raw Frequency):
[[ 1  1  1 ...  2  0  1]
 [ 1  1  1 ...  0  0  0]
 [ 1  1  1 ...  3  0  0]
 [ 1  1  1 ... 11  1  3]]

Terms (Features):
['10' '100' '101' ... 'zooming' 'éclair' 'éclairs']

Execution Time: 0.20378589630126953 seconds


# Task 1c: Compute Similarities Using Euclidean Distance and Cosine Similarity

In [50]:
# Euclidean Distance
from sklearn.metrics.pairwise import euclidean_distances

euclidean_dist = euclidean_distances(term_document_matrix)
print("Euclidean Distance Matrix:\n", euclidean_dist)


Euclidean Distance Matrix:
 [[   0.          916.1129843  1463.21256146 3420.60564813]
 [ 916.1129843     0.         1134.279507   2952.26082859]
 [1463.21256146 1134.279507      0.         2456.55042692]
 [3420.60564813 2952.26082859 2456.55042692    0.        ]]


In [51]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(term_document_matrix)
print("Cosine Similarity Matrix:\n", cosine_sim)


Cosine Similarity Matrix:
 [[1.         0.94329679 0.9416178  0.93467331]
 [0.94329679 1.         0.95078986 0.95214857]
 [0.9416178  0.95078986 1.         0.94577085]
 [0.93467331 0.95214857 0.94577085 1.        ]]
