#### Reimplement Task 1

In [1]:
import PyPDF2

#Extracting
def extract_text_from_pdf(pdf_file):
    text = ""
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)  # Use PdfReader instead of PdfFileReader
        for page_num in range(len(reader.pages)):  # reader.pages gives a list of pages
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

book1 = extract_text_from_pdf(r"C:\Users\Admin\Downloads\#Semester 7\NLP - Natural Language Processing\Gr homework\group 4\NLP-group-4\Data\1. Harry Potter and the Philosopher's Stone.pdf")
book2 = extract_text_from_pdf(r"C:\Users\Admin\Downloads\#Semester 7\NLP - Natural Language Processing\Gr homework\group 4\NLP-group-4\Data\2. Harry Potter and the Chamber of Secrets.pdf")
book3 = extract_text_from_pdf(r"C:\Users\Admin\Downloads\#Semester 7\NLP - Natural Language Processing\Gr homework\group 4\NLP-group-4\Data\3. Harry Potter and the Prisoner of Azkaban.pdf")
book4 = extract_text_from_pdf(r"C:\Users\Admin\Downloads\#Semester 7\NLP - Natural Language Processing\Gr homework\group 4\NLP-group-4\Data\4. Harry Potter and the Goblet.pdf")

In [2]:
# Text Preprocessing:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

processed_text1 = preprocess_text(book1)
processed_text2 = preprocess_text(book2)
processed_text3 = preprocess_text(book3)
processed_text4 = preprocess_text(book4)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import time
from sklearn.feature_extraction.text import CountVectorizer

def ensure_string(text):
    if isinstance(text, list):
        return ' '.join(text)  

def build_term_document_matrix(documents):
    vectorizer = CountVectorizer() 
    X = vectorizer.fit_transform(documents)  
    return X, vectorizer.get_feature_names_out()  

documents = [
    ensure_string(processed_text1),
    ensure_string(processed_text2),
    ensure_string(processed_text3),
    ensure_string(processed_text4)
]

start_time = time.time()

term_document_matrix, terms = build_term_document_matrix(documents)

execution_time = time.time() - start_time


print("Term-Document Matrix (Raw Frequency):")
print(term_document_matrix.toarray()) 

print("\nTerms (Features):")
print(terms)  

print(f"\nExecution Time: {execution_time} seconds")

Term-Document Matrix (Raw Frequency):
[[ 1  1  1 ...  2  0  1]
 [ 1  1  1 ...  0  0  0]
 [ 1  1  1 ...  3  0  0]
 [ 1  1  1 ... 11  1  3]]

Terms (Features):
['10' '100' '101' ... 'zooming' 'éclair' 'éclairs']

Execution Time: 0.6097168922424316 seconds


In [4]:
# Euclidean Distance
from sklearn.metrics.pairwise import euclidean_distances

euclidean_dist = euclidean_distances(term_document_matrix)
print("Euclidean Distance Matrix:\n", euclidean_dist)

Euclidean Distance Matrix:
 [[   0.          916.1129843  1463.21256146 3420.60564813]
 [ 916.1129843     0.         1134.279507   2952.26082859]
 [1463.21256146 1134.279507      0.         2456.55042692]
 [3420.60564813 2952.26082859 2456.55042692    0.        ]]


In [5]:
# Consine similarity
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(term_document_matrix)
print("Cosine Similarity Matrix:\n", cosine_sim)

Cosine Similarity Matrix:
 [[1.         0.94329679 0.9416178  0.93467331]
 [0.94329679 1.         0.95078986 0.95214857]
 [0.9416178  0.95078986 1.         0.94577085]
 [0.93467331 0.95214857 0.94577085 1.        ]]


In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

documents = [' '.join(processed_text1), 
             ' '.join(processed_text2), 
             ' '.join(processed_text3), 
             ' '.join(processed_text4)]

representative_words = ['wizard', 'magical', 'harry', 'spells', 'witch', 'journey']

# Create a CountVectorizer with the selected words
vectorizer = CountVectorizer(vocabulary = representative_words)
term_document_matrix = vectorizer.fit_transform(documents).toarray()

df = pd.DataFrame(term_document_matrix, columns = representative_words)
print("Term-Document Matrix for Selected Words:\n", df)

Term-Document Matrix for Selected Words:
    wizard  magical  harry  spells  witch  journey
0      42       11   1308       6     12       11
1      47       16   1633       9     16        5
2      39       31   2035       2     42        6
3      83      125   3134      24     37       10


In [7]:
# Compute cosine similarity
cosine_sim = cosine_similarity(term_document_matrix)
print("Cosine Similarity Matrix for Selected Words:\n", cosine_sim)

Cosine Similarity Matrix for Selected Words:
 [[1.         0.99997861 0.99980603 0.99946822]
 [0.99997861 1.         0.9998701  0.99954149]
 [0.99980603 0.9998701  1.         0.999609  ]
 [0.99946822 0.99954149 0.999609   1.        ]]


In [8]:
# Reduce dimensionality using Truncated SVD
n_components = 2
svd = TruncatedSVD(n_components=n_components)
reduced_embeddings = svd.fit_transform(term_document_matrix)

print("Reduced Embeddings:\n", reduced_embeddings)

Reduced Embeddings:
 [[1308.55565611  -23.64973942]
 [1633.60579063  -26.99254366]
 [2035.79150708  -26.59805943]
 [3137.64510112   41.17428047]]


#### Main task 2: Implement tf-idf to replace the raw frequency of the term-doc matrix in Question 1. Recompute the similarity between 4 Harry Potter books with cosine similarity metric. 

The part combines both ways for easy comparison 

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocessed documents (from your previous code)
documents = [' '.join(processed_text1), 
             ' '.join(processed_text2), 
             ' '.join(processed_text3), 
             ' '.join(processed_text4)]

# 1. Raw Frequency (CountVectorizer)
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents).toarray()

# Calculate Cosine Similarity with Raw Frequency
cosine_sim_raw = cosine_similarity(count_matrix)

# Convert to DataFrame for better readability
count_df = pd.DataFrame(count_matrix, columns=count_vectorizer.get_feature_names_out())

print("Term-Document Matrix with Raw Frequency:")
print(count_df)
print("\nCosine Similarity Matrix with Raw Frequency:")
print(pd.DataFrame(cosine_sim_raw, index=[f'Book {i+1}' for i in range(4)], columns=[f'Book {i+1}' for i in range(4)]))


# 2. TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents).toarray()

# Calculate Cosine Similarity with TF-IDF
cosine_sim_tfidf = cosine_similarity(tfidf_matrix)

# Convert to DataFrame for better readability
tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf_vectorizer.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_df)
print("\nCosine Similarity Matrix with TF-IDF:")
print(pd.DataFrame(cosine_sim_tfidf, index=[f'Book {i+1}' for i in range(4)], columns=[f'Book {i+1}' for i in range(4)]))


Term-Document Matrix with Raw Frequency:
   10  100  101  102  103  104  105  106  107  108  ...  zombie  zonko  \
0   1    1    1    1    1    1    1    1    0    1  ...       2      0   
1   1    1    1    1    1    1    1    1    0    1  ...       0      0   
2   1    1    1    1    1    1    1    1    0    1  ...       1     11   
3   1    1    1    1    1    1    1    0    1    1  ...       0      1   

   zontal  zoo  zoological  zoom  zoomed  zooming  éclair  éclairs  
0       0    7           0     1       1        2       0        1  
1       0    2           0     0       2        0       0        0  
2       0    0           0     0       9        3       0        0  
3       1    0           1     4       9       11       1        3  

[4 rows x 15458 columns]

Cosine Similarity Matrix with Raw Frequency:
          Book 1    Book 2    Book 3    Book 4
Book 1  1.000000  0.943297  0.941618  0.934673
Book 2  0.943297  1.000000  0.950790  0.952149
Book 3  0.941618  0.950790  1.

- Raw Frequency: This matrix is based on the frequency of the words in each document, but does not take into account the importance of the word in each document, resulting in common words that can blur the differences between books.
=> Tends to give higher similarity values for documents that share a lot of common words, even if those words are not particularly important or representative.
- TF-IDF:When using TF-IDF, words that appear more in one document but less in other documents will have a higher weight. This helps improve the distinction between books
=> Lowers the similarity for common words and focuses more on terms that are specific to a given document, resulting in a more meaningful similarity measure.

#### 3. Implement Pointwise Mutual Information (PMI) to replace the raw frequency of the term-doc matrix in Question 1. Recompute the similarity between 4 Harry Potter books with cosine similarity metric. 

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Create the term-document frequency matrix
def build_term_document_matrix(documents):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(documents)
    return X.toarray(), vectorizer.get_feature_names_out()

# Step 2: Calculate PMI for the term-document matrix
def compute_pmi(term_doc_matrix):
    # Total word occurrences in all documents
    total_word_count = np.sum(term_doc_matrix)
    
    # Calculate the frequency of each word (P(w))
    word_freq = np.sum(term_doc_matrix, axis=0)
    
    # Calculate the frequency of each document (P(d))
    doc_freq = np.sum(term_doc_matrix, axis=1)
    
    # Calculate joint probability P(w, d) = frequency of word w in document d / total word occurrences
    joint_prob = term_doc_matrix / total_word_count
    
    # Calculate PMI
    pmi_matrix = np.zeros_like(term_doc_matrix, dtype=float)
    
    for i in range(term_doc_matrix.shape[0]):  # Iterate over documents
        for j in range(term_doc_matrix.shape[1]):  # Iterate over words
            if term_doc_matrix[i, j] > 0:
                # Compute PMI(w, d) = log2(P(w, d) / (P(w) * P(d)))
                p_w = word_freq[j] / total_word_count  # Probability of word w
                p_d = doc_freq[i] / total_word_count  # Probability of document d
                pmi_value = np.log2(joint_prob[i, j] / (p_w * p_d))
                pmi_matrix[i, j] = max(pmi_value, 0)  # Apply PPMI (Positive PMI) to remove negative values
    
    return pmi_matrix

# Step 3: Compute cosine similarity for the PMI matrix
def cosine_similarity_pmi(documents):
    term_doc_matrix, terms = build_term_document_matrix(documents)
    pmi_matrix = compute_pmi(term_doc_matrix)
    cosine_sim_pmi = cosine_similarity(pmi_matrix)
    return cosine_sim_pmi, pmi_matrix, terms

# Step 4: Integrate the steps and compare 4 books
documents = [
    ' '.join(processed_text1),
    ' '.join(processed_text2),
    ' '.join(processed_text3),
    ' '.join(processed_text4)
]

# Calculate cosine similarity using the PMI matrix
cosine_sim_pmi, pmi_matrix, terms = cosine_similarity_pmi(documents)

# Display the PMI matrix
print("\nPMI Matrix:")
pmi_df = pd.DataFrame(pmi_matrix, columns=terms, index=[f'Doc {i+1}' for i in range(pmi_matrix.shape[0])])
print(pmi_df)

# Display the cosine similarity matrix
print("\nCosine Similarity Matrix with PMI:")
cosine_df = pd.DataFrame(cosine_sim_pmi, index=[f'Book {i+1}' for i in range(4)], 
                         columns=[f'Book {i+1}' for i in range(4)])
print(cosine_df)


PMI Matrix:
             10       100       101       102       103       104       105  \
Doc 1  0.603341  0.603341  0.603341  0.603341  0.603341  0.603341  0.603341   
Doc 2  0.416718  0.416718  0.416718  0.416718  0.416718  0.416718  0.416718   
Doc 3  0.087057  0.087057  0.087057  0.087057  0.087057  0.087057  0.087057   
Doc 4  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

            106       107       108  ...    zombie     zonko    zontal  \
Doc 1  1.018379  0.000000  0.603341  ...  2.018379  0.000000  0.000000   
Doc 2  0.831756  0.000000  0.416718  ...  0.000000  0.000000  0.000000   
Doc 3  0.502094  0.000000  0.087057  ...  0.502094  1.961526  0.000000   
Doc 4  0.000000  1.276481  0.000000  ...  0.000000  0.000000  1.276481   

            zoo  zoological      zoom    zoomed   zooming    éclair   éclairs  
Doc 1  2.240771    0.000000  0.281413  0.000000  0.000000  0.000000  0.603341  
Doc 2  0.246793    0.000000  0.000000  0.000000  0.000000  0