# Text Analytics

In [1]:
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import plotly.express as px

# Download NLTK stopwords
nltk.download('stopwords')

# Load the PDF
pdf_file = 'HAN335-E.pdf'
with open(pdf_file, 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    text = ""
    for page in reader.pages:
        text += page.extract_text()

# Text cleaning
def clean_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

cleaned_text = clean_text(text)

# Count word frequency
word_counts = Counter(cleaned_text.split())

# Convert to DataFrame
df_word_counts = pd.DataFrame(word_counts.items(), columns=['Word', 'Frequency'])

# Sort by frequency and get top 25 words
df_top_words = df_word_counts.sort_values(by='Frequency', ascending=False).head(50)

# Create a Plotly bar chart
fig = px.bar(df_top_words, x='Word', y='Frequency', title='Top 25 Most Frequent Words')
fig.show()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Information Extraction

In [2]:
import spacy

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

# Process the text
doc = nlp(text)

# Extract Named Entities
entities = [(ent.text, ent.label_) for ent in doc.ents]

# Convert to DataFrame for better visualization
df_entities = pd.DataFrame(entities, columns=['Entity', 'Label'])

# Display the extracted entities
print(df_entities)

# Optional: Show the most common entities
df_common_entities = df_entities['Entity'].value_counts().head(50)
print("Most Common Entities:")
print(df_common_entities)

                                          Entity     Label
0                                           44th   ORDINAL
1                                     PARLIAMENT       ORG
2                                            1st   ORDINAL
3     House of Commons Debates\nOfficial  Report       ORG
4                                            151  CARDINAL
...                                          ...       ...
5000                               de la Chambre    PERSON
5001                         tions de la Chambre    PERSON
5002                                 La\nChambre    PERSON
5003                         la présente permis‐       GPE
5004                        the House of Commons       ORG

[5005 rows x 2 columns]
Most Common Entities:
Entity
Canada                  127
Canadians               116
Speaker                  97
House                    65
CPC                      54
first                    48
Quebec                   47
Health Canada            46
Pfizer        

# Topic Modeling

In [3]:
import gensim
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import math
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis


# Ensure that the NLTK punkt tokenizer is available
nltk.download('punkt')

# Set the number of chunks
N = 1000  # You can adjust this number as needed

# Split the text into N equally sized chunks
def chunk_text(text, num_chunks):
    chunk_size = math.ceil(len(text) / num_chunks)
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

# Chunk the text
chunks = chunk_text(text, N)

# Preprocess the text
stop_words = set(stopwords.words('english'))

def preprocess(doc):
    # Tokenize the document
    tokens = word_tokenize(doc.lower())
    # Remove stopwords and non-alphabetic words
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Apply preprocessing to each chunk
processed_chunks = [preprocess(chunk) for chunk in chunks]

# Create a dictionary representation of the chunks
dictionary = corpora.Dictionary(processed_chunks)

# Convert chunks into the bag-of-words format
corpus = [dictionary.doc2bow(chunk) for chunk in processed_chunks]

# Apply LDA (Latent Dirichlet Allocation) for topic modeling
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=50)

# Print the topics in a more human-readable format
for idx, topic in lda_model.print_topics(-1):
    print(f"\nTopic {idx + 1}:")
    print("="*30)
    words = topic.split(" + ")
    for word in words:
        weight, term = word.split("*")
        print(f"{term.strip()} ({float(weight):.4f})")
        
# Visualize the topics using pyLDAvis
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Topic 1:
"bill" (0.0150)
"speaker" (0.0130)
"report" (0.0120)
"act" (0.0110)
"house" (0.0100)
"canada" (0.0100)
"time" (0.0090)
"commons" (0.0090)
"motion" (0.0090)
"government" (0.0080)

Topic 2:
"speaker" (0.0100)
"poilievre" (0.0090)
"canadians" (0.0070)
"trudeau" (0.0070)
"cpc" (0.0060)
"leader" (0.0060)
"members" (0.0060)
"opposition" (0.0050)
"hon" (0.0050)
"people" (0.0050)

Topic 3:
"de" (0.0060)
"la" (0.0060)
"may" (0.0040)
"brampton" (0.0040)
"taylor" (0.0040)
"sidhu" (0.0030)
"modification" (0.0030)
"commons" (0.0030)
"chambre" (0.0030)
"house" (0.0030)

Topic 4:
"minister" (0.0170)
"prime" (0.0140)
"canada" (0.0100)
"hon" (0.0090)
"government" (0.0080)
"canadians" (0.0080)
"right" (0.0070)
"lib" (0.0070)
"speaker" (0.0060)
"justin" (0.0050)

Topic 5:
"canada" (0.0160)
"health" (0.0150)
"ii" (0.0110)
"regard" (0.0110)
"vaccine" (0.0080)
"iii" (0.0080)
"project" (0.0080)
"safety" (0.0070)
"hc" (0.0070)
"information" (0.0070)


# Text Clustering

In [4]:
import gensim
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import math
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px

# Ensure that the NLTK punkt tokenizer is available
nltk.download('punkt')

# Set the number of chunks and top N terms to display
N = 1000  # Number of chunks
top_N_terms = 10  # Number of top terms to display per cluster

# Split the text into N equally sized chunks
def chunk_text(text, num_chunks):
    chunk_size = math.ceil(len(text) / num_chunks)
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

# Chunk the text
chunks = chunk_text(text, N)

# Preprocess the text
stop_words = set(stopwords.words('english'))

def preprocess(doc):
    # Tokenize the document
    tokens = word_tokenize(doc.lower())
    # Remove stopwords and non-alphabetic words
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to each chunk
processed_chunks = [preprocess(chunk) for chunk in chunks]

# Convert the text chunks into TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=1000)  # Limit to 1000 features for simplicity
X = vectorizer.fit_transform(processed_chunks)

# Apply KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust the number of clusters as needed
kmeans.fit(X)

# Reduce dimensionality with PCA for 3D plotting
pca = PCA(n_components=3)
components = pca.fit_transform(X.toarray())

# Create a DataFrame for visualization
df = pd.DataFrame({
    'x': components[:, 0],
    'y': components[:, 1],
    'z': components[:, 2],
    'cluster': kmeans.labels_
})

# Visualize the clusters in 3D
fig = px.scatter_3d(df, x='x', y='y', z='z', color='cluster', title='Text Clusters Visualization')
fig.show()

# Print the top N terms from each cluster
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

print("\nTop terms per cluster:")
for i in range(kmeans.n_clusters):
    print(f"\nCluster {i + 1}:")
    print("="*30)
    for ind in order_centroids[i, :top_N_terms]:
        print(f"{terms[ind]}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md





Top terms per cluster:

Cluster 1:
trudeau
canadians
speaker
poilievre
government
members
commons
thank
also
would

Cluster 2:
canada
regard
ii
broken
project
question
emissions
data
iii
government

Cluster 3:
bill
act
motion
speaker
house
read
order
time
member
standing

Cluster 4:
health
canada
hc
vaccine
safety
pfizer
dna
vaccines
answer
information

Cluster 5:
prime
minister
hon
right
justin
trudeau
leader
er
lib
opposition
