In [219]:
import nltk
import chardet
import random
import os
import re
import numpy as np
import pandas as pd
from collections import Counter
import plotly.express as px 
from nltk.corpus import reuters, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# Keep NLTK resources next to the project data and tell NLTK to search there.
data_dir = "../data/"
nltk_search_dir = os.path.abspath(data_dir)
nltk.data.path.append(nltk_search_dir)
# Last expression of the cell, so the download status is what the cell displays.
nltk.download('stopwords', download_dir=data_dir)

[nltk_data] Downloading package stopwords to ../data/...
[nltk_data]   Package stopwords is already up-to-date!


True

In [220]:
# Preprocess documents
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess(doc, stop_words=None):
    """Turn a raw document into a space-separated string of lower-case content words.

    Steps: lower-case the text, replace every run of characters outside
    ``a-z`` with a single space, tokenize with ``nltk.word_tokenize`` and
    drop stop words.

    Parameters
    ----------
    doc : str
        Raw document text.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and reused across calls instead of being rebuilt for
        every document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    doc = doc.lower()
    doc = re.sub(r'[^a-z]+', ' ', doc)
    words = nltk.word_tokenize(doc)
    return ' '.join(word for word in words if word not in stop_words)

In [221]:

def detect_file_encoding(file_path):
    """Guess the text encoding of a file from its raw bytes.

    The whole file is read in binary mode and handed to ``chardet.detect``;
    the ``'encoding'`` entry of chardet's result is returned as-is.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

def create_dataframe_from_files(folder_path, extensions=None):
    """Read the files under ``folder_path`` into an (Author, Content) DataFrame.

    The tree is walked recursively with ``os.walk``. Each file's author is
    the name of the directory that directly contains it, and its content is
    decoded with the encoding reported by ``detect_file_encoding``.

    Parameters
    ----------
    folder_path : str
        Root directory to walk.
    extensions : str or iterable of str, optional
        If given, only files whose name ends with one of these suffixes
        (case-insensitive) are read. ``None`` (the default) reads every
        file, which is the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per successfully read file, with columns ``Author`` and
        ``Content``. Files that cannot be read are reported with ``print``
        and skipped.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = None if extensions is None else tuple(ext.lower() for ext in extensions)

    data = {'Author': [], 'Content': []}

    for root, dirs, files in os.walk(folder_path):
        author = os.path.basename(root)  # Get the parent folder's name
        for file in files:
            if suffixes is not None and not file.lower().endswith(suffixes):
                continue
            file_path = os.path.join(root, file)

            try:
                encoding = detect_file_encoding(file_path)
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
            except Exception as e:
                # Best effort: report the unreadable file and carry on with the rest.
                print(f"Error reading '{file_path}': {e}")
                continue

            data['Author'].append(author)
            data['Content'].append(content)

    return pd.DataFrame(data)


# Locate the training corpus relative to the shared data directory instead of
# an absolute path that only exists on one machine.
# NOTE(review): assumes `data_dir` points at the folder that contains ReutersC50 - confirm.
folder_path = os.path.join(data_dir, 'ReutersC50', 'C50train')
if not os.path.isdir(folder_path):
    raise FileNotFoundError(f"Training corpus not found at '{folder_path}'")

# Select the article files explicitly rather than dropping whichever row
# happens to come first, so a stray non-article file in the tree cannot
# displace a real document.
# NOTE(review): assumes every article in the corpus is a .txt file - confirm.
df = create_dataframe_from_files(folder_path, extensions='.txt')
if df.empty:
    raise ValueError(f"No .txt documents were read from '{folder_path}'")
# Now 'df' is a DataFrame containing the 'Author' and 'Content' columns,
# indexed 0..n-1 (no leftover 'index' column from a reset).


In [222]:
# Clean every document once; the result is the text fed to the vectorizer.
df['Tokens'] = [preprocess(raw_text) for raw_text in df['Content']]

In [223]:

# Fit TF-IDF on the cleaned text and keep a dense document-term matrix.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Tokens']).toarray()

# Two views of the vocabulary: term -> column index, and its inverse.
vocab = tfidf.vocabulary_
reverse_vocab = dict(zip(vocab.values(), vocab.keys()))

feature_names = tfidf.get_feature_names_out()
df_tfidf = pd.DataFrame(X_tfidf, columns=feature_names)

# argsort is ascending, so the last ten column indices of each row are that
# document's ten highest-weighted terms (strongest term last).
idx = X_tfidf.argsort(axis=1)
tfidf_max10 = idx[:, -10:]

top_terms = []
for column_ids in tfidf_max10:
    top_terms.append([reverse_vocab.get(column_id) for column_id in column_ids])
df_tfidf['top10'] = top_terms

df_tfidf['top10']


0       [aid, eckerd, hoven, inc, rite, offer, drugsto...
1       [krutick, miniature, proposed, boys, transacti...
2       [marcus, retailer, renfrew, isetan, holt, neim...
3       [offer, cash, stb, norfolk, wrote, shareholder...
4       [revised, cash, bid, transaction, shareholders...
                              ...                        
2495    [points, bank, gains, toronto, gold, shares, s...
2496    [tiazac, fallen, today, index, gold, ketchen, ...
2497    [declining, taking, nearly, heavy, index, meis...
2498    [canadian, golds, bank, bond, barrick, bre, to...
2499    [blocks, barrick, relative, canada, percent, l...
Name: top10, Length: 2500, dtype: object

In [224]:
# Compress the TF-IDF matrix to 50 principal components before running t-SNE.
# `X_tfidf` is the dense array built in the TF-IDF cell; the previous
# `Features` name is not defined anywhere in this notebook, so the cell
# failed on a fresh kernel. random_state makes the projection repeatable.
pca = PCA(n_components=50, random_state=42)
vectors_pca = pca.fit_transform(X_tfidf)

In [225]:
# Project the 50 PCA components to 2-D for plotting and clustering.
# `init` and `learning_rate` are pinned to the values this notebook was run
# with (the library announces that both defaults change in 1.2), and
# `random_state` makes the embedding, and everything derived from it,
# reproducible across runs.
# NOTE(review): `n_iter` is kept for the scikit-learn version used here; newer
# releases name this parameter `max_iter` - confirm before upgrading.
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=30,
    n_iter=1000,
    init='random',
    learning_rate=200.0,
    random_state=42,
)
vectors_tsne = tsne.fit_transform(vectors_pca)

# Attach author and top-10 terms to each embedded document (rows align by position/index).
df_tsne = pd.DataFrame(vectors_tsne, columns=["Component 1", "Component 2"])
df_tsne['Author'] = df['Author']
df_tsne['top10'] = df_tfidf['top10']


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2500 samples in 0.000s...
[t-SNE] Computed neighbors for 2500 samples in 0.120s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2500
[t-SNE] Computed conditional probabilities for sample 2000 / 2500
[t-SNE] Computed conditional probabilities for sample 2500 / 2500
[t-SNE] Mean sigma: 0.118788
[t-SNE] KL divergence after 250 iterations with early exaggeration: 73.273346
[t-SNE] KL divergence after 1000 iterations: 0.739169


In [226]:


# Cluster the documents on their 2-D t-SNE coordinates.
df_kk = df_tsne[['Component 1', 'Component 2']]

# Specify the number of clusters
num_clusters = 10

# random_state fixes the centroid initialisation so cluster ids (and the
# per-cluster interpretation further down) are the same on every run;
# n_init is pinned explicitly because the library default for it differs
# between scikit-learn versions.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# Fit the model to the data
kmeans.fit(df_kk)

# Store the labels as strings so plotly colours them as discrete categories
# rather than on a continuous scale.
df_tsne['Cluster'] = kmeans.labels_
df_tsne['Cluster'] = df_tsne['Cluster'].map(str)

# Visualize the clusters
px.scatter(
    df_tsne,
    x='Component 1',
    y='Component 2',
    color='Cluster',
    title='K-means clusters on the t-SNE embedding of the documents',
)




In [229]:
# Characterise each cluster by the words that most often appear among its
# documents' top-10 TF-IDF terms.
df_tsne['combined_top_words'] = df_tsne['top10'].apply(lambda words: ' '.join(words))

# Count how often each word occurs in every document's top-10 string.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_tsne['combined_top_words'])
word_count_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Tag each row with its cluster, then total the word counts per cluster.
word_count_df['Cluster'] = df_tsne['Cluster']
cluster_word_counts = word_count_df.groupby('Cluster').sum()

# For every cluster keep the five words with the largest totals.
top_words_per_cluster = {}
for cluster in cluster_word_counts.index:
    ranked_counts = cluster_word_counts.loc[cluster].sort_values(ascending=False)
    top_words_per_cluster[cluster] = ranked_counts.index[:5].tolist()

# Report the result, one line per cluster.
for cluster, top_words in top_words_per_cluster.items():
    print(f"Cluster {cluster}: {', '.join(top_words)}")

Cluster 0: kong, hong, china, tung, legislature
Cluster 1: said, profit, gold, fund, francs
Cluster 2: bank, banks, china, banking, japan
Cluster 3: toronto, boeing, klaus, czech, air
Cluster 4: pounds, million, pence, profits, stg
Cluster 5: mci, bt, amp, distance, cable
Cluster 6: gm, workers, plant, uaw, plants
Cluster 7: tonnes, oil, russia, russian, colombia
Cluster 8: quarter, internet, computer, software, sales
Cluster 9: china, beijing, wang, chinese, taiwan


In [227]:
# How many articles does each author have in each cluster?
# The count is taken over 'Component 1' only because it is present on every row;
# the resulting Series is indexed by (Author, Cluster).
author_clusters = df_tsne.groupby(['Author', 'Cluster'])['Component 1'].count()

# Flatten for plotting and give the count column a truthful name, so the colour
# scale reads "Articles" instead of "Component 1".
author_cluster_counts = author_clusters.reset_index(name='Articles')

px.scatter(author_cluster_counts, x='Author', y='Cluster', color='Articles')

In [238]:
# For each author pick the cluster holding most of their articles, then
# describe that cluster by its top words.
cluster_for_author = author_clusters.unstack().idxmax(axis=1)
for author, dominant_cluster in cluster_for_author.items():
    cluster_words = top_words_per_cluster[dominant_cluster]
    print(author, 'writes most frequently about', cluster_words, 'topics')

AaronPressman writes most frequently about ['quarter', 'internet', 'computer', 'software', 'sales'] topics
AlanCrosby writes most frequently about ['toronto', 'boeing', 'klaus', 'czech', 'air'] topics
AlexanderSmith writes most frequently about ['said', 'profit', 'gold', 'fund', 'francs'] topics
BenjaminKangLim writes most frequently about ['china', 'beijing', 'wang', 'chinese', 'taiwan'] topics
BernardHickey writes most frequently about ['said', 'profit', 'gold', 'fund', 'francs'] topics
BradDorfman writes most frequently about ['quarter', 'internet', 'computer', 'software', 'sales'] topics
DarrenSchuettler writes most frequently about ['toronto', 'boeing', 'klaus', 'czech', 'air'] topics
DavidLawder writes most frequently about ['gm', 'workers', 'plant', 'uaw', 'plants'] topics
EdnaFernandes writes most frequently about ['pounds', 'million', 'pence', 'profits', 'stg'] topics
EricAuchard writes most frequently about ['quarter', 'internet', 'computer', 'software', 'sales'] topics
Fumik