In [43]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import plotly.express as px

In [44]:
# Root folders of the two corpus splits, given relative to the working directory.
# extract_data() walks each one expecting one sub-folder per author.
train_dir = 'ReutersC50/C50train'
test_dir = 'ReutersC50/C50test'

def extract_data(folder_path, label, encoding=None):
    """Read every text file under ``folder_path`` into a flat list of line records.

    The folder is expected to hold one sub-directory per author, each containing
    that author's article files. Authors and files are visited in sorted order,
    so the returned list is deterministic.

    Parameters
    ----------
    folder_path : str
        Root directory containing one sub-directory per author.
    label : str
        Tag stored with every record (e.g. ``'train'`` or ``'test'``).
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform's default encoding, as before.

    Returns
    -------
    list of list
        One ``[line, author, file, label]`` record per line of every file.
        ``line`` has surrounding whitespace stripped; blank lines are kept
        as empty strings.
    """
    data = []
    for author in sorted(os.listdir(folder_path)):
        author_dir = os.path.join(folder_path, author)
        # Skip hidden entries and stray regular files sitting next to the
        # author folders; calling os.listdir() on a file would raise.
        if author.startswith('.') or not os.path.isdir(author_dir):
            continue
        for file in sorted(os.listdir(author_dir)):
            # Hidden files (e.g. .DS_Store) are not articles and may be binary.
            if file.startswith('.'):
                continue
            file_path = os.path.join(author_dir, file)
            with open(file_path, 'r', encoding=encoding) as fp:
                data.extend([line.strip(), author, file, label] for line in fp)
    return data

# Load both splits as lists of [line, author, file, label] records; the label
# records which split each row came from.
train = extract_data(train_dir, label='train')
test = extract_data(test_dir, label='test')

In [45]:
# Both splits share the same record layout, so declare the column names once.
record_columns = ['Text', 'Author', 'File_Name', 'Label']

train = pd.DataFrame(train, columns=record_columns)
test = pd.DataFrame(test, columns=record_columns)

# Stack the two splits; the 'Label' column keeps them distinguishable.
data = pd.concat([train, test])
data

Unnamed: 0,Text,Author,File_Name,Label
0,The Internet may be overflowing with new techn...,AaronPressman,106247newsML.txt,train
1,The National Consumers League said Wednesday t...,AaronPressman,106247newsML.txt,train
2,"The league, a non-profit consumer advocacy gro...",AaronPressman,106247newsML.txt,train
3,"The site, which collects reports directly from...",AaronPressman,106247newsML.txt,train
4,"""Consumers who suspect a scam on the Internet ...",AaronPressman,106247newsML.txt,train
...,...,...,...,...
44210,The suspension followed the magazine's publica...,WilliamKazer,58312newsML.txt,test
44211,Western diplomats and Chinese analysts have de...,WilliamKazer,58312newsML.txt,test
44212,The president of the outspoken Beijing Youth D...,WilliamKazer,58312newsML.txt,test
44213,"The clampdown has been fuelled by a call for ""...",WilliamKazer,58312newsML.txt,test


In [31]:
# Training corpus: every text row of the training split is treated as one document.
X = data.loc[data['Label'] == 'train', 'Text'].tolist()

# Bag-of-words counts with English stop words removed and a vocabulary capped at 1000 terms.
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(X)

# Fit LDA with a fixed seed so the topics are reproducible between runs.
num_topics = 25
lda_model = LatentDirichletAllocation(random_state=42, n_components=num_topics)
lda_model.fit(X)

# dict_topics maps the 0-based component index to [1-based topic number, top-10 words].
dict_topics = {}
feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(lda_model.components_):
    # Indices of the ten highest-weighted terms, heaviest first.
    top_words_idx = topic.argsort()[::-1][:10]
    top_words = list(feature_names[top_words_idx])

    print(f"Topic {topic_idx + 1}:")
    print(top_words)

    dict_topics[topic_idx] = [(topic_idx + 1), top_words]

Topic 1:
['united', 'states', 'trade', 'said', 'drug', 'china', 'ban', 'department', 'colombia', 'congress']
Topic 2:
['hong', 'kong', 'china', 'said', 'tung', 'chinese', 'people', 'territory', 'rule', 'says']
Topic 3:
['internet', 'corp', 'new', 'computer', 'said', 'software', 'technology', 'microsoft', 'network', 'services']
Topic 4:
['said', 'financial', 'chairman', 'president', 'statement', 'company', 'vice', 'board', 'right', 'street']
Topic 5:
['amp', 'local', 'long', 'market', 'competition', 'service', 'phone', 'cable', 'rules', 'companies']
Topic 6:
['told', 'reuters', 'director', 'interview', 'reporters', 'quality', 'telephone', 'areas', 'conference', 'managing']
Topic 7:
['china', 'said', 'beijing', 'chinese', 'official', 'taiwan', 'officials', 'economic', 'communist', 'state']
Topic 8:
['news', 'said', 'early', 'fund', '1997', 'joint', 'year', 'venture', 'start', '1998']
Topic 9:
['000', 'tonnes', 'said', 'saying', '100', 'cocoa', 'year', 'copper', '500', 'figures']
Topic 10

In [47]:
def predict_topic(new_doc):
    """Return the 1-based number of the most probable topic for one document.

    The document is vectorized with the module-level ``vectorizer``, scored
    with the module-level ``lda_model``, and the winning component index is
    translated to its topic number through ``dict_topics``.

    When several topics share the maximum probability, the one with the
    highest index wins.
    """
    doc_counts = vectorizer.transform([new_doc])
    topic_dist = lda_model.transform(doc_counts)

    for row in topic_dist:
        peak = max(row)
        for idx, prob in enumerate(row):
            if prob != peak:
                continue
            # Later matches overwrite earlier ones, so ties go to the last index.
            topic = dict_topics[idx][0]
    return topic

# Rebuild one row per test article: 'data' holds one row per text line, so the
# lines of each (Author, File_Name) pair are concatenated back into one text.
test = data.loc[data['Label'] == 'test']
test = test.groupby(['Author', 'File_Name']).agg(
            # Join with a space: an empty separator would fuse the last word of
            # one line with the first word of the next and corrupt those tokens
            # before they reach the vectorizer.
            Text = pd.NamedAgg(column='Text', aggfunc=' '.join)
)
test = test.reset_index(drop=False)

# Assign each article the number of its most probable LDA topic.
test['Topic'] = test['Text'].apply(predict_topic)
test

Unnamed: 0,Author,File_Name,Text,Topic
0,AaronPressman,421829newsML.txt,U.S. Senators on Tuesday sharply criticized a ...,17
1,AaronPressman,424074newsML.txt,Two members of Congress criticised the Federal...,11
2,AaronPressman,42764newsML.txt,Commuters stuck in traffic on the Leesburg Pik...,3
3,AaronPressman,43033newsML.txt,A broad coalition of corporations went to Capi...,3
4,AaronPressman,433558newsML.txt,"On the Internet, where new products come and g...",3
...,...,...,...,...
2495,WilliamKazer,504283newsML.txt,China has scored new successes in its fight ag...,10
2496,WilliamKazer,504526newsML.txt,China has scored new successes in its fight ag...,10
2497,WilliamKazer,51502newsML.txt,China is on target with plans to to promote 10...,22
2498,WilliamKazer,522090newsML.txt,China may need to adjust the mix of its treasu...,16


In [78]:
# Number of test articles assigned to each topic, most frequent first.
# rename_axis + reset_index(name=...) gives the columns ['Topic', 'Value_Count']
# regardless of how the installed pandas version names the value_counts() result.
df_freq = (
    test['Topic']
    .value_counts()
    .rename_axis('Topic')
    .reset_index(name='Value_Count')
    .astype({'Topic': object})
    .sort_values('Value_Count', ascending=False)
)

# Plot df_freq itself: the previously referenced 'df_freq_sorted' is never
# defined in this notebook and raised a NameError on a fresh kernel.
fig = px.bar(df_freq, x="Topic", y="Value_Count", title='Test Set Frequency Count')
fig.update_layout(
    xaxis = dict(
        # Show a tick for every topic number that occurs.
        tickmode = 'array',
        tickvals = df_freq['Topic'],
        ticktext = df_freq['Topic'],
        title = "Identified Topic"
    ),
    yaxis_title="Frequency",
    title='Test Set Frequency Count'
)

fig.show()

In [81]:
# For every author keep only the topic assigned to most of their test articles:
# count articles per (Author, Topic), sort so that the largest count of each
# author comes last, then keep that last row per author.
df_authors = (
    test
    .groupby(['Author', 'Topic'])
    .agg(count_freq=pd.NamedAgg(column='Topic', aggfunc='count'))
    .reset_index()
    .sort_values(['Author', 'count_freq'])
    .drop_duplicates('Author', keep='last')
    .reset_index()
)

fig = px.bar(df_authors, x="Author", y="Topic", title='Most Frequent Topic')

# Label every author on the x axis explicitly.
fig.update_layout(
    xaxis={
        'tickmode': 'array',
        'tickvals': df_authors['Author'],
        'ticktext': df_authors['Author'],
        'title': "Authors",
    },
    yaxis_title="Topic",
    title='Most Frequent Topic',
)

fig.show()