In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
# Path of the clustered dataset, relative to this notebook
input_file = '../data/data_clustered.csv'

# Read the CSV into the DataFrame used by every later cell
df = pd.read_csv(filepath_or_buffer=input_file)

# Preview the first 5 rows
df.head(n=5)

In [None]:
# Analyze the distribution of clusters
cluster_counts = df['cluster'].value_counts().sort_index()
print(cluster_counts)

# Build the joined text of every cluster once, so the word clouds and the
# frequency table below are computed from the same input.
# Missing 'text' values are dropped and the rest coerced to str, because
# str.join raises TypeError when it meets a non-string such as NaN.
cluster_texts = {
    cluster: ' '.join(df.loc[df['cluster'] == cluster, 'text'].dropna().astype(str))
    for cluster in cluster_counts.index
}

# Generate word clouds for each cluster
for cluster, text in cluster_texts.items():
    if not text.strip():
        # WordCloud.generate needs at least one word; skip instead of crashing.
        print(f'Cluster {cluster}: no text available, skipping word cloud')
        continue
    wordcloud = WordCloud(width=800, height=400).generate(text)
    fig = plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Cluster {cluster}')
    plt.show()
    # Close explicitly so figures do not accumulate in memory across the loop.
    plt.close(fig)

# Five most frequent whitespace-separated tokens per cluster, shown as a table
# where each row is a cluster and each column is a rank ("word (count)").
top_words = {
    cluster: [f'{word} ({count})' for word, count in Counter(text.split()).most_common(5)]
    for cluster, text in cluster_texts.items()
}
top_words_table = pd.DataFrame.from_dict(top_words, orient='index')
top_words_table.columns = [f'word_{rank}' for rank in range(1, top_words_table.shape[1] + 1)]
top_words_table.index.name = 'cluster'
top_words_table

In [None]:
# Derive each article's year from its 'update_date' value and store it on df
df['year'] = pd.to_datetime(df['update_date']).dt.year

# Count articles per year, order the years ascending, and expose the result
# as a single-column DataFrame named 'count'
year_counts = (
    df['year']
    .value_counts()
    .sort_index()
    .to_frame(name='count')
)
print(year_counts)

# Bar chart of the yearly article counts
year_counts.plot.bar(figsize=(15, 8), title='Distribution of articles over time')
plt.show()

In [None]:
# Show the most common primary categories of each year, one bar chart per year.
# The grid covers GRID_ROWS * GRID_COLS consecutive years starting at FIRST_YEAR.
FIRST_YEAR = 2008
GRID_ROWS, GRID_COLS = 4, 4
TOP_N_CATEGORIES = 5

fig, axes = plt.subplots(GRID_ROWS, GRID_COLS, figsize=(15, 20))
for offset, ax in enumerate(axes.flatten()):
    year = FIRST_YEAR + offset
    top_in_year = (
        df.loc[df['year'] == year, 'primary_category']
        .value_counts()
        .head(TOP_N_CATEGORIES)
    )
    if top_in_year.empty:
        # Bar-plotting an empty Series fails in pandas; blank the panel instead.
        ax.set_title(f'Year {year} (no data)')
        ax.axis('off')
        continue
    top_in_year.plot(kind='bar', ax=ax, title=f'Year {year}')
    # Label every panel: plt.xlabel/plt.ylabel would only touch the last axes.
    ax.set_xlabel('Primary category')
    ax.set_ylabel('Number of articles')
# Run tight_layout after all labels exist so they are included in the spacing.
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the number of articles assigned to each cluster
cluster_ax = cluster_counts.plot.bar()
cluster_ax.set_xlabel('Cluster')
cluster_ax.set_ylabel('Number of Articles')
cluster_ax.set_title('Distribution of Clusters')
plt.show()

In [None]:
# Bar chart of the number of articles per year
year_ax = year_counts.plot.bar()
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of Articles')
year_ax.set_title('Distribution of Articles Over Time')
plt.show()

In [None]:
# Visualize the distribution of clusters over time
# Count articles per (year, cluster) pair. This is computed here because no
# earlier cell defines cluster_year_counts, so a fresh-kernel run would
# otherwise stop with a NameError. Relies on the 'year' column added to df
# by the year-extraction cell above.
cluster_year_counts = df.groupby(['year', 'cluster']).size()

# Rows become years and columns become clusters; (year, cluster) pairs with
# no articles are filled with 0 so every stacked bar has a complete set of segments.
cluster_year_counts.unstack(fill_value=0).plot(kind='bar', stacked=True, figsize=(12, 7))
plt.xlabel('Year')
plt.ylabel('Number of Articles')
plt.title('Distribution of Clusters Over Time')
plt.legend(title='Cluster')
plt.show()

In [None]:
# Stacked bar chart of yearly article counts for the 10 most frequent primary categories
top_categories = df['primary_category'].value_counts().head(10).index
in_top_categories = df['primary_category'].isin(top_categories)

# One row per year, one column per category; combinations that never occur stay NaN
category_year_counts = (
    df[in_top_categories]
    .groupby(['year', 'primary_category'])
    .size()
    .unstack()
)

category_ax = category_year_counts.plot.bar(stacked=True, figsize=(12, 7))
category_ax.set_xlabel('Year')
category_ax.set_ylabel('Number of Articles')
category_ax.set_title('Distribution of Primary Categories Over Time')
category_ax.legend(title='Primary Category')
plt.show()