In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns

In [None]:
# Input file with the clustered data
input_file = '../data/data_clustered.csv'

# Load the clustered data
df = pd.read_csv(input_file)

# Save a sample of the data in a new csv file.
# The sample size is capped at the number of rows so that a small dataset does
# not raise ValueError, and random_state makes the sample identical on every run.
df_sample = df.sample(n=min(1000, len(df)), random_state=42)
df_sample.to_csv('../data/clustered_data_sample.csv', index=False)

# Show the first 5 rows; this has to be the last expression of the cell,
# otherwise the notebook does not render it.
df.head()

In [None]:
def _parse_category_list(raw):
    """Turn one raw ``categories_list`` value into a list of clean category names.

    A string such as "['cs.AI', 'cs.LG']" is stripped of its brackets, split on
    commas, and every item is stripped of whitespace and quotes; empty items are
    discarded. A value that is already a list is returned unchanged, which keeps
    the cell safe to re-run. Any other value (e.g. NaN) becomes an empty list.
    """
    if isinstance(raw, list):
        return raw
    if not isinstance(raw, str):
        return []
    names = (part.strip().strip("'\"") for part in raw.strip().strip('[]').split(','))
    return [name for name in names if name]


# Convert string representation of list to actual list (quotes removed)
df['categories_list'] = df['categories_list'].apply(_parse_category_list)

# Split categories_list into separate columns; shorter lists are padded with
# missing values. Building the frame from a list of lists avoids the slow
# row-by-row `apply(pd.Series)` path.
categories_df = pd.DataFrame(df['categories_list'].tolist(), index=df.index)

# Rename columns to category_1 ... category_N
max_columns = categories_df.shape[1]
categories_df.columns = [f'category_{i+1}' for i in range(max_columns)]

# Concatenate the original data with the new category columns. Any category_N
# columns left over from a previous run of this cell are dropped first so the
# frame never ends up with duplicated column names.
stale_columns = df.filter(regex=r'^category_\d+$').columns
df = pd.concat([df.drop(columns=stale_columns), categories_df], axis=1)

# Display the updated data
df.head()

In [None]:
# Concatenate all category_N columns to form a single series. The columns are
# discovered by name instead of assuming there are exactly 13 of them, so the
# cell neither fails on fewer columns nor ignores additional ones.
category_columns = df.filter(regex=r'^category_\d+$').columns
all_categories = pd.concat([df[column] for column in category_columns]).dropna()

# Count the occurrences of each category
category_counts = all_categories.value_counts()

# Keep the 20 most frequent categories
top_categories = category_counts.head(20)

# Plot the top 20 categories as horizontal bars, most frequent at the top
fig, ax = plt.subplots(figsize=(12, 8))
top_categories.plot(kind='barh', color='skyblue', ax=ax)
ax.set_title("Top 20 Categories in the Dataset")
ax.set_xlabel("Number of Papers")
ax.set_ylabel("Category")
ax.invert_yaxis()
plt.show()

In [None]:
# Plot the 10 most frequent categories of every cluster.
# Cluster ids are taken from the data rather than assumed to be 0..8, and all
# category_N columns are used rather than only the first nine of them.
category_columns = df.filter(regex=r'^category_\d+$').columns

for cluster_id in sorted(df['cluster'].dropna().unique()):
    cluster = df[df['cluster'] == cluster_id]
    # Stack every category column of this cluster into one series
    cluster_categories = pd.concat([cluster[column] for column in category_columns]).dropna()
    cluster_category_counts = cluster_categories.value_counts()
    cluster_top_categories = cluster_category_counts.head(10)

    fig, ax = plt.subplots(figsize=(12, 8))
    cluster_top_categories.plot(kind='barh', color='skyblue', ax=ax)
    ax.set_title(f"Top 10 Categories in Cluster {cluster_id}")
    ax.set_xlabel("Number of Papers")
    ax.set_ylabel("Category")
    ax.invert_yaxis()  # most frequent category at the top
    plt.show()

In [None]:
def get_top_terms_in_cluster(cluster_num, top_n=10, data=None):
    """Return the most frequent whitespace-separated terms of one cluster.

    Parameters
    ----------
    cluster_num
        Value of the 'cluster' column that selects the rows to analyse.
    top_n : int, default 10
        Number of (term, count) pairs to return.
    data : pandas.DataFrame, optional
        Frame with 'cluster' and 'text' columns. Defaults to the notebook-level
        ``df`` so existing calls keep working unchanged.

    Returns
    -------
    list[tuple[str, int]]
        The ``top_n`` most common terms with their counts, most frequent first.
        Empty when the cluster has no text.
    """
    frame = df if data is None else data
    # Missing texts are skipped; they would otherwise make str.join/split fail.
    cluster_texts = frame.loc[frame['cluster'] == cluster_num, 'text'].dropna().astype(str)
    # Count document by document instead of first joining everything into one
    # huge string; the resulting counts and tie order are the same.
    term_counts = Counter(term for text in cluster_texts for term in text.split())
    return term_counts.most_common(top_n)

# Report the most common terms of every cluster, in order of first appearance
# of the cluster id in df.
for cluster_num in df['cluster'].unique():
    top_terms = get_top_terms_in_cluster(cluster_num)
    print(f"Cluster {cluster_num}: {top_terms}")

In [None]:
# Bar chart with one bar per cluster, showing how many rows of df belong to it
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='cluster', data=df, ax=ax)
ax.set_title('Distribution of Articles Across Clusters')
plt.show()

In [None]:
# Derive the calendar year of every row from its 'update_date' value
df['year'] = pd.to_datetime(df['update_date']).dt.year

# Number of rows per (year, cluster) pair as a long-format table with a
# 'counts' column
cluster_popularity = (
    df.groupby(['year', 'cluster'])
    .size()
    .reset_index(name='counts')
)

# One line per cluster, with a marker on every yearly data point
fig, ax = plt.subplots(figsize=(15, 7))
sns.lineplot(
    data=cluster_popularity,
    x='year',
    y='counts',
    hue='cluster',
    style='cluster',
    palette='tab10',
    linewidth=2.5,
    markers=True,
    dashes=False,
    ax=ax,
)
ax.set_title('Yearly Publications by Cluster')
plt.show()

In [None]:
# Explode the DataFrame to have one category per row. 'categories_list' already
# holds lists at this point, so no string splitting is needed here.
df_categories = df.explode('categories_list')

# Normalise the labels: remove surrounding whitespace and quotes so the legend
# shows cs.AI rather than 'cs.AI'. Non-string values (e.g. NaN produced by an
# empty list) are left untouched.
df_categories['categories_list'] = df_categories['categories_list'].map(
    lambda label: label.strip().strip("'\"") if isinstance(label, str) else label
)
# Discard rows without a usable category (missing or empty label)
has_category = df_categories['categories_list'].notna() & (df_categories['categories_list'] != '')
df_categories = df_categories[has_category]

# Group by year and category, then count the number of articles
category_yearly_counts = (
    df_categories.groupby(['year', 'categories_list'])
    .size()
    .reset_index(name='counts')
)

# Keep only the N categories with the most articles over all years
N = 10
top_categories = category_yearly_counts.groupby('categories_list')['counts'].sum().nlargest(N).index
filtered_counts = category_yearly_counts[category_yearly_counts['categories_list'].isin(top_categories)]

# One line per top category
fig, ax = plt.subplots(figsize=(15, 7))
sns.lineplot(data=filtered_counts, x='year', y='counts', hue='categories_list', palette='tab10', linewidth=2.5, ax=ax)
ax.set_title('Yearly Publications by Top Categories')
plt.show()


In [None]:
# Month number of every row's 'update_date'; the year is ignored on purpose so
# that the same month of different years is pooled together
df['month'] = pd.to_datetime(df['update_date']).dt.month

# Rows per month, ordered by month number, as a single-column frame 'count'
month_counts = df['month'].value_counts().sort_index().to_frame(name='count')
print(month_counts)

# Heatmap with one annotated cell per month; the figure is also written to disk
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(month_counts, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_title('Number of Articles Published Each Month')
fig.savefig('../images/num_articles_each_month.png')
plt.show()


In [None]:
# Percentage change of 'counts' relative to the previous row of the same
# cluster. pct_change compares consecutive rows, so the first row of every
# cluster is NaN and a year missing from the table is not treated as zero.
counts_by_cluster = cluster_popularity.groupby('cluster')['counts']
cluster_popularity['yearly_growth'] = counts_by_cluster.pct_change() * 100

# One growth curve per cluster
fig, ax = plt.subplots(figsize=(15, 7))
sns.lineplot(
    data=cluster_popularity,
    x='year',
    y='yearly_growth',
    hue='cluster',
    palette='tab10',
    linewidth=2.5,
    ax=ax,
)
ax.set_title('Yearly Growth Rate of Clusters')
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine all texts in each cluster into one document per cluster.
# Missing texts are skipped so that ' '.join does not fail on NaN values.
cluster_texts = df.groupby('cluster')['text'].apply(
    lambda texts: ' '.join(texts.dropna().astype(str))
)

# Compute TF-IDF vectors for each cluster's combined texts
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cluster_texts)

# Compute cosine similarity between clusters
similarity_matrix = cosine_similarity(tfidf_matrix)

# Label the axes with the actual cluster ids. Row/column k of the matrix
# corresponds to the k-th entry of the groupby index, so taking the labels from
# that index keeps them correct for any number of clusters and any numbering.
cluster_labels = [str(label) for label in cluster_texts.index]

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    similarity_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    xticklabels=cluster_labels,
    yticklabels=cluster_labels,
    ax=ax,
)
ax.set_title('Cosine Similarity Between Clusters')
ax.set_xlabel('Clusters')
ax.set_ylabel('Clusters')
plt.show()


In [None]:
# Calendar year of every row, taken from the 'update_date' column
df['year'] = pd.to_datetime(df['update_date']).dt.year

# Rows per year, listed in chronological order
articles_per_year = df['year'].value_counts()
year_counts = articles_per_year.sort_index()
print(year_counts)

# Rows per cluster within each year
clusters_grouped_by_year = df.groupby('year')['cluster']
cluster_year_counts = clusters_grouped_by_year.value_counts()
print(cluster_year_counts)


In [None]:
# A paper is treated as interdisciplinary when it has more than one category.
# NOTE(review): relies on a 'num_categories' column already being present in
# df; it is not created in the cells shown here - confirm it comes from the CSV.
has_multiple_categories = df['num_categories'] > 1
interdisciplinary_papers = df[has_multiple_categories]

# Interdisciplinary papers per cluster, ordered by cluster id
papers_per_cluster = interdisciplinary_papers['cluster'].value_counts()
interdisciplinary_counts = papers_per_cluster.sort_index()

fig, ax = plt.subplots(figsize=(12, 6))
interdisciplinary_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Number of Interdisciplinary Papers by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Count')
plt.show()

In [None]:
# WordCloud is used below but is not imported in the cells above, so a fresh
# kernel would fail with NameError without this import.
from wordcloud import WordCloud

# Draw one word cloud per cluster from the concatenated texts of that cluster
for cluster_num in df['cluster'].unique():
    # Missing texts are skipped; ' '.join would raise TypeError on NaN values
    cluster_rows = df.loc[df['cluster'] == cluster_num, 'text'].dropna().astype(str)
    cluster_text = ' '.join(cluster_rows)
    wordcloud = WordCloud(width=1000, height=500, background_color='white').generate(cluster_text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Word Cloud for Cluster {cluster_num}')
    plt.show()
