# Arxiv Dataset Clustering
This notebook aims to cluster scholarly articles from the Arxiv dataset.

# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

## Load and Preview Data

In [2]:
# Relative path of the preprocessed dataset read by this notebook.
input_file = '../data/data_preprocessed.csv'

# Load the CSV into a DataFrame, then show its first rows as the cell output.
df = pd.read_csv(filepath_or_buffer=input_file)
df.head()

Unnamed: 0,categories,update_date,text
0,hep-ph,2008-11-26,calculation prompt diphoton production section...
1,math.CO cs.CG,2008-12-13,sparsity certifying decomposition algorithm el...
2,physics.gen-ph,2008-01-13,evolution earth moon dark matter field fluid e...
3,math.CO,2007-05-23,determinant stirling cycle number count unlabe...
4,math.CA math.FA,2013-10-15,dyadic lambda alpha lambda alpha compute lambd...


## Text Vectorization

In [None]:
def vectorize_text(df, max_features=1000):
    """Build a TF-IDF matrix from the 'text' column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column. NOTE: the column is modified in place --
        missing values are replaced with empty strings -- so the caller's frame
        is NaN-free in 'text' after this call.
    max_features : int, default 1000
        Passed to ``TfidfVectorizer(max_features=...)``; the default keeps the
        original behaviour.

    Returns
    -------
    X
        The matrix returned by ``TfidfVectorizer.fit_transform``.
    vectorizer : TfidfVectorizer
        The fitted vectorizer.
    """
    # Replace NaN values in the 'text' column with empty strings.
    # This deliberately writes back to the caller's DataFrame.
    df['text'] = df['text'].fillna('')

    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(df['text'])
    return X, vectorizer

tfidf_matrix, vectorizer = vectorize_text(df)

In [9]:
# Sample a fraction of the data (10%).
# A fixed random_state makes the sample, and the ranking printed below, reproducible.
sample_fraction = 0.1
df_sample = df.sample(frac=sample_fraction, random_state=42)

# Vectorize the sampled text using word bigrams and trigrams (ngram_range=(2,3)).
# Missing texts are replaced with empty strings here so that this cell does not
# depend on another cell having cleaned df['text'] beforehand.
vectorizer_tfidf = TfidfVectorizer(norm="l2", analyzer="word", ngram_range=(2,3), min_df=5, max_df=0.99, max_features=1500, lowercase=False)
tfidf_vectors = vectorizer_tfidf.fit_transform(df_sample['text'].fillna(''))
tfidf_terms = vectorizer_tfidf.get_feature_names_out()

# Rank the n-grams by their TF-IDF weight summed over all sampled documents.
# The column sums come back as a 1 x n_terms matrix; flatten them to 1-D.
sums = tfidf_vectors.sum(axis=0)
frequency = pd.DataFrame({'words': tfidf_terms, 'freq': np.asarray(sums).ravel()})

print("\nNumber of words:\n", frequency.sort_values('freq', ascending=False)[:20])
print(f"Data samples: {tfidf_vectors.shape[0]}")



Number of words:
                        words         freq
894           neural network  2956.289274
745           magnetic field  2903.412315
101               black hole  2484.004071
736         machine learning  1889.475077
245              dark matter  1706.234735
854              monte carlo  1606.282571
278            deep learning  1527.605227
304    differential equation  1276.674484
535                gamma ray  1128.579621
1003               power law  1119.474131
287           degree freedom  1103.094287
814               mean field  1028.609843
1132  reinforcement learning  1000.080371
112       boundary condition   971.083950
1152         result obtained   963.345883
1171            scalar field   930.532231
1316          star formation   916.473632
574       gravitational wave   875.489761
1070        quantum mechanic   856.858586
696              lie algebra   835.680642
Data samples: 228812


## Dimensionality Reduction

In [None]:
def reduce_dimensionality(X):
    """Reduce ``X`` to 50 components with truncated SVD.

    Parameters
    ----------
    X
        Input accepted by ``TruncatedSVD.fit_transform``.

    Returns
    -------
    The transformed data returned by ``TruncatedSVD.fit_transform``.
    """
    reducer = TruncatedSVD(n_components=50, random_state=42)
    return reducer.fit_transform(X)

X_reduced = reduce_dimensionality(tfidf_matrix)

## Visual Inspection using t-SNE

In [None]:
def visualize_tsne(X, sample_fraction=0.01, random_state=42):
    """Scatter-plot a 2-D t-SNE embedding of a random subsample of ``X``.

    Parameters
    ----------
    X
        Data to embed; must expose ``X.shape[0]`` and support row selection
        with an integer index array (e.g. a NumPy array).
    sample_fraction : float, default 0.01
        Fraction of the rows of ``X`` that is embedded and plotted.
    random_state : int, default 42
        Seed used for the row sample and passed to ``TSNE``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # Draw the subsample from a seeded generator so that the same rows are
    # selected on every run.
    rng = np.random.default_rng(random_state)
    sample_size = int(sample_fraction * X.shape[0])
    sample_indices = rng.choice(X.shape[0], size=sample_size, replace=False)
    X_sample = X[sample_indices]

    # Instantiating and fitting t-SNE
    tsne = TSNE(n_components=2, random_state=random_state)
    X_tsne = tsne.fit_transform(X_sample)

    # Plot the embedding. All points share one color: no cluster labels are
    # available at this stage.
    plt.figure(figsize=(12, 8))
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], marker='.', alpha=0.6)
    plt.title('Data visualization using t-SNE')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.show()


visualize_tsne(X_reduced)

## Determine Optimal Clusters

In [None]:
def determine_optimal_clusters(X, sample_fraction=0.05, sample_seed=42):
    """Score K-Means for k = 2..10 on a random subsample of ``X`` and plot the results.

    For every k the function fits ``KMeans`` on the subsample, records the
    within-cluster sum of squares (``inertia_``), the silhouette score and the
    Davies-Bouldin index, prints the latter two, and finally plots all three
    metrics against k.

    Parameters
    ----------
    X
        Data to cluster; must expose ``X.shape[0]`` and support row selection
        with an integer index array (e.g. a NumPy array).
    sample_fraction : float, default 0.05
        Fraction of the rows of ``X`` used for the evaluation.
    sample_seed : int, default 42
        Seed for the row sample, so repeated runs evaluate the same rows.

    Returns
    -------
    None. Scores are printed and the figure is displayed with ``plt.show()``.
    """
    # Seeded subsample: without a seed the scores change from run to run.
    rng = np.random.default_rng(sample_seed)
    sample_size = int(sample_fraction * X.shape[0])
    sample_indices = rng.choice(X.shape[0], size=sample_size, replace=False)
    X_sample = X[sample_indices]

    # Define a range of clusters to test
    cluster_range = range(2, 11)

    wcss = []
    silhouette_scores = []
    davies_bouldin_scores = []

    for n_clusters in cluster_range:
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
        kmeans.fit(X_sample)
        wcss.append(kmeans.inertia_)

        silhouette_avg = silhouette_score(X_sample, kmeans.labels_)
        silhouette_scores.append(silhouette_avg)

        davies_bouldin_avg = davies_bouldin_score(X_sample, kmeans.labels_)
        davies_bouldin_scores.append(davies_bouldin_avg)

        # Print the scores for this number of clusters
        print(f"For n_clusters = {n_clusters}, silhouette score is {silhouette_avg}")
        print(f"For n_clusters = {n_clusters}, Davies-Bouldin score is {davies_bouldin_avg}")

    # One panel per metric: (title, values, line color).
    panels = [
        ('Elbow Method (WCSS)', wcss, 'b'),
        ('Silhouette Score', silhouette_scores, 'g'),
        ('Davies-Bouldin Score', davies_bouldin_scores, 'r'),
    ]
    fig, axes = plt.subplots(1, len(panels), figsize=(18, 5))
    for ax, (title, values, color) in zip(axes, panels):
        ax.plot(cluster_range, values, marker='o', linestyle='-', color=color)
        ax.set_title(title)
        ax.set_xlabel('Number of clusters')
        ax.set_ylabel('Score')

    plt.show()

# Call the function to determine optimal clusters
determine_optimal_clusters(X_reduced)

## Apply Clustering

In [None]:
def apply_clustering(X, n_clusters):
    """Cluster the rows of ``X`` with K-Means.

    Parameters
    ----------
    X
        Data passed to ``KMeans.fit_predict``.
    n_clusters : int
        Number of clusters to form.

    Returns
    -------
    The cluster labels returned by ``KMeans.fit_predict``.
    """
    return KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(X)

optimal_clusters = 8  # Adjusted based on observation.
df['cluster'] = apply_clustering(X_reduced, n_clusters=optimal_clusters)

In [None]:
# Print the highest-weighted words of each cluster as a table and draw a word cloud.
def print_cluster_words(df, vectorizer, cluster_number, top_n_words):
    """Show the top words of one cluster as a table and as a word cloud.

    A fresh copy of ``vectorizer`` (same parameters, unfitted) is fitted on the
    texts of the selected cluster only, and each vocabulary term is scored by
    the sum of its weights over those texts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'cluster' and 'text'.
    vectorizer
        A scikit-learn text vectorizer. It is used only as a template: the
        object passed in is not refitted or otherwise modified.
    cluster_number
        Value of ``df['cluster']`` selecting the rows to analyse.
    top_n_words : int
        Number of rows of the word table to print.

    Returns
    -------
    None. The table is printed and the word cloud is displayed.
    """
    # Local import: clone() is not among the notebook's top-level imports.
    from sklearn.base import clone

    # Keep only the rows that belong to the requested cluster.
    cluster_df = df[df['cluster'] == cluster_number]
    text_data = cluster_df['text'].fillna('')

    # Fitting a vectorizer on zero documents raises, so report and stop instead.
    if text_data.empty:
        print(f"Cluster {cluster_number} has no documents.")
        return

    # Fit an unfitted copy so the vectorizer that produced the global matrix
    # keeps its original vocabulary and IDF weights.
    cluster_vectorizer = clone(vectorizer)
    vectorized_data = cluster_vectorizer.fit_transform(text_data)

    # Sum each term's weight directly on the sparse matrix; converting it to a
    # dense array first would need n_documents x n_terms floats of memory.
    vocab = cluster_vectorizer.get_feature_names_out()
    weights = np.asarray(vectorized_data.sum(axis=0)).ravel()

    word_counts = pd.DataFrame({'word': vocab, 'count': weights})
    word_counts = word_counts.sort_values(by='count', ascending=False)

    # Print the top n words from the DataFrame
    print(word_counts.head(top_n_words))

    # Size each word by its summed weight. Joining the vocabulary into a string
    # would make every word occur exactly once and hide the differences.
    frequencies = {term: float(weight) for term, weight in zip(vocab, weights)}
    wordcloud = WordCloud().generate_from_frequencies(frequencies)

    # Display the generated image:
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


for i in range(optimal_clusters):
    print_cluster_words(df, vectorizer, i, 10)


## Visualizing Clusters

In [None]:
def plot_clusters(df):
    """Draw a count plot of the 'cluster' column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'cluster' column.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(10,6))
    # countplot draws on the figure created above and returns its Axes.
    ax = sns.countplot(data=df, x='cluster')
    ax.set_title('Distribution of Clusters')
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Number of Samples')
    plt.show()

plot_clusters(df)

def visualize_clusters(X, clusters, sample_fraction=0.01, random_state=42):
    """Plot 2-D and 3-D t-SNE embeddings of a subsample of ``X``, colored by cluster.

    Parameters
    ----------
    X
        Data to embed; must expose ``X.shape[0]`` and support row selection
        with an integer index array (e.g. a NumPy array).
    clusters
        One cluster label per row of ``X``, in the same order. May be a NumPy
        array, a list or a pandas Series; it is always indexed by position.
    sample_fraction : float, default 0.01
        Fraction of the rows of ``X`` that is embedded and plotted.
    random_state : int, default 42
        Seed used for the row sample and passed to ``TSNE``.

    Returns
    -------
    None. Both figures are displayed with ``plt.show()``.
    """
    # Seeded subsample so the same rows are plotted on every run.
    rng = np.random.default_rng(random_state)
    sample_size = int(sample_fraction * X.shape[0])
    sample_indices = rng.choice(X.shape[0], size=sample_size, replace=False)
    X_sample = X[sample_indices]

    # Convert to an array first: indexing a pandas Series with integers looks
    # up index *labels*, which only matches row positions for a default index.
    clusters_sample = np.asarray(clusters)[sample_indices]

    # 2-D embedding
    tsne = TSNE(n_components=2, random_state=random_state)
    X_tsne = tsne.fit_transform(X_sample)

    plt.figure(figsize=(12, 8))
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], marker='.', c=clusters_sample, cmap='viridis')
    plt.title('Data visualization using t-SNE')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.show()

    # 3-D embedding of the same subsample
    tsne = TSNE(n_components=3, random_state=random_state)
    X_tsne = tsne.fit_transform(X_sample)

    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X_tsne[:, 0], X_tsne[:, 1], X_tsne[:, 2], marker='.', c=clusters_sample, cmap='viridis')
    ax.set_title('Data visualization using t-SNE')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Dimension 3')
    plt.show()


visualize_clusters(X_reduced, df['cluster'])

## Save Results

In [None]:
# Destination of the dataset including the 'cluster' column added above.
output_file = '../data/data_clustered.csv'

# Write every column of df to CSV; the DataFrame index is not written.
df.to_csv(path_or_buf=output_file, index=False)
