# Arxiv Dataset Clustering
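This notebook clusters scholarly articles from the arXiv dataset: the article text is vectorized with TF-IDF, reduced with PCA, grouped with K-Means, and the resulting clusters are visualized and saved to CSV.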

## Import Necessary Libraries

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns


## Load and Preview Data

In [None]:
# Read the preprocessed dataset, report how many rows were loaded,
# and preview the first few of them.
input_file = 'data_preprocessed.csv'
df = pd.read_csv(filepath_or_buffer=input_file)
print(len(df.index))
df.head()

## Text Vectorization

In [None]:
# Base English stop words. The previous code called `stopwords.words('english')`
# (NLTK-style), but `stopwords` was never imported in this notebook, so the cell
# raised NameError on a fresh kernel. scikit-learn's built-in list is used instead.
# NOTE(review): this list is not identical to NLTK's; swap it back (and add the
# import) if the NLTK list is required to reproduce earlier results.
stop_words = set(ENGLISH_STOP_WORDS)

# Domain-specific filler words that appear across most papers regardless of topic.
# Stop words are matched against single tokens, before n-grams are assembled.
custom_stopwords = [
    "ability", "able", "absolute", "absolutely", "account", "accurate", "achieve", "address",
    "allowing", "also", "analyze", "analyzes", "answer", "application", "approach",
    "around", "art", "article", "aspect", "audience", "author", "available", "based", "begin", "best", "better",
    "beyond", "bound", "brief", "called", "capable", "capture", "carefully", "case", "certain", "challenging",
    # A missing comma previously fused "compared" and "complex" into "comparedcomplex".
    "compare", "compared", "complex", "component", "comprehensive", "concept", "conceptual",
    "conclusion", "condition", "conduct", "conjecture", "consider", "construct", "content", "context", "cost",
    "cross", "crucial", "current", "demonstrate", "derive", "derived", "describe",
    "described", "describes", "detailed", "determine", "developed", "different", "difficult", "directly",
    "discourse", "discuss", "distinguish", "driven", "due", "effect", "effective", "efficient", "efficiently",
    "eight", "element", "emphasis", "end", "enhanced", "evaluate", "even", "example", "existing", "experiment", "experimental",
    "explain", "extensive", "family", "feature", "figure", "finally", "find", "fine", "finite", "finitely", "first",
    "fit", "five", "found", "four", "form", "framework", "function", "fundamental", "future", "general", "give",
    "given", "good", "grained", "graph", "group", "handed", "high", "higher", "however", "illustrate", "impact",
    "implement", "important", "include", "included", "including", "integrate", "interest", "introduce", "introduced", "introduction",
    "investigate", "issue", "iteration", "known", "large", "last", "leading", "left", "let", "like", "long", "low", "lower", "make",
    "many", "maximal", "may", "method", "methodology", "minimal", "model", "moreover", "multiple",
    "necessary", "need", "needed", "new", "news", "next", "nine", "non", "note", "novel", "number", "numerical",
    "objective", "observables", "observation", "obtain", "obtained", "often", "one", "open", "operator", "optimal", "order",
    "outline", "outlines", "output", "paper", "papr", "parameter", "part", "particular", "perform",
    "performance", "performed", "performing", "phase", "point", "possible",
    "potential", "pre", "precisely", "present", "previous", "principle", "problem", "process", "prof",
    "proof", "proper", "property", "propose", "proposed", "proposes", "prove", "provide", "provided", "provides",
    "publicly", "publish", "purpose", "quality", "question", "range", "real", "recent", "recently",
    "recommendation", "related", "reliable", "representation", "require", "research", "result", "rev", "review",
    "right", "rigorous", "role", "scale", "scenario", "second", "section", "selection", "series", "serious", "set", "setting",
    "seven", "several", "show", "shown", "significant", "significantly", "simulation", "simple", "single", "six", "solution", "state",
    "strongly", "structure", "studied", "study", "sufficient", "suggestion", "sum", "synthesize", "system",
    "table", "take", "taken", "task", "technique", "ten", "term", "theorem", "theory", "third",
    "though", "three", "thus", "time", "topic", "two", "type", "upper", "use", "used", "using", "utilize", "valid",
    "value", "variable", "variety", "various", "via", "view", "way", "well", "whether", "wide", "widely", "within",
    "without", "work", "world", "written", "year", "zero", "zeroth",
]
stop_words.update(custom_stopwords)

tfidf_vectorizer = TfidfVectorizer(
    # Bigrams only. NOTE(review): the earlier comment said "unigrams and bigrams",
    # which would be ngram_range=(1, 2); the code is kept as-is because the
    # downstream PCA size and cluster count were presumably tuned on it - confirm.
    ngram_range=(2, 2),
    max_features=1500,              # Limit the number of features
    # Recent scikit-learn releases validate this parameter and accept a list,
    # not a set; sorting keeps the argument deterministic between runs.
    stop_words=sorted(stop_words),
    sublinear_tf=True               # Apply sublinear term-frequency scaling
)

# Fit and transform the text data to a (sparse) TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])


In [None]:
# Show the 50 vocabulary entries with the lowest IDF, i.e. the n-grams that
# occur in the largest number of documents (argsort is ascending).
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement and already returns an ndarray.
feature_names = np.asarray(tfidf_vectorizer.get_feature_names_out())
sorted_indices = np.argsort(tfidf_vectorizer.idf_)
top_features = feature_names[sorted_indices[:50]]
top_features_df = pd.DataFrame(top_features, columns=['Top 50 Features'])
# Bare last expression: rendered with the notebook's rich DataFrame display.
top_features_df

## Dimensionality Reduction

In [None]:
# Apply PCA
pca = PCA(n_components=150)  # Choose the number of components
pca_matrix = pca.fit_transform(tfidf_matrix.toarray())

## Determine Optimal Clusters

In [None]:
def _plot_metric(scores, ylabel, color):
    """Draw one line chart of a clustering metric against the number of clusters."""
    plt.figure(figsize=(15,10))
    plt.plot(list(scores.keys()), list(scores.values()), marker='o', linestyle='-', color=color)
    plt.xlabel("Number of cluster")
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()


def check_clustering(X, K, sample_frac=0.01, random_state=42):
    """Plot SSE, Davies-Bouldin and silhouette curves for k = 2 .. K-1.

    K-Means is fitted on a random row sample of ``X`` for every candidate
    cluster count, and one figure per metric is shown.

    Parameters
    ----------
    X : array supporting ``X.shape[0]`` and integer-array row indexing
        Feature matrix, one row per sample.
    K : int
        Exclusive upper bound on the number of clusters tried.
    sample_frac : float, default 0.01
        Fraction of rows to sample (1%, matching the previous hard-coded value).
    random_state : int, default 42
        Seed for both the row sampling and K-Means, so the curves are
        reproducible between runs.

    Returns
    -------
    None
        The function only produces plots.

    Notes
    -----
    If ``X`` has fewer than ``K`` rows, scikit-learn is expected to raise
    ``ValueError`` for the largest candidate k.
    """
    n_rows = X.shape[0]
    # The silhouette score needs strictly more samples than clusters, so never
    # sample fewer than K rows (the largest k tried is K - 1), capped at n_rows.
    sample_size = min(n_rows, max(int(sample_frac * n_rows), K))
    rng = np.random.default_rng(random_state)
    sample_indices = rng.choice(n_rows, sample_size, replace=False)
    X_sample = X[sample_indices]

    sse, db, slc = {}, {}, {}
    for k in range(2, K):
        kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=random_state, n_init=10).fit(X_sample)
        clusters = kmeans.labels_
        sse[k] = kmeans.inertia_  # Inertia: sum of squared distances to the closest cluster center
        db[k] = davies_bouldin_score(X_sample, clusters)
        slc[k] = silhouette_score(X_sample, clusters)

    # One figure per metric, in the same order and colors as before.
    for scores, ylabel, color in (
        (sse, "SSE", 'r'),
        (db, "Davies-Bouldin values", 'g'),
        (slc, "Silhouette score", 'b'),
    ):
        _plot_metric(scores, ylabel, color)


# Call the function to check clustering
check_clustering(pca_matrix, 16)

## Apply Clustering

In [None]:
def apply_clustering(X, n_clusters):
    """Fit K-Means with ``n_clusters`` clusters on ``X`` and return one label per row.

    Uses a fixed ``random_state`` of 42 and 10 initializations.
    """
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    model.fit(X)
    return model.labels_

# Cluster count chosen by inspecting the metric curves.
optimal_clusters = 9
cluster_labels = apply_clustering(pca_matrix, optimal_clusters)
df['cluster'] = cluster_labels

## Visualizing Clusters

In [None]:
# 1. Distribution of Data Points in Clusters
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='cluster', palette='tab20')
plt.title('Number of Data Points in Each Cluster')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.show()

In [None]:
# 2. PCA Plot (2D)
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_matrix[:, 0], y=pca_matrix[:, 1], hue=df['cluster'], palette='tab20', s=20)
plt.title('2D PCA Plot')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the first three principal components, colored by cluster label
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(1, 1, 1, projection='3d')

pc1, pc2, pc3 = (pca_matrix[:, i] for i in range(3))
scatter = ax.scatter(pc1, pc2, pc3, c=df['cluster'], cmap='tab20', s=20)

# Fix every axis to the same window of [-0.25, 0.25]
for set_limits in (ax.set_xlim, ax.set_ylim, ax.set_zlim):
    set_limits([-0.25, 0.25])

# Colorbar, title and axis labels
colorbar = plt.colorbar(scatter)
ax.set_title('3D PCA Plot')
ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
ax.set_zlabel('Third Principal Component')

plt.show()

## Save Results

In [None]:
# Write the DataFrame (including the new 'cluster' column) to CSV.
output_file = '../data/data_clustered.csv'
# Create the target directory first: to_csv fails if it does not exist, which
# would discard the results of the whole run at the very last step.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_file, index=False)