In [1]:
import os

import torch
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans

from preprocess import get_csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model; its encode() method is what turns the abstracts
# into vectors later in the notebook.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [3]:
# Location of the arXiv metadata snapshot in CSV form
file_path = 'arxiv-metadata-oai-snapshot.csv'

# Only call get_csv() when the CSV is not on disk yet
if os.path.exists(file_path):
    print("The file already exists.")
else:
    print("Creating CSV from JSON file.")
    get_csv()

# Load the snapshot into a DataFrame
df = pd.read_csv(file_path)

# Preview the leading rows as a quick sanity check of the load
print(df.head())

The file already exists.
                                               title       categories  \
0  Calculation of prompt diphoton production cros...           hep-ph   
1           Sparsity-certifying Graph Decompositions    math.CO cs.CG   
2  The evolution of the Earth-Moon system based o...   physics.gen-ph   
3  A determinant of Stirling cycle numbers counts...          math.CO   
4  From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...  math.CA math.FA   

                                            abstract    versions  
0  A fully differential calculation in perturbati...  2007-07-24  
1  We describe a new algorithm, the $(k,\ell)$-pe...  2008-12-13  
2  The evolution of Earth-Moon system is describe...  2008-01-13  
3  We show that a determinant of Stirling cycle n...  2007-03-31  
4  In this paper we show how to compute the $\Lam...  2007-04-02  


In [4]:
corpus = df["abstract"]
title = df["title"]

# On-disk cache for the abstract embeddings, so the expensive encode step
# only has to run once.
embeddings_path = 'embeddings.npy'

if not os.path.exists(embeddings_path):
    # No cache yet: embed every abstract and store the resulting matrix.
    print("Computing embeddings for the abstracts.")
    corpus_embeddings = model.encode(corpus, show_progress_bar=True)
    np.save(embeddings_path, corpus_embeddings)
else:
    print("The file already exists.")
    # A numeric array does not need pickle support; leaving allow_pickle at
    # its default (False) avoids unpickling arbitrary objects from disk.
    corpus_embeddings = np.load(embeddings_path)

print(corpus_embeddings.shape)

The file already exists.
(2370685, 384)


In [5]:
def fit_kmeans(embeddings, n_clusters, random_state=0):
    """Fit k-means on ``embeddings`` and return the fitted estimator.

    Parameters
    ----------
    embeddings : array-like
        One row per sample; passed unchanged to ``KMeans.fit``.
    n_clusters : int
        Number of clusters to form.
    random_state : int, default 0
        Seed for the centroid initialisation. Fixing it keeps the numeric
        cluster ids stable between runs, which matters because later cells
        refer to clusters by id.

    Returns
    -------
    KMeans
        The fitted estimator; per-sample labels are in ``labels_``.
    """
    estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
    estimator.fit(embeddings)
    return estimator


num_clusters = 11
clustering_model = fit_kmeans(corpus_embeddings, num_clusters)
cluster_assignment = clustering_model.labels_

# clustered_sentences = [[] for i in range(num_clusters)]
# for sentence_id, cluster_id in enumerate(cluster_assignment):
#     clustered_sentences[cluster_id].append(title[sentence_id])

# # for i, cluster in enumerate(clustered_sentences):
# #     print("Cluster ", i + 1)
# #     print(cluster)
# #     print("")



In [6]:
print(cluster_assignment)
# Name the file after the cluster count actually used, so a run with a
# different num_clusters does not write its labels into "clusters_11.npy".
np.save("./clusters_{}.npy".format(num_clusters), cluster_assignment, allow_pickle=True)

[3 4 1 ... 8 8 8]


In [10]:
# Show the category string attached to every paper
df["categories"]

0                              hep-ph
1                       math.CO cs.CG
2                      physics.gen-ph
3                             math.CO
4                     math.CA math.FA
                      ...            
2370680    supr-con cond-mat.supr-con
2370681    supr-con cond-mat.supr-con
2370682    supr-con cond-mat.supr-con
2370683    supr-con cond-mat.supr-con
2370684    supr-con cond-mat.supr-con
Name: categories, Length: 2370685, dtype: object

In [11]:
# Attach the k-means label of each row as a column named after the cluster count
cluster_column = "clustering-{}".format(num_clusters)
df[cluster_column] = cluster_assignment
df

Unnamed: 0,title,categories,abstract,versions,clustering-11
0,Calculation of prompt diphoton production cros...,hep-ph,A fully differential calculation in perturbati...,2007-07-24,3
1,Sparsity-certifying Graph Decompositions,math.CO cs.CG,"We describe a new algorithm, the $(k,\ell)$-pe...",2008-12-13,4
2,The evolution of the Earth-Moon system based o...,physics.gen-ph,The evolution of Earth-Moon system is describe...,2008-01-13,1
3,A determinant of Stirling cycle numbers counts...,math.CO,We show that a determinant of Stirling cycle n...,2007-03-31,4
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,math.CA math.FA,In this paper we show how to compute the $\Lam...,2007-04-02,9
...,...,...,...,...,...
2370680,On the origin of the irreversibility line in t...,supr-con cond-mat.supr-con,We report on measurements of the angular depen...,1996-08-26,8
2370681,Nonlinear Response of HTSC Thin Film Microwave...,supr-con cond-mat.supr-con,The non-linear microwave surface impedance of ...,1996-08-31,8
2370682,Critical State Flux Penetration and Linear Mic...,supr-con cond-mat.supr-con,The vortex contribution to the dc field (H) de...,1996-09-03,8
2370683,Density of States and NMR Relaxation Rate in A...,supr-con cond-mat.supr-con,We show that the density of states in an aniso...,1996-09-18,8


In [7]:
# Persist the frame (without the index) under a name that records the cluster count
clustered_csv_path = "arxiv-metadata-oai-snapshot-clustering-{}.csv".format(num_clusters)
df.to_csv(clustered_csv_path, index=False)

In [None]:
# Reload the clustered snapshot written for the current cluster count
clustered_csv_path = "arxiv-metadata-oai-snapshot-clustering-{}.csv".format(num_clusters)
df = pd.read_csv(clustered_csv_path)
df

In [38]:
# Year of each paper, taken from the leading four characters of the date
# string in `versions`. Vectorised string slicing replaces a per-row Python
# loop and yields an integer dtype instead of an object-dtype series.
x = df['versions'].str[:4].astype(int)

# Stored as a plain integer array, so no pickling is involved in the file.
np.save("./years.npy", x.to_numpy())

In [None]:
# Categories of the papers whose cluster label is 0
df.loc[df["clustering-11"] == 0, "categories"]

In [19]:
# Count how often each category string occurs among the papers in cluster 0
in_cluster_zero = df["clustering-11"] == 0
vote = df.loc[in_cluster_zero, "categories"].value_counts()

In [22]:
# Five most frequent category strings (value_counts is already sorted descending)
print(vote.iloc[:5])

math.AG    18775
math.NT    11293
math.DG     7994
math.GT     7246
hep-th      6556
Name: categories, dtype: int64


In [36]:
# For every cluster id, print the three most common category strings
cluster_labels = df["clustering-11"]
for i in range(11):
    members = cluster_labels == i
    vote = df.loc[members, "categories"].value_counts()
    print(f'cluster {i}')
    print(f'{vote.head(3)}\n')

cluster 0
math.AG    18775
math.NT    11293
math.DG     7994
Name: categories, dtype: int64

cluster 1
astro-ph       78866
astro-ph.GA    28459
astro-ph.SR    25249
Name: categories, dtype: int64

cluster 2
hep-th          45381
gr-qc           25082
hep-th gr-qc    10321
Name: categories, dtype: int64

cluster 3
hep-ph     70922
hep-ex     16738
nucl-th    15039
Name: categories, dtype: int64

cluster 4
math.CO    23971
math.NT     6353
cs.DS       6279
Name: categories, dtype: int64

cluster 5
cs.CV    47118
cs.CL    19593
cs.LG    11222
Name: categories, dtype: int64

cluster 6
quant-ph             58131
physics.optics        6951
cond-mat.mes-hall     4725
Name: categories, dtype: int64

cluster 7
cs.IT math.IT    15440
math.OC           9769
stat.ME           9045
Name: categories, dtype: int64

cluster 8
cond-mat.mtrl-sci    32285
cond-mat.mes-hall    28091
cond-mat.str-el      23374
Name: categories, dtype: int64

cluster 9
math.AP    28068
math.PR    11765
math.FA     9351
Nam