# Novel classification of papers based on content

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import sklearn as sk
import spacy
import tqdm

In [2]:
comp_data = pd.read_csv("/kaggle/input/comp-data/comp_data.csv")
stats_data = pd.read_csv("/kaggle/input/stats-data/stats_data.csv")

# Categorizing papers based on content - combining datasets

In [3]:
combined_data = pd.concat([comp_data, stats_data], axis=0)
combined_data.head()
combined_data.tail()

Unnamed: 0.1,Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count,year
12061,131585,abs-1501.03214v1,Quantifying Prosodic Variability in Middle Eng...,Applications (Statistics),stat.AP,2015-01-14,2015-01-14,['Roger Bilisoly'],'Roger Bilisoly',Interest in the mathematical structure of poet...,154,2015
12062,131630,abs-1709.00071v1,Weather impacts expressed sentiment,Applications (Statistics),stat.AP,2017-08-31,2017-08-31,"['Patrick Baylis', 'Nick Obradovich', 'Yury Kr...",'Patrick Baylis',We conduct the largest ever investigation into...,96,2017
12063,131731,abs-1904.06941v1,A framework for streamlined statistical predic...,Applications (Statistics),stat.AP,2019-04-15,2019-04-15,"['Vanessa Glenny', 'Jonathan Tuke', 'Nigel Bea...",'Vanessa Glenny',"In the Humanities and Social Sciences, there i...",136,2019
12064,133271,abs-2202.07081v2,Introducing the ICBe Dataset: Very High Recall...,Applications (Statistics),stat.AP,2022-02-14,2022-07-26,"['Rex W. Douglass', 'Thomas Leo Scherer', 'J. ...",'Rex W. Douglass',How do international crises unfold? We concept...,156,2022
12065,134320,abs-2210.11612v2,Searching for a higher power in the human eval...,Applications (Statistics),stat.AP,2022-10-20,2022-11-09,"['Johnny Tian-Zheng Wei', 'Tom Kocmi', 'Christ...",'Johnny Tian-Zheng Wei',"In MT evaluation, pairwise comparisons are con...",187,2022


In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
documents = combined_data[["id", "title", "summary"]]

In [6]:
# Use spaCy's pipe method for faster processing with multithreading
spacy.prefer_gpu()
from tqdm import tqdm  # Optional, for progress bar

def preprocess(doc):
    # Lemmatize and remove stop words
    return [token.lemma_ for token in doc if not token.is_stop]

# Batch and stream documents into the pipeline
processed_docs = []
for doc in tqdm(nlp.pipe(documents.summary, batch_size=64)):
    processed_docs.append(preprocess(doc))

31808it [09:22, 56.58it/s]


KeyboardInterrupt: 

In [None]:
processed_docs.head()

In [7]:
documents.to_csv("paper_doc_data.csv")