# Exploration notebook (EDA)

In [2]:
import numpy as np
import pandas as pd
import sklearn as sk
import statsmodels.api as sm
import plotly.express as px

In [3]:
# Load the paper metadata from the Parquet file (path is relative to the
# notebook's working directory).
papers = pd.read_parquet(path="arXiv.parquet")

In [4]:
# Structural overview of the frame: column names, non-null counts, dtypes
# and approximate memory usage.
papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136238 entries, 0 to 136237
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  136238 non-null  object
 1   title               136238 non-null  object
 2   category            136238 non-null  object
 3   category_code       136238 non-null  object
 4   published_date      136238 non-null  object
 5   updated_date        136238 non-null  object
 6   authors             136238 non-null  object
 7   first_author        136238 non-null  object
 8   summary             136238 non-null  object
 9   summary_word_count  136238 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 10.4+ MB


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns of the frame.
papers.describe()

Unnamed: 0,summary_word_count
count,136238.0
mean,162.38683
std,49.374474
min,1.0
25%,128.0
50%,160.0
75%,195.0
max,552.0


In [6]:
# Peek at the first five rows to sanity-check the load.
papers.head(n=5)

Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count
0,cs-9308101v1,Dynamic Backtracking,Artificial Intelligence,cs.AI,1993-08-01,1993-08-01,['M. L. Ginsberg'],'M. L. Ginsberg',Because of their occasional need to return to ...,79
1,cs-9308102v1,A Market-Oriented Programming Environment and ...,Artificial Intelligence,cs.AI,1993-08-01,1993-08-01,['M. P. Wellman'],'M. P. Wellman',Market price systems constitute a well-underst...,119
2,cs-9309101v1,An Empirical Analysis of Search in GSAT,Artificial Intelligence,cs.AI,1993-09-01,1993-09-01,"['I. P. Gent', 'T. Walsh']",'I. P. Gent',We describe an extensive study of search in GS...,167
3,cs-9311101v1,The Difficulties of Learning Logic Programs wi...,Artificial Intelligence,cs.AI,1993-11-01,1993-11-01,"['F. Bergadano', 'D. Gunetti', 'U. Trinchero']",'F. Bergadano',As real logic programmers normally use cut (!)...,174
4,cs-9311102v1,Software Agents: Completing Patterns and Const...,Artificial Intelligence,cs.AI,1993-11-01,1993-11-01,"['J. C. Schlimmer', 'L. A. Hermens']",'J. C. Schlimmer',To support the goal of allowing users to recor...,187


In [7]:
# Peek at the last five rows (the most recent entries in file order).
papers.tail(n=5)

Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count
136233,abs-2408.08541v1,Where is the signal in tokenization space?,Computation and Language (Natural Language Pro...,cs.CL,2024-08-16,2024-08-16,"['Renato Lui Geh', 'Honghua Zhang', 'Kareem Ah...",'Renato Lui Geh',Large Language Models (LLMs) are typically shi...,170
136234,abs-2408.08564v1,Collaborative Cross-modal Fusion with Large La...,Information Retrieval,cs.IR,2024-08-16,2024-08-16,"['Zhongzhou Liu', 'Hao Zhang', 'Kuicai Dong', ...",'Zhongzhou Liu',Despite the success of conventional collaborat...,157
136235,abs-2408.08624v1,RealMedQA: A pilot biomedical question answeri...,Computation and Language (Natural Language Pro...,cs.CL,2024-08-16,2024-08-16,"['Gregory Kell', 'Angus Roberts', 'Serge Umans...",'Gregory Kell',Clinical question answering systems have the p...,153
136236,abs-2408.08648v1,Understanding Enthymemes in Argument Maps: Bri...,Artificial Intelligence,cs.AI,2024-08-16,2024-08-16,"['Jonathan Ben-Naim', 'Victor David', 'Anthony...",'Jonathan Ben-Naim',Argument mining is natural language processing...,194
136237,abs-2408.08651v2,Reasoning Beyond Bias: A Study on Counterfactu...,Computation and Language (Natural Language Pro...,cs.CL,2024-08-16,2024-09-06,"['Kyle Moore', 'Jesse Roberts', 'Thao Pham', '...",'Kyle Moore',Language models are known to absorb biases fro...,156


In [8]:
# Parse the publication date, derive the publication year, and drop 2025.
# Columns are set through `assign` (not attribute-style assignment) so the
# cell builds a new frame instead of mutating the loaded one in place.
# NOTE(review): the tail() preview shows the newest paper is dated 2024-08-16,
# so 2024 also looks incomplete — confirm whether it should be excluded too.
papers = (
    papers
    .assign(published_date=lambda df: pd.to_datetime(df["published_date"]))
    .assign(year=lambda df: df["published_date"].dt.year)
    # Get rid of 2025 (incomplete yearly data)
    .query("year != 2025")
    # Detach the filtered rows from their parent so that later column
    # assignments on `papers` cannot trigger SettingWithCopyWarning.
    .copy()
)

In [9]:
# Number of distinct category codes, then distinct category names.
# dropna=False counts a missing value as its own entry, matching pd.unique.
for column in ("category_code", "category"):
    print(papers[column].nunique(dropna=False))

139
138


In [10]:
# Convert the category, category code and first-author columns to pandas
# categorical dtype in one step. Rebinding `papers` to the result of `astype`
# avoids assigning columns onto a frame that may be a filtered slice.
papers = papers.astype(
    {
        "category": "category",
        "category_code": "category",
        "first_author": "category",
    }
)

In [11]:
# Papers per year in the "Artificial Intelligence" category.
# The resulting count column keeps the name of the selected column, `authors`.
ai = (
    papers.loc[papers["category"] == "Artificial Intelligence"]
    .groupby("year")["authors"]
    .size()
    .reset_index()
)

In [12]:
# Bar chart of Artificial Intelligence papers per year.
# The `authors` column of `ai` holds the per-year paper count (group size),
# so it is relabelled on the axis and in hover text.
px.bar(
    ai,
    x="year",
    y="authors",
    color="year",
    title="Artificial Intelligence papers published by year",
    labels={"authors": "Number of papers"},
)

In [13]:
# Distinct category names in order of first appearance, kept as a Series so
# string filters can be applied to them later.
categories = pd.Series(papers["category"].unique())
# Render as a one-column table for nicer display.
pd.DataFrame(categories)

Unnamed: 0,0
0,Artificial Intelligence
1,Disordered Systems and Neural Networks
2,Computational Complexity
3,Logic in Computer Science
4,Multiagent Systems
...,...
133,Superconductivity
134,Pattern Formation and Solitons
135,Computation and Language (Legacy category)
136,Group Theory


In [14]:
# Categories whose name contains the substring "Statistics".
# The values are cast to str first because the Series is categorical.
stats_topics = categories.loc[
    categories.astype(str).str.contains("Statistics")
]
stats_topics

36     Data Analysis, Statistics and Probability
37                 Machine Learning (Statistics)
43                      Methodology (Statistics)
44                      Computation (Statistics)
49                              Other Statistics
51                     Applications (Statistics)
124                            Statistics Theory
dtype: category
Categories (138, object): ['Accelerator Physics', 'Adaptation and Self-Organizing Systems', 'Algebraic Geometry', 'Algebraic Topology', ..., 'Systems and Control', 'Systems and Control (Electrical Engineering a..., 'Tissues and Organs', 'Trading and Market Microstructure']

In [23]:
# Count papers per (year, statistics category).
# `category` is categorical, so observed=True keeps only the (year, category)
# pairs that actually occur. The default would emit a zero-count row for every
# category in the dtype, which would then have to be filtered out again.
# The count column is still named `authors`, after the selected column.
stats_papers = (
    papers.loc[papers["category"].isin(stats_topics)]
    .groupby(["year", "category"], observed=True)["authors"]
    .size()
    .reset_index()
)





In [25]:
# Display the per-year, per-category paper counts held in `stats_papers`.
stats_papers

Unnamed: 0,year,category,authors
130,2005,Statistics Theory,1
143,2007,Applications (Statistics),1
173,2007,"Data Analysis, Statistics and Probability",1
218,2007,Machine Learning (Statistics),21
225,2007,Methodology (Statistics),4
...,...,...,...
2506,2024,Computation (Statistics),14
2519,2024,"Data Analysis, Statistics and Probability",1
2564,2024,Machine Learning (Statistics),645
2571,2024,Methodology (Statistics),87


In [34]:
# Stacked bar chart: one bar per year, coloured by statistics subdomain.
fig = px.bar(
    stats_papers,
    x="year",
    y="authors",
    color="category",
    title="Statistics papers published by year and subdomain",
)
# The `authors` column holds paper counts, so give the axis a truthful title.
# update_layout returns the figure, so this last expression also renders it.
fig.update_layout(yaxis_title="Number of papers")