# Notebook for statistical research analysis

In [41]:
import numpy as np
import pandas as pd
import sklearn as sk
import statsmodels.api as sm
import plotly.express as px

## To-do list for this 
### Question 1: What statistical fields have seen the largest increase in publications?
- Rate of change
- Look at different time frames
### Question 2: How have the most published statistical fields changed over time?
- Ordering by count, top 3-5 (variable based on observation)
- Try to find external sources and link to developments
### Question 3: What statistical fields are projected to grow the most in the coming years?
- Little bit of modeling
- Also look at recent rates of change and trend analysis

In [42]:
stats_data = pd.read_csv('stats_data.csv')

## Question 1

In [43]:
cat_year_num = stats_data.groupby(['category', 'year']).count()['Unnamed: 0'].reset_index()
cat_year_num.rename(columns={'Unnamed: 0': 'count'}, inplace=True)
print(pd.unique(cat_year_num.category))

['Applications (Statistics)' 'Computation (Statistics)'
 'Data Analysis, Statistics and Probability'
 'Machine Learning (Statistics)' 'Methodology (Statistics)'
 'Other Statistics' 'Statistics Theory']


In [44]:
#Saving colors for each topic for later
default = px.colors.qualitative.Plotly

color_dict = {'Applications (Statistics)': default[0], 'Computation (Statistics)':default[1], 'Data Analysis, Statistics and Probability':default[2],'Machine Learning (Statistics)':default[3], 'Methodology (Statistics)':default[4], 'Other Statistics':default[5], 'Statistics Theory':default[6]}

### Bar plots for individual statistical topics

In [45]:
top = 'Applications (Statistics)'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [46]:
top = 'Computation (Statistics)'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [47]:
top = 'Data Analysis, Statistics and Probability'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [48]:
top = 'Machine Learning (Statistics)'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [49]:
top = 'Methodology (Statistics)'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [50]:
top = 'Other Statistics'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [51]:
top = 'Statistics Theory'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

### Calculating percent increases per subtopic and identifying the max output and its associated year

In [52]:
topics = pd.unique(cat_year_num.category)

In [53]:
topics

array(['Applications (Statistics)', 'Computation (Statistics)',
       'Data Analysis, Statistics and Probability',
       'Machine Learning (Statistics)', 'Methodology (Statistics)',
       'Other Statistics', 'Statistics Theory'], dtype=object)

In [54]:
cat_year_num.query('category == @topics[0]')['count'].pct_change()

0          NaN
1     1.000000
2     0.000000
3    -0.500000
4     1.000000
5     4.000000
6    -0.700000
7     1.000000
8     1.000000
9     1.916667
10   -0.314286
11    0.291667
12    0.225806
13    0.289474
14   -0.571429
15   -0.238095
16   -0.312500
17    0.272727
Name: count, dtype: float64

In [55]:
pct_change = cat_year_num.groupby(['category'])['count'].pct_change() * 100
cat_year_num['pct'] = pd.Series(pct_change)
cat_year_num

Unnamed: 0,category,year,count,pct
0,Applications (Statistics),2007,1,
1,Applications (Statistics),2008,2,100.000000
2,Applications (Statistics),2009,2,0.000000
3,Applications (Statistics),2010,1,-50.000000
4,Applications (Statistics),2011,2,100.000000
...,...,...,...,...
107,Statistics Theory,2020,39,-11.363636
108,Statistics Theory,2021,10,-74.358974
109,Statistics Theory,2022,10,0.000000
110,Statistics Theory,2023,2,-80.000000


In [56]:
for x in topics:
    fig = px.line(cat_year_num.query("category == @x"), x = "year", y = "pct", color_discrete_sequence=[color_dict[x]], title="Percentage change in publication count: " + x)
    fig.show()

In [57]:
px.line(cat_year_num, x = "year", y = "pct", color='category', facet_col='category',title="Percentage change in publication count: All topics")

In [58]:
avg_pct_growth = cat_year_num.groupby(['category'])['pct'].mean().reset_index()
px.bar(avg_pct_growth, x = 'category', y = 'pct', color='category')

In [59]:
count_groups = cat_year_num.groupby('category').agg(first=('count', 'first'), last=('count', 'last'))
count_groups['ratio'] = count_groups['last'] / count_groups['first']
count_groups = count_groups.reset_index()
count_groups

Unnamed: 0,category,first,last,ratio
0,Applications (Statistics),1,14,14.0
1,Computation (Statistics),3,14,4.666667
2,"Data Analysis, Statistics and Probability",1,1,1.0
3,Machine Learning (Statistics),21,645,30.714286
4,Methodology (Statistics),4,87,21.75
5,Other Statistics,1,1,1.0
6,Statistics Theory,1,1,1.0


In [60]:
#Comparing the 2024 paper counts vs the first paper publication year's counts

px.bar(count_groups, x = 'category', y = 'ratio', color='category')

## Question 2

How have the top fields changed? Look at the top categories per year in paper output

In [61]:
top_4 = cat_year_num.groupby('year').apply(lambda x: x.nlargest(4, 'count')).reset_index(drop=True)
top_4





Unnamed: 0,category,year,count,pct
0,Statistics Theory,2005,1,
1,Machine Learning (Statistics),2007,21,
2,Statistics Theory,2007,8,700.000000
3,Methodology (Statistics),2007,4,
4,Applications (Statistics),2007,1,
...,...,...,...,...
68,Computation (Statistics),2023,5,-54.545455
69,Machine Learning (Statistics),2024,645,25.486381
70,Methodology (Statistics),2024,87,47.457627
71,Applications (Statistics),2024,14,27.272727


In [62]:
px.bar(top_4, x = 'year', y = 'count', color='category', title="Top 4 categories by paper output: All years")

For scaling reasons, I will break up the time span into chunks

In [63]:
px.bar(top_4.query('year <= 2011'), x = 'year', y = 'count', color='category', title="Top 4 categories by paper output: Up until 2011")

#### Year  - topics in order (G to L)

- 2005 : Statistics Theory
- 2006 : NA
- 2007 : Machine Learning, Statistics Theory, Methodology, Applications
- 2008 : Machine Learning, (TIE) Statistics Theory and Methodology, Computation
- 2009 : Machine Learning, Statistics Theory, Methodology, Computation
- 2010 : Machine Learning, Methodology, TIE (Statistics Theory and Computation)
- 2011 : Machine Learning, Methodology, Computation, Applications

In [64]:
px.bar(top_4.query('year >= 2011 & year <= 2015'), x = 'year', y = 'count', color='category', title="Top 4 categories by paper output: 2011 - 2015")

- 2012 : Machine Learning, Methodology, Statistics Theory, Computation
- 2013 : Machine Learning, Methodology, Computation, Applications
- 2014 : Machine Learning, Methodology, Computation, Statistics Theory
- 2015 : Machine Learning, TIE (Methodology and Computation), Applications 

In [65]:
px.bar(top_4.query('year >= 2015 & year <= 2020'), x = 'year', y = 'count', color='category', title="Top 4 categories by paper output: 2015 - 2020")

- 2016 : Machine Learning, Methodology, Applications, Computation
- 2017 : Machine Learning, Methodology, Aplications, Statistics Theory
- 2018 : Machine Learning, Methodology, Computation, Applications
- 2019 : Machine Learning, Methodology, Statistics Theory, Applications
- 2020 : Machine Learning, Methodology, Applications, Statistics Theory

In [66]:
px.bar(top_4.query('year >= 2020'), x = 'year', y = 'count', color='category', title="Top 4 categories by paper output: 2020 - 2024")

- 2021 : Machine Learning, Methodology, Applications, Statistics Theory
- 2022 : Machine Learning, Methodology, Applications, Computation
- 2023 : Machine Learning, Methodology, Applications, Computation
- 2024 : Machine Learning, Methodology, TIE (Applications and Computation)

### One definitive top topics list

- 2005 : Statistics Theory
- 2006 : NA
- 2007 : Machine Learning, Statistics Theory, Methodology, Applications
- 2008 : Machine Learning, (TIE) Statistics Theory and Methodology, Computation
- 2009 : Machine Learning, Statistics Theory, Methodology, Computation
- 2010 : Machine Learning, Methodology, TIE (Statistics Theory and Computation)
- 2011 : Machine Learning, Methodology, Computation, Applications
- 2012 : Machine Learning, Methodology, Statistics Theory, Computation
- 2013 : Machine Learning, Methodology, Computation, Applications
- 2014 : Machine Learning, Methodology, Computation, Statistics Theory
- 2015 : Machine Learning, TIE (Methodology and Computation), Applications 
- 2016 : Machine Learning, Methodology, Applications, Computation
- 2017 : Machine Learning, Methodology, Aplications, Statistics Theory
- 2018 : Machine Learning, Methodology, Computation, Applications
- 2019 : Machine Learning, Methodology, Statistics Theory, Applications
- 2020 : Machine Learning, Methodology, Applications, Statistics Theory
- 2021 : Machine Learning, Methodology, Applications, Statistics Theory
- 2022 : Machine Learning, Methodology, Applications, Computation
- 2023 : Machine Learning, Methodology, Applications, Computation
- 2024 : Machine Learning, Methodology, TIE (Applications and Computation)

With starting analysis from 2007 (due to the NA for 2006), Machine Learning has been KING with Methodology second and Applications a clear third

## Question 3: What statistical fields are projected to grow the most in the coming years?

In [67]:
# Using 2019 as a reference

recent = cat_year_num.query("year >= 2019")
recent_growth = recent.groupby(['category'])['pct'].mean().reset_index()
px.bar(recent_growth, x = 'category', y = 'pct', color = 'category')

We can see recent increases in Methodology and Computation (meaning our assertion of statistical growth being linked to the growth of computation may have some truth)

### Time Series Prediction

Our initial stats data we used here was yearly, but for predicting future rates of change in these fields, we want to use all the monthly data from the original dataset.

In [68]:
original = pd.read_parquet("arXiv.parquet")

In [69]:
original

Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count
0,cs-9308101v1,Dynamic Backtracking,Artificial Intelligence,cs.AI,1993-08-01,1993-08-01,['M. L. Ginsberg'],'M. L. Ginsberg',Because of their occasional need to return to ...,79
1,cs-9308102v1,A Market-Oriented Programming Environment and ...,Artificial Intelligence,cs.AI,1993-08-01,1993-08-01,['M. P. Wellman'],'M. P. Wellman',Market price systems constitute a well-underst...,119
2,cs-9309101v1,An Empirical Analysis of Search in GSAT,Artificial Intelligence,cs.AI,1993-09-01,1993-09-01,"['I. P. Gent', 'T. Walsh']",'I. P. Gent',We describe an extensive study of search in GS...,167
3,cs-9311101v1,The Difficulties of Learning Logic Programs wi...,Artificial Intelligence,cs.AI,1993-11-01,1993-11-01,"['F. Bergadano', 'D. Gunetti', 'U. Trinchero']",'F. Bergadano',As real logic programmers normally use cut (!)...,174
4,cs-9311102v1,Software Agents: Completing Patterns and Const...,Artificial Intelligence,cs.AI,1993-11-01,1993-11-01,"['J. C. Schlimmer', 'L. A. Hermens']",'J. C. Schlimmer',To support the goal of allowing users to recor...,187
...,...,...,...,...,...,...,...,...,...,...
136233,abs-2408.08541v1,Where is the signal in tokenization space?,Computation and Language (Natural Language Pro...,cs.CL,2024-08-16,2024-08-16,"['Renato Lui Geh', 'Honghua Zhang', 'Kareem Ah...",'Renato Lui Geh',Large Language Models (LLMs) are typically shi...,170
136234,abs-2408.08564v1,Collaborative Cross-modal Fusion with Large La...,Information Retrieval,cs.IR,2024-08-16,2024-08-16,"['Zhongzhou Liu', 'Hao Zhang', 'Kuicai Dong', ...",'Zhongzhou Liu',Despite the success of conventional collaborat...,157
136235,abs-2408.08624v1,RealMedQA: A pilot biomedical question answeri...,Computation and Language (Natural Language Pro...,cs.CL,2024-08-16,2024-08-16,"['Gregory Kell', 'Angus Roberts', 'Serge Umans...",'Gregory Kell',Clinical question answering systems have the p...,153
136236,abs-2408.08648v1,Understanding Enthymemes in Argument Maps: Bri...,Artificial Intelligence,cs.AI,2024-08-16,2024-08-16,"['Jonathan Ben-Naim', 'Victor David', 'Anthony...",'Jonathan Ben-Naim',Argument mining is natural language processing...,194


In [None]:
original.published_date = pd.to_datetime(original.published_date)
original['year'] = original.published_date.dt.year
original['month'] = original.published_date.dt.month

original = original.query("year != 2025") #Want full year data (2025 incomplete)

In [81]:
categories = pd.Series(pd.unique(original.category))
stats_topics = pd.Series(categories[categories.astype(str).str.contains("Statistics")])
stats_date = original.query('category.isin(@stats_topics)').groupby(['published_date', 'category']).size().reset_index(name='count').query("count > 0")

In [82]:
stats_date

Unnamed: 0,published_date,category,count
0,2005-09-29,Statistics Theory,1
1,2007-01-07,Statistics Theory,1
2,2007-04-09,Statistics Theory,1
3,2007-04-13,Applications (Statistics),1
4,2007-05-02,Statistics Theory,1
...,...,...,...
5215,2024-12-29,Computation (Statistics),1
5216,2024-12-29,Machine Learning (Statistics),1
5217,2024-12-29,Methodology (Statistics),1
5218,2024-12-30,Machine Learning (Statistics),2
