# Notebook for computational research analysis

In [115]:
import numpy as np
import pandas as pd
import sklearn as sk
import statsmodels.api as sm
import plotly.express as px
import statsmodels.tsa.api
from statsmodels.tsa.api import VAR

In [116]:
# Read the dataset that every later cell works from.
data_path = 'comp_data.csv'
comp_data = pd.read_csv(data_path)

In [117]:
# Fold the legacy label into the current category name, then list the distinct
# categories that remain.
legacy_label = 'Computation and Language (Legacy category)'
current_label = 'Computation and Language (Natural Language Processing)'
comp_data['category'] = comp_data['category'].replace(legacy_label, current_label)
print(pd.unique(comp_data['category']))

['Artificial Intelligence'
 'Computation and Language (Natural Language Processing)'
 'Distributed, Parallel, and Cluster Computing'
 'Neural and Evolutionary Computing' 'Computer Science and Game Theory'
 'Computer Vision and Pattern Recognition' 'Computational Physics']


## Question 1 

In [118]:
# Number of rows (papers) per (category, year) pair.
# size() counts rows directly, so this no longer depends on the 'Unnamed: 0'
# column (presumably the CSV's saved index) being present in comp_data.
# groupby sorts its keys by default, so the rows come out ordered by category
# and then by year.
cat_year_num = comp_data.groupby(['category', 'year']).size().reset_index(name="count")
print(pd.unique(cat_year_num.category))

['Artificial Intelligence'
 'Computation and Language (Natural Language Processing)'
 'Computational Physics' 'Computer Science and Game Theory'
 'Computer Vision and Pattern Recognition'
 'Distributed, Parallel, and Cluster Computing'
 'Neural and Evolutionary Computing']


In [119]:
default = px.colors.qualitative.Plotly

# (category, index into the palette) pairs; the order is kept so that the
# resulting dict has the same insertion order as before.
_palette_slots = [
    ('Computation and Language (Natural Language Processing)', 1),
    ('Computational Physics', 2),
    ('Computer Science and Game Theory', 3),
    ('Computer Vision and Pattern Recognition', 4),
    ('Distributed, Parallel, and Cluster Computing', 5),
    ('Neural and Evolutionary Computing', 6),
    ('Artificial Intelligence', 0),
]

# Fixed category -> colour mapping used by the single-category charts.
color_dict = {category: default[slot] for category, slot in _palette_slots}

### Bar plots for individual computational topics

In [120]:
top = 'Computation and Language (Natural Language Processing)'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [121]:
top = 'Computational Physics'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [122]:
top = 'Computer Science and Game Theory'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [123]:
top = 'Computer Vision and Pattern Recognition'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [124]:
top = 'Distributed, Parallel, and Cluster Computing'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [125]:
top = 'Neural and Evolutionary Computing'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

In [126]:
top = 'Artificial Intelligence'
px.bar(cat_year_num.query("category == @top"), x = 'year', y = 'count', title= top + " Papers by Year", color_discrete_sequence=[color_dict[top]])

### Calculating percent increases per subtopic and identifying the max output and its associated year

In [127]:
# Distinct category names, in the order they first appear in cat_year_num.
topics = pd.unique(cat_year_num['category'])

In [128]:
# Percentage change in paper count from one row to the next within each category.
# Rows of a category are in year order because cat_year_num comes from a groupby
# on ['category', 'year'], which sorts its keys. The first row of every category
# has no predecessor, so its value is NaN.
# NOTE(review): if a category has no papers in some year, the change is measured
# against the previous recorded year rather than the previous calendar year.
# (The earlier stand-alone pct_change() on topics[0] discarded its result and has
# been removed; the groupby result is already a Series aligned on the row index.)
pct_change = cat_year_num.groupby('category')['count'].pct_change() * 100
cat_year_num['pct'] = pct_change
cat_year_num

Unnamed: 0,category,year,count,pct
0,Artificial Intelligence,1993,6,
1,Artificial Intelligence,1994,14,133.333333
2,Artificial Intelligence,1995,27,92.857143
3,Artificial Intelligence,1996,28,3.703704
4,Artificial Intelligence,1997,19,-32.142857
...,...,...,...,...
162,Neural and Evolutionary Computing,2020,557,33.253589
163,Neural and Evolutionary Computing,2021,438,-21.364452
164,Neural and Evolutionary Computing,2022,693,58.219178
165,Neural and Evolutionary Computing,2023,599,-13.564214


In [129]:
# One line chart of 'pct' by year per category, drawn in that category's
# colour from color_dict.
for topic in topics:
    topic_rows = cat_year_num[cat_year_num['category'] == topic]
    fig = px.line(
        topic_rows,
        x="year",
        y="pct",
        color_discrete_sequence=[color_dict[topic]],
        title="Percentage change in publication count: " + topic,
    )
    fig.show()

In [130]:
# Same series for every category at once: one facet column and one colour per category.
px.line(
    cat_year_num,
    x="year",
    y="pct",
    color='category',
    facet_col='category',
    title="Percentage change in publication count: All topics",
)

In [131]:
# Mean of the 'pct' values of each category; mean() skips the NaN in each
# category's first row.
avg_pct_growth = cat_year_num.groupby('category')['pct'].mean().reset_index()

# Titled like the other figures in this notebook so the chart can be read on its own.
px.bar(
    avg_pct_growth,
    x='category',
    y='pct',
    color='category',
    title="Average percentage change in publication count by category",
)

In [132]:
# First and last 'count' row of each category and the ratio between them.
# 'first' and 'last' follow row order, which is year order within a category
# because cat_year_num comes from a sorted groupby on ['category', 'year'].
count_groups = (
    cat_year_num
    .groupby('category')
    .agg(first=('count', 'first'), last=('count', 'last'))
    .assign(ratio=lambda frame: frame['last'] / frame['first'])
    .reset_index()
)
count_groups

Unnamed: 0,category,first,last,ratio
0,Artificial Intelligence,6,1573,262.166667
1,Computation and Language (Natural Language Pro...,161,3467,21.534161
2,Computational Physics,1,3,3.0
3,Computer Science and Game Theory,4,9,2.25
4,Computer Vision and Pattern Recognition,2,498,249.0
5,"Distributed, Parallel, and Cluster Computing",1,13,13.0
6,Neural and Evolutionary Computing,10,708,70.8


In [133]:
# Compare each category's last recorded yearly paper count with the count from
# its first recorded year.
# NOTE(review): the original note calls the last value "the 2024 paper counts";
# that only holds for categories that actually have rows for 2024 - confirm.
px.bar(
    count_groups,
    x='category',
    y='ratio',
    color='category',
    title="Ratio of last to first recorded yearly paper count by category",
)

## Question 2

How have the top fields changed? Look at the top categories per year in paper output

In [134]:
# For every year keep the (up to) four categories with the most papers, ordered
# by year and then by descending count.
# Sorting followed by GroupBy.head replaces groupby('year').apply(nlargest):
# calling apply on a frame that still contains the grouping column is deprecated
# in pandas 2.2, and once grouping columns are excluded the 'year' column would
# be lost after reset_index(drop=True).
# NOTE(review): the order among equal counts within a year is assumed to match
# nlargest's first-occurrence rule; confirm if ties matter.
top_4 = (
    cat_year_num
    .sort_values(['year', 'count'], ascending=[True, False])
    .groupby('year')
    .head(4)
    .reset_index(drop=True)
)
top_4





Unnamed: 0,category,year,count,pct
0,Artificial Intelligence,1993,6,
1,Computation and Language (Natural Language Pro...,1994,161,
2,Artificial Intelligence,1994,14,133.333333
3,Computation and Language (Natural Language Pro...,1995,224,39.130435
4,Artificial Intelligence,1995,27,92.857143
...,...,...,...,...
111,Neural and Evolutionary Computing,2023,599,-13.564214
112,Computation and Language (Natural Language Pro...,2024,3467,-19.689599
113,Artificial Intelligence,2024,1573,11.639461
114,Neural and Evolutionary Computing,2024,708,18.196995


In [135]:
# Yearly counts of the top_4 rows over the whole timeline, coloured by category.
px.bar(
    top_4,
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: All years",
)

Split the timeline into chunks so that each chart's y-axis is scaled to the counts of that period.

In [136]:
# Rows of top_4 from 2017 onwards.
top_4[top_4['year'] >= 2017]

Unnamed: 0,category,year,count,pct
84,Computer Vision and Pattern Recognition,2017,3287,56.897375
85,Computation and Language (Natural Language Pro...,2017,1565,80.507497
86,Artificial Intelligence,2017,850,43.097643
87,Neural and Evolutionary Computing,2017,268,-0.371747
88,Computer Vision and Pattern Recognition,2018,3549,7.970794
89,Computation and Language (Natural Language Pro...,2018,1713,9.456869
90,Artificial Intelligence,2018,838,-1.411765
91,Neural and Evolutionary Computing,2018,369,37.686567
92,Computer Vision and Pattern Recognition,2019,3425,-3.493942
93,Computation and Language (Natural Language Pro...,2019,2448,42.90718


In [137]:
# Same chart restricted to the years up to and including 2011.
until_2011 = top_4[top_4['year'] <= 2011]
px.bar(
    until_2011,
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: Up until 2011",
)

### Starting at 1998 for category diversity
- 1998: NLP, AI, Neural and Evolutionary, Computer Vision
- 1999: NLP, AI, Neural and Evolutionary, Computer Vision
- 2000: NLP, AI, Computer Vision, Neural and Evolutionary
- 2001: NLP, AI, Neural and Evolutionary
- 2002: AI, NLP, Neural and Evolutionary, Game Theory, Computer Vision
- 2003: AI, NLP, TIE Neural and Evolutionary and Computer Vision, Game Theory
- 2004: AI, Neural and Evolutionary, NLP, Computer Vision
- 2005: AI, Neural and Evolutionary, Computer Vision, NLP
- 2006: AI, Neural and Evolutionary, Computer Vision, NLP
- 2007: AI, Neural and Evolutionary, NLP, Computer Vision, Game Theory
- 2008: AI, Neural and Evolutionary, Computer Vision, NLP, Game Theory
- 2009: AI, Neural and Evolutionary, Computer Vision, NLP, DP&C Computing
- 2010: Computer Vision, AI, Neural and Evolutionary, NLP, Game Theory
- 2011: AI, Computer Vision, Neural and Evolutionary, TIE NLP and Game Theory

In [138]:
# Same chart restricted to 2011 through 2015 (both ends included).
from_2011_to_2015 = top_4[(top_4['year'] >= 2011) & (top_4['year'] <= 2015)]
px.bar(
    from_2011_to_2015,
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: 2011 - 2015",
)

- 2012: AI, Computer Vision, Neural and Evolutionary, Game Theory, NLP
- 2013: AI, Computer Vision, Neural and Evolutionary, NLP, Game Theory
- 2014: Computer Vision, AI, NLP, Neural and Evolutionary, Game Theory
- 2015: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing

In [139]:
# Same chart restricted to 2015 through 2020 (both ends included).
from_2015_to_2020 = top_4[(top_4['year'] >= 2015) & (top_4['year'] <= 2020)]
px.bar(
    from_2015_to_2020,
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: 2015 - 2020",
)

- 2016: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2017: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2018: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2019: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2020: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing

In [140]:
# Same chart restricted to 2020 and later.
since_2020 = top_4[top_4['year'] >= 2020]
px.bar(
    since_2020,
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: 2020 - 2024",
)

- 2021: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2022: NLP, Computer Vision, AI, Neural and Evolutionary, DP&C Computing
- 2023: NLP, Computer Vision, AI, Neural and Evolutionary, DP&C Computing
- 2024: NLP, AI, Neural and Evolutionary, Computer Vision, DP&C Computing

### One definitive top topics list

- 1998: NLP, AI, Neural and Evolutionary, Computer Vision
- 1999: NLP, AI, Neural and Evolutionary, Computer Vision
- 2000: NLP, AI, Computer Vision, Neural and Evolutionary
- 2001: NLP, AI, Neural and Evolutionary
- 2002: AI, NLP, Neural and Evolutionary, Game Theory, Computer Vision
- 2003: AI, NLP, TIE Neural and Evolutionary and Computer Vision, Game Theory
- 2004: AI, Neural and Evolutionary, NLP, Computer Vision
- 2005: AI, Neural and Evolutionary, Computer Vision, NLP
- 2006: AI, Neural and Evolutionary, Computer Vision, NLP
- 2007: AI, Neural and Evolutionary, NLP, Computer Vision, Game Theory
- 2008: AI, Neural and Evolutionary, Computer Vision, NLP, Game Theory
- 2009: AI, Neural and Evolutionary, Computer Vision, NLP, DP&C Computing
- 2010: Computer Vision, AI, Neural and Evolutionary, NLP, Game Theory
- 2011: AI, Computer Vision, Neural and Evolutionary, TIE NLP and Game Theory
- 2012: AI, Computer Vision, Neural and Evolutionary, Game Theory, NLP
- 2013: AI, Computer Vision, Neural and Evolutionary, NLP, Game Theory
- 2014: Computer Vision, AI, NLP, Neural and Evolutionary, Game Theory
- 2015: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2016: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2017: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2018: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2019: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2020: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2021: Computer Vision, NLP, AI, Neural and Evolutionary, DP&C Computing
- 2022: NLP, Computer Vision, AI, Neural and Evolutionary, DP&C Computing
- 2023: NLP, Computer Vision, AI, Neural and Evolutionary, DP&C Computing
- 2024: NLP, AI, Neural and Evolutionary, Computer Vision, DP&C Computing

NLP has stayed near the top throughout, but by the rankings above it was the single largest field only from 1998 to 2001 and again from 2022 to the present.

## Question 3: What statistical fields are projected to grow the most in the coming years?

In [141]:
# Using 2015 as a reference: average each category's percentage changes from
# that year onwards. The year is named once so the filter and the chart title
# cannot drift apart.
REFERENCE_YEAR = 2015

recent = cat_year_num.query("year >= @REFERENCE_YEAR")
recent_growth = recent.groupby('category')['pct'].mean().reset_index()

# Titled like the other figures in this notebook so the chart can be read on its own.
px.bar(
    recent_growth,
    x='category',
    y='pct',
    color='category',
    title="Average percentage change in publication count since " + str(REFERENCE_YEAR),
)

### Time Series Prediction (just sklearn)