# Notebook for computational research analysis

In [15]:
import numpy as np
import pandas as pd
import sklearn as sk
import statsmodels.api as sm
import plotly.express as px
import statsmodels.tsa.api
from statsmodels.tsa.api import VAR

In [16]:
# Load the dataset used by every later cell. The path is relative, so it is
# resolved against the kernel's current working directory.
comp_data = pd.read_csv(filepath_or_buffer='comp_data.csv')

In [17]:
# Fold the legacy Computation and Language label into the current one so that
# both are treated as a single category by the cells below.
legacy_label = 'Computation and Language (Legacy category)'
current_label = 'Computation and Language (Natural Language Processing)'
comp_data['category'] = comp_data['category'].replace({legacy_label: current_label})

# Show the distinct category labels that remain after the relabelling.
print(comp_data['category'].unique())

['Computation and Language (Natural Language Processing)'
 'Distributed, Parallel, and Cluster Computing'
 'Neural and Evolutionary Computing' 'Computer Science and Game Theory'
 'Computer Vision and Pattern Recognition' 'Other Computer Science'
 'Computational Physics']


## Question 1 

In [18]:
# Number of papers per (category, year).
# `size()` counts the rows of each group directly, so the result does not
# depend on the incidental 'Unnamed: 0' column being present or free of
# missing values (the previous `.count()['Unnamed: 0']` did).
cat_year_num = comp_data.groupby(['category', 'year']).size().reset_index(name="count")

# groupby sorts its keys by default, so rows are ordered by category, then year.
print(pd.unique(cat_year_num.category))

['Computation and Language (Natural Language Processing)'
 'Computational Physics' 'Computer Science and Game Theory'
 'Computer Vision and Pattern Recognition'
 'Distributed, Parallel, and Cluster Computing'
 'Neural and Evolutionary Computing' 'Other Computer Science']


In [21]:
# Pin every category to one colour of Plotly's default qualitative palette so
# that figures built from this mapping draw a category in the same colour.
default = px.colors.qualitative.Plotly

category_order = [
    'Computation and Language (Natural Language Processing)',
    'Computational Physics',
    'Computer Science and Game Theory',
    'Computer Vision and Pattern Recognition',
    'Distributed, Parallel, and Cluster Computing',
    'Neural and Evolutionary Computing',
    'Other Computer Science',
]

# zip() stops at the shorter input, so only the first len(category_order)
# palette entries are used: category i gets default[i].
color_dict = dict(zip(category_order, default))

### Bar plots for individual computational topics

In [24]:
def plot_category_bar(category):
    """Return a bar chart of yearly paper counts for one category.

    Parameters
    ----------
    category : str
        A value of the ``category`` column of ``cat_year_num``. It must also
        be a key of ``color_dict``, which supplies the bar colour.

    Returns
    -------
    The figure object produced by ``px.bar``.
    """
    return px.bar(
        cat_year_num.query("category == @category"),
        x='year',
        y='count',
        title=category + " Papers by Year",
        color_discrete_sequence=[color_dict[category]],
    )


# One chart per category, replacing seven copy-pasted cells that differed only
# in the category name. Iterating over color_dict keeps the original order, and
# `top` ends up holding the last category exactly as it did before.
# Inside a loop a figure is not displayed automatically, hence the explicit show().
for top in color_dict:
    plot_category_bar(top).show()

### Calculating percent increases per subtopic and identifying the max output and its associated year

In [33]:
# Distinct category labels, in the order they first appear in cat_year_num.
topics = cat_year_num['category'].unique()

In [35]:
# Percentage change in paper count from one row to the next, computed within
# each category so that the first row of a category is NaN instead of being
# compared with the last row of the previous category.
# The change is relative to the category's previous row, i.e. the previous year
# that has data for that category, which need not be the previous calendar year.
pct_change = cat_year_num.groupby(['category'])['count'].pct_change() * 100

# The result is already a Series aligned on cat_year_num's index, so it can be
# assigned directly. This adds the 'pct' column to cat_year_num in place.
cat_year_num['pct'] = pct_change
cat_year_num

Unnamed: 0,category,year,count,pct
0,Computation and Language (Natural Language Pro...,1994,161,
1,Computation and Language (Natural Language Pro...,1995,224,39.130435
2,Computation and Language (Natural Language Pro...,1996,201,-10.267857
3,Computation and Language (Natural Language Pro...,1997,163,-18.905473
4,Computation and Language (Natural Language Pro...,1998,23,-85.889571
...,...,...,...,...
139,Other Computer Science,2017,2,100.000000
140,Other Computer Science,2018,1,-50.000000
141,Other Computer Science,2019,2,100.000000
142,Other Computer Science,2020,4,100.000000


In [36]:
# Draw one line chart of the 'pct' column per category, each in the colour
# assigned to that category in color_dict.
for x in topics:
    topic_rows = cat_year_num[cat_year_num['category'] == x]
    fig = px.line(
        topic_rows,
        x="year",
        y="pct",
        title="Percentage change in publication count: " + x,
        color_discrete_sequence=[color_dict[x]],
    )
    fig.show()

In [37]:
# The same 'pct' series for all categories in a single figure: one facet
# column per category, coloured by category.
px.line(
    cat_year_num,
    x="year",
    y="pct",
    color='category',
    facet_col='category',
    title="Percentage change in publication count: All topics",
)

In [38]:
# Mean of the 'pct' column per category. mean() skips NaN by default, so the
# NaN on each category's first row does not enter the average.
mean_pct_by_category = cat_year_num.groupby('category')['pct'].mean()
avg_pct_growth = mean_pct_by_category.reset_index()

px.bar(
    avg_pct_growth,
    x='category',
    y='pct',
    color='category',
)

In [39]:
# First and last 'count' value of every category, taken in the existing row
# order of cat_year_num, plus the ratio last / first.
first_last = cat_year_num.groupby('category')['count'].agg(['first', 'last'])
count_groups = first_last.assign(
    ratio=first_last['last'] / first_last['first']
).reset_index()
count_groups

Unnamed: 0,category,first,last,ratio
0,Computation and Language (Natural Language Pro...,161,3467,21.534161
1,Computational Physics,1,3,3.0
2,Computer Science and Game Theory,4,9,2.25
3,Computer Vision and Pattern Recognition,2,498,249.0
4,"Distributed, Parallel, and Cluster Computing",1,13,13.0
5,Neural and Evolutionary Computing,10,708,70.8
6,Other Computer Science,1,3,3.0


In [40]:
# Ratio of each category's paper count in its last recorded row to the count in
# its first recorded row (count_groups['ratio']). The last recorded year comes
# from the data and is not necessarily 2024 for every category.
px.bar(
    count_groups,
    x='category',
    y='ratio',
    color='category',
    title="Ratio of last-year to first-year paper count by category",
)

## Question 2

How have the top fields changed? Look at the top categories per year in paper output

In [42]:
# The (up to) four categories with the highest paper count in every year.
# Sorting by year and then by descending count, followed by taking the first
# four rows of each year, gives the same rows in the same order as the former
# groupby('year').apply(lambda g: g.nlargest(4, 'count')). It avoids
# apply() operating on the grouping column, which newer pandas versions
# deprecate and warn about.
top_4 = (
    cat_year_num
    .sort_values(['year', 'count'], ascending=[True, False])
    .groupby('year')
    .head(4)
    .reset_index(drop=True)
)
top_4





Unnamed: 0,category,year,count,pct
0,Computation and Language (Natural Language Pro...,1994,161,
1,Computation and Language (Natural Language Pro...,1995,224,39.130435
2,Computation and Language (Natural Language Pro...,1996,201,-10.267857
3,Computation and Language (Natural Language Pro...,1997,163,-18.905473
4,Computation and Language (Natural Language Pro...,1998,23,-85.889571
...,...,...,...,...
104,"Distributed, Parallel, and Cluster Computing",2023,22,10.000000
105,Computation and Language (Natural Language Pro...,2024,3467,-19.689599
106,Neural and Evolutionary Computing,2024,708,18.196995
107,Computer Vision and Pattern Recognition,2024,498,-82.470961


In [43]:
# Bars of the per-year top-4 rows over the full year range, coloured by category.
px.bar(
    top_4,
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: All years",
)

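The all-years chart above uses a single y-axis, so years with small counts are hard to read next to the most recent years. The charts below split the years into smaller ranges so that each range is plotted on its own scale.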

In [44]:
# First range: every year up to and including 2011.
px.bar(
    top_4.loc[top_4['year'] <= 2011],
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: Up until 2011",
)

In [45]:
# Second range: 2011 through 2015. between() includes both end points.
px.bar(
    top_4.loc[top_4['year'].between(2011, 2015)],
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: 2011 - 2015",
)

In [46]:
# Third range: 2015 through 2020. between() includes both end points.
px.bar(
    top_4.loc[top_4['year'].between(2015, 2020)],
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: 2015 - 2020",
)

In [47]:
# Last range: 2020 and every later year present in top_4.
px.bar(
    top_4.loc[top_4['year'] >= 2020],
    x='year',
    y='count',
    color='category',
    title="Top 4 categories by paper output: 2020 - 2024",
)

### One definitive top topics list

- 2005 : Statistics Theory
- 2006 : NA
- 2007 : Machine Learning, Statistics Theory, Methodology, Applications
- 2008 : Machine Learning, (TIE) Statistics Theory and Methodology, Computation
- 2009 : Machine Learning, Statistics Theory, Methodology, Computation
- 2010 : Machine Learning, Methodology, TIE (Statistics Theory and Computation)
- 2011 : Machine Learning, Methodology, Computation, Applications
- 2012 : Machine Learning, Methodology, Statistics Theory, Computation
- 2013 : Machine Learning, Methodology, Computation, Applications
- 2014 : Machine Learning, Methodology, Computation, Statistics Theory
- 2015 : Machine Learning, TIE (Methodology and Computation), Applications 
- 2016 : Machine Learning, Methodology, Applications, Computation
- 2017 : Machine Learning, Methodology, Applications, Statistics Theory
- 2018 : Machine Learning, Methodology, Computation, Applications
- 2019 : Machine Learning, Methodology, Statistics Theory, Applications
- 2020 : Machine Learning, Methodology, Applications, Statistics Theory
- 2021 : Machine Learning, Methodology, Applications, Statistics Theory
- 2022 : Machine Learning, Methodology, Applications, Computation
- 2023 : Machine Learning, Methodology, Applications, Computation
- 2024 : Machine Learning, Methodology, TIE (Applications and Computation)