### This notebook wrangles the 'track' column for NIPS-2016 data to match it to NIPS-2019 tracks

The tracks from the 2019 'track' column have been aggregated into 9 'main_tracks'. 
This notebook attempts to correlate the 2016 tracks with the 2019 tracks so that the two label sets converge.   
- we will read in the track labels from the 2019 data
- we will split the 2019 track labels on the " -- " delimiter to build a track-to-main_track lookup to apply to the 2016 tracks
- we will then compare the two by grouping the data
- we will reconcile tracks with minor labeling differences across 2016 and 2019 
- the labels which can't be reconciled will be grouped under a 'Not Found/NF' main_track label for now

In [2]:
import pandas as pd
import re

In [3]:
# Load the full NIPS dataset.
nips = pd.read_csv("../data/nips.csv")

# Load the file that carries track information and keep only its 2019 rows.
nips19 = pd.read_csv("../data/nips_with_track_cleaned.csv").loc[
    lambda frame: frame['year'] == 2019
]

In [4]:
# Build a lookup from track name to main_track (canonical label) using the 2019 rows.

tracks19 = nips19.track_original.unique().tolist()

mt19 = nips19.main_track.unique().tolist()

# Each 2019 label is split on " -- ": the second piece becomes the key,
# the first piece becomes its value.
doc = {
    pieces[1]: pieces[0]
    for pieces in (label.split(" -- ") for label in tracks19)
}

# Every main_track label additionally maps to itself.
doc.update({label: label for label in mt19})

In [5]:
# Keep only the 2016 rows, as an independent copy of that slice of `nips`.

is_2016 = nips['year'] == 2016
nips16 = nips.loc[is_2016].copy()

In [6]:
# Preserve the untouched track label in its own column before 'track' is edited.

nips16 = nips16.assign(track_original=nips16['track'])

In [7]:
# Rewrite 2016 track labels so they conform to the NIPS 2019 track names.
# NOTE(review): 'Graph-based Learning' -> 'Graphical Models' looks like more than
# a spelling difference - confirm this pairing is intended.

renames_2016_to_2019 = {
    'Component Analysis (ICA,PCA,CCA, FLDA)': "Components Analysis (e.g., CCA, ICA, LDA, PCA)",
    'Ensemble Methods and Boosting': "Boosting and Ensemble Methods",
    'Game Theory and Econometrics': "Game Theory and Computational Economics",
    'Graph-based Learning': "Graphical Models",
    'Large Scale Learning and Big Data': "Large Scale Learning",
    'Matrix Factorization': "Matrix and Tensor Factorization",
    'Multi-task and Transfer Learning': "Multitask and Transfer Learning",
    'Nonlinear Dimension Reduction and Manifold Learning': "Nonlinear Dimensionality Reduction and Manifold Learning",
    'Regularization and Large Margin Methods': "Regularization",
    'Reinforcement Learning Algorithms': "Reinforcement Learning and Planning",
}

# Apply the renames one at a time, in the order listed above.
for old_label, new_label in renames_2016_to_2019.items():
    nips16.loc[nips16['track'] == old_label, 'track'] = new_label

In [8]:
# Look each 2016 track up in the dict to fill the main_track column.
# 'NF' ("not found") marks tracks that could not be matched to the 2019 track data.

tracks16 = nips16.track.tolist()

mt16 = [doc.get(label, 'NF') for label in tracks16]

nips16['main_track'] = mt16

In [9]:
# Row count per main_track after the first mapping pass.
main_track_sizes = nips16.groupby('main_track').size()
main_track_sizes

main_track
Algorithms                             418
Applications                            53
NF                                     856
Optimization                            95
Probabilistic Methods                  130
Reinforcement Learning and Planning     21
Theory                                 151
dtype: int64

In [10]:
# Collect the distinct tracks whose main_track is still 'NF' so they can be parsed individually.

nf_rows = nips16['main_track'] == 'NF'
nftracks = nips16.loc[nf_rows, 'track'].unique().tolist()

In [11]:
# Build a second dict for tracks that carry their main_track inside parentheses,
# and strip that parenthetical from the 'track' column.
new = {}

# Raw string avoids the invalid "\(" escape warning. [^()]* stops at the first ")"
# so a label with more than one parenthetical is not swallowed as a single match.
paren_pattern = re.compile(r"\(([^()]*)\)")

for t in nftracks:
    # Non-string values (e.g. NaN for a missing track) cannot hold a parenthetical.
    if not isinstance(t, str):
        continue

    found = paren_pattern.search(t)
    if found is None:
        # No complete "(...)" group, including a stray "(" without ")": leave the label alone.
        continue

    mt = found.group(1)                          # text inside the parentheses
    rt = t.replace(found.group(0), "").strip()   # label with the parenthetical removed
    new[rt] = mt
    nips16.loc[nips16['track'] == t, 'track'] = rt

In [12]:
# Display the track -> main_track pairs extracted from the parentheticals.
new

{'Bioinformatics and Systems Biology': 'Application',
 'Collaborative Filtering and Recommender Systems': 'Application',
 'Computer Vision': 'Application',
 'Information Retrieval': 'Application',
 'Natural Language and Text Processing': 'Application',
 'Object and Pattern Recognition': 'Application',
 'Privacy, Anonymity, and Security': 'Application',
 'Signal and Speech Processing': 'Application',
 'Social Networks': 'Application',
 'Web Applications and Internet Data': 'Application',
 'Language': 'Cognitive/Neuroscience',
 'Neural Coding': 'Cognitive/Neuroscience',
 'Perception': 'Cognitive/Neuroscience',
 'Reinforcement Learning': 'Cognitive/Neuroscience',
 'Theoretical Neuroscience': 'Cognitive/Neuroscience',
 'Applications': 'Other',
 'Bayesian Inference': 'Other',
 'Classification': 'Other',
 'Cognitive Science': 'Other',
 'Machine Learning Topics': 'Other',
 'Neuroscience': 'Other',
 'Optimization': 'Other',
 'Probabilistic Models and Methods': 'Other',
 'Regression': 'Other',


In [13]:
# Manual overrides for the dict, mainly for tracks whose parenthetical main_track
# was "Other" (more detail in the NF notes further down).

new.update({
    'Algorithms': 'Algorithms',
    'Applications': 'Applications',
    'Neuroscience': 'Neuroscience and Cognitive Science',
    'Cognitive Science': 'Neuroscience and Cognitive Science',
    'Classification': 'Algorithms',
    'Optimization': 'Optimization',
    'Probabilistic Models and Methods': 'Probabilistic Methods',
    'Robotics and Control': 'Applications',
    'Unsupervised Learning Methods': 'Algorithms',
    'Regression': 'Algorithms',
    # No 2019 counterpart chosen for these yet; they stay 'NF'.
    'Bayesian Inference': 'NF',
    'Machine Learning Topics': 'NF',
    'Statistics': 'NF',
})

In [14]:
# Combine both lookups; on shared keys the entry from `new` wins.

mdoc = dict(doc)
mdoc.update(new)

In [15]:
# Second pass: map every track through the merged track -> main_track dict.
# Tracks missing from the dict are labelled 'NF'.

tracks16 = nips16.track.tolist()
mt162 = [mdoc.get(label, 'NF') for label in tracks16]

nips16['main_track'] = mt162

In [16]:
# List the tracks still marked "not found" (NF); notes on them follow the output.

still_nf = nips16['main_track'] == 'NF'
nips16.loc[still_nf, 'track'].unique().tolist()

['Causality',
 'Deep Learning or Neural Networks',
 'Bayesian Inference',
 'Machine Learning Topics',
 'Statistics',
 'Sparsity and Feature Selection']

NF notes: 

    1. 'Causality' ---> similar to 'causal inference' in tracks_2019?
    2. 'Deep Learning or Neural Networks' --> matches: 'Deep Learning' in main_tracks_2019 and 'Memory-Augmented Neural Networks' for NN in tracks_2019
    3. 'Bayesian Inference' --> 'Bayesian Nonparametrics' in tracks_2019
    10. 'Machine Learning Topics' --> no match
    11. 'Statistics' --> no match

    14. 'Sparsity and Feature Selection' --> 2 options: 'Sparse Coding and Dimensionality Expansion' and 'Sparsity and Compressed Sensing'


In [17]:
# Preview the first five rows of the 2016 frame.
nips16.head(n=5)

Unnamed: 0,title,abstract,pdf_link,year,track,track_original,main_track
0,Adaptive optimal training of animal behavior,Neuroscience experiments often require trainin...,http://papers.nips.cc/paper/6344-adaptive-opti...,2016,Active Learning,Active Learning,Algorithms
1,Active Learning with Oracle Epiphany,We present a theoretical analysis of active le...,http://papers.nips.cc/paper/6155-active-learni...,2016,Active Learning,Active Learning,Algorithms
2,Cooperative Inverse Reinforcement Learning,For an autonomous system to be helpful to huma...,http://papers.nips.cc/paper/6420-cooperative-i...,2016,Active Learning,Active Learning,Algorithms
3,Safe Exploration in Finite Markov Decision Pro...,In classical reinforcement learning agents acc...,http://papers.nips.cc/paper/6358-safe-explorat...,2016,Active Learning,Active Learning,Algorithms
4,Active Learning from Imperfect Labelers,We study active learning where the labeler can...,http://papers.nips.cc/paper/6162-active-learni...,2016,Active Learning,Active Learning,Algorithms


In [18]:
# Harmonise main_track labels that came out of the parentheticals:
#   'Application'            is the same track as 'Applications'
#   'Cognitive/Neuroscience' is the same track as 'Neuroscience and Cognitive Science'

main_track_fixes = {
    'Application': "Applications",
    'Cognitive/Neuroscience': "Neuroscience and Cognitive Science",
}

nips16['main_track'] = nips16['main_track'].replace(main_track_fixes)

In [19]:
# Check the data: row count per main_track after harmonising the labels.

main_track_sizes_final = nips16.groupby("main_track").size()
main_track_sizes_final

main_track
Algorithms                             523
Applications                           278
NF                                     309
Neuroscience and Cognitive Science      94
Optimization                           156
Probabilistic Methods                  192
Reinforcement Learning and Planning     21
Theory                                 151
dtype: int64

In [20]:
# Review what is left under NF: row count per track.

nf_only = nips16.loc[nips16['main_track'] == 'NF']
nf_only.groupby('track').size()

track
Bayesian Inference                   26
Causality                             8
Deep Learning or Neural Networks    147
Machine Learning Topics              29
Sparsity and Feature Selection       52
Statistics                           47
dtype: int64

In [21]:
# Append the 2016 rows to the cleaned file.
# This is the same file the notebook reads its 2019 labels from, and the append
# writes no header, so running the notebook again would duplicate every 2016 row.
# Guard on the file's 'year' column so the append happens at most once.
cleaned_path = "../data/nips_with_track_cleaned.csv"

already_written = (pd.read_csv(cleaned_path, usecols=['year'])['year'] == 2016).any()

if already_written:
    print("2016 rows already present in " + cleaned_path + "; skipping append")
else:
    # NOTE(review): rows are written positionally without a header - this assumes
    # nips16's column order matches the header of the existing file; confirm.
    nips16.to_csv(cleaned_path, mode='a', index=False, header=False)

In [22]:
# One row per distinct (track, track_original, main_track, year) combination.
document = nips16[['track', 'track_original', 'main_track', 'year']].drop_duplicates()

# Appended without header or index to the year-wise track info file.
# NOTE(review): because this appends, re-running the notebook adds the same rows again.
document.to_csv("../data/nips_yearwise_trackinfo.csv", mode='a', index= False, header= False)