# NIPS - Neural Information Processing Systems

## Objective: Use Natural Language Processing to discover trends in machine learning over time by analyzing research papers submitted to the NIPS conference from 1987 to 2016

In [18]:
# Import Relevant Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import sqlite3
import re
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import NMF, LatentDirichletAllocation, PCA
from sklearn.manifold import TSNE

from nltk import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer, PorterStemmer

from gensim.matutils import Sparse2Corpus
from gensim import models

%matplotlib inline

## For this notebook, we'll take a look at the pickled files created by the luigi pipeline

**Read Clean DataFrame File**

In [3]:
df_clean = pd.read_pickle('../data/pipeline_data/clean_df.pkl')

In [4]:
df_clean.reset_index(drop=True, inplace=True)

In [5]:
df_clean.head(n=3)

Unnamed: 0,id,year,title,pdf_name,abstract,paper_text,author_id,author,paper_text_clean,title_clean,abstract_clean
0,1,1987,Self-Organization of Associative Database and ...,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,Hisashi Suzuki,self organization of associative database and...,self organization of associative database and ...,abstract missing
1,2,1987,The Capacity of the Kanerva Associative Memory...,2-the-capacity-of-the-kanerva-associative-memo...,Abstract Missing,184\n\nTHE CAPACITY OF THE KANERVA ASSOCIATIVE...,3,Philip A. Chou,the capacity of the kanerva associative memor...,the capacity of the kanerva associative memory...,abstract missing
2,3,1987,Supervised Learning of Probability Distributio...,3-supervised-learning-of-probability-distribut...,Abstract Missing,52\n\nSupervised Learning of Probability Distr...,252,Eric B. Baum,supervised learning of probability distributi...,supervised learning of probability distributio...,abstract missing


**Read Vectorized Files**

In [6]:
with open('../data/pipeline_data/count_vectorizer.p','rb') as _in:
    count_vectorizer = pickle.load(_in)
with open('../data/pipeline_data/count_sparse.p','rb') as _in:
    count_sparse = pickle.load(_in)
with open('../data/pipeline_data/tfidf_vectorizer.p','rb') as _in:
    tfidf_vectorizer = pickle.load(_in)
with open('../data/pipeline_data/tfidf_sparse.p','rb') as _in:
    tfidf_sparse = pickle.load(_in)

**Let's take a look at how the sparse matrix looks like**

In [7]:
pd.DataFrame(count_sparse.toarray(), columns=count_vectorizer.get_feature_names()).head()

Unnamed: 0,abil,abl,abov,ac,accord,account,accur,accuraci,achiev,acknowledg,...,xt,yi,yield,yj,yn,york,yt,zero,zi,zt
0,0,0,1,0,0,0,0,0,2,0,...,4,3,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,5,0,...,0,1,0,0,1,1,0,0,0,0
2,1,2,0,0,0,1,1,0,1,1,...,0,0,2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,1,...,0,3,5,0,0,0,0,0,0,0
4,0,0,5,0,2,1,0,0,1,0,...,0,0,0,0,0,0,0,2,0,0


In [8]:
pd.DataFrame(tfidf_sparse.toarray(), columns=tfidf_vectorizer.get_feature_names()).head()

Unnamed: 0,abil,abl,abov,ac,accord,account,accur,accuraci,achiev,acknowledg,...,xt,yi,yield,yj,yn,york,yt,zero,zi,zt
0,0.0,0.0,0.007131,0.0,0.0,0.0,0.0,0.0,0.015412,0.0,...,0.059179,0.03675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.008567,0.0,0.0,0.0,0.0,0.0,0.0,0.034487,0.0,...,0.0,0.010965,0.0,0.0,0.015818,0.010866,0.0,0.0,0.0,0.0
2,0.019595,0.031426,0.0,0.0,0.0,0.018112,0.017577,0.0,0.012651,0.013011,...,0.0,0.0,0.030791,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006622,0.0,0.005069,...,0.0,0.023506,0.02999,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.02212,0.0,0.01087,0.006844,0.0,0.0,0.004781,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009632,0.0,0.0


## Create K-Means Model

In [9]:
def create_kmeans_model(sparse_matrix, num_clusters):
    '''
    Creates a sklearn kmeans model
    Input: sparse_matrix - Scipy Sparse Matrix 
           num_topics - Int
    Output: model - sklearn kmeans model
            clusters - List (Cluster label index for each sample)
            cluster_space - Numpy Array (Sparse Matrix transformed to cluster-distance space)
    '''
    model = KMeans(n_clusters=num_clusters, n_jobs=-1)
    model.fit(sparse_matrix)
    clusters = model.predict(sparse_matrix)
    cluster_space = model.transform(sparse_matrix)
    return model, clusters, cluster_space

In [10]:
kmeans_model, clusters, cluster_space = create_kmeans_model(tfidf_sparse, 10)

In [14]:
cluster_space[0]

array([ 1.06687375,  0.93124268,  0.93741028,  0.93665158,  1.0171091 ,
        0.94699071,  0.90523835,  1.01393171,  1.00278103,  1.0514043 ])

In [25]:
pca = PCA(n_components=100)

In [26]:
pca.fit(tfidf_sparse.toarray())

PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [28]:
sum(pca.explained_variance_ratio_)

0.53058270076162228