# A Simple Notebook to Verify That the Luigi Pipeline Ran Correctly

In [16]:
# Import Relevant Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3
import re
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE

from nltk import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer, PorterStemmer

from gensim.matutils import Sparse2Corpus
from gensim import models

%matplotlib inline

**ImportData Check**

In [2]:
# Load the pickled DataFrame that this "ImportData Check" section inspects.
df = pd.read_pickle('../data/pipeline_data/df.pkl')

In [3]:
# A single row is enough to eyeball the column layout.
df.head(1)

Unnamed: 0,id,year,title,pdf_name,abstract,paper_text,author_id,author
0,1,1987,Self-Organization of Associative Database and ...,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,Hisashi Suzuki


**CleanData Check**

In [4]:
# Load the pickled DataFrame that this "CleanData Check" section inspects.
# NOTE(review): this is the only path in the notebook that goes through an
# `a/` subdirectory of pipeline_data — confirm it matches where the pipeline
# actually writes clean_df.pkl.
df_clean = pd.read_pickle('../data/pipeline_data/a/clean_df.pkl')

In [5]:
# A single row is enough to confirm which columns the cleaned frame carries.
df_clean.head(1)

Unnamed: 0,id,year,title,pdf_name,abstract,paper_text,author_id,author,paper_text_clean,title_clean,abstract_clean
0,1,1987,Self-Organization of Associative Database and ...,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,Hisashi Suzuki,self organization of associative database and...,self organization of associative database and ...,abstract missing


**Vectorize Check**

In [6]:
# Unpickle the CountVectorizer object stored under pipeline_data.
# NOTE(review): pickle stores functions by reference, so if the vectorizer
# carries a custom tokenizer function, the module defining it must be
# importable in this kernel for the load to succeed.
with open('../data/pipeline_data/count_vectorizer.p', 'rb') as pickle_file:
    count_vectorizer = pickle.load(pickle_file)

In [7]:
# Display the vectorizer's repr to review the parameters it was configured with.
count_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=1000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_and_stem at 0x146057840>,
        vocabulary=None)

In [8]:
# Unpickle the count matrix stored under pipeline_data.
with open('../data/pipeline_data/count_sparse.p', 'rb') as pickle_file:
    count_sparse = pickle.load(pickle_file)

In [9]:
# Display the matrix summary (shape, dtype, stored-element count) as a sanity check.
count_sparse

<6557x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 2412571 stored elements in Compressed Sparse Row format>

In [10]:
# Unpickle the TfidfVectorizer object stored under pipeline_data.
# NOTE(review): pickle stores functions by reference, so if the vectorizer
# carries a custom tokenizer function, the module defining it must be
# importable in this kernel for the load to succeed.
with open('../data/pipeline_data/tfidf_vectorizer.p', 'rb') as pickle_file:
    tfidf_vectorizer = pickle.load(pickle_file)

In [11]:
# Display the vectorizer's repr to review the parameters it was configured with.
tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=1000, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_and_stem at 0x146057840>,
        use_idf=True, vocabulary=None)

In [12]:
# Unpickle the TF-IDF matrix stored under pipeline_data.
with open('../data/pipeline_data/tfidf_sparse.p', 'rb') as pickle_file:
    tfidf_sparse = pickle.load(pickle_file)

In [13]:
# Display the matrix summary (shape, dtype, stored-element count) as a sanity check.
tfidf_sparse

<6557x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 2412571 stored elements in Compressed Sparse Row format>

**Nmf Check**

In [14]:
# Unpickle the NMF model object stored under pipeline_data.
with open('../data/pipeline_data/nmf_tfidf.p', 'rb') as pickle_file:
    nmf_tfidf = pickle.load(pickle_file)

In [15]:
# Display the NMF model's repr to review the parameters it was configured with.
nmf_tfidf

NMF(alpha=0.1, beta=1, eta=0.1, init=None, l1_ratio=0.5, max_iter=200,
  n_components=10, nls_max_iter=2000, random_state=0, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [17]:
# Unpickle the NMF topic-space array stored under pipeline_data.
# NOTE(review): the variable is named "scaled" while the file is named
# "nmf_topicspace_tfidf" — confirm the pickle really holds the scaled matrix.
with open('../data/pipeline_data/nmf_topicspace_tfidf.p', 'rb') as pickle_file:
    nmf_topicspace_scaled = pickle.load(pickle_file)

In [18]:
# Display the array; NumPy abbreviates large arrays with "..." in the repr.
nmf_topicspace_scaled

array([[ 0.07812171,  0.15419594, -0.02826964, ..., -0.35059613,
        -0.1091215 ,  0.46195149],
       [ 0.22540754,  0.33855354, -0.3855269 , ..., -0.35059613,
        -0.68336661, -0.36776831],
       [-0.23126946,  2.92429305, -0.3855269 , ..., -0.35059613,
        -0.19656832,  1.68654416],
       ..., 
       [ 2.95572308, -0.51938611, -0.26498189, ..., -0.35059613,
        -0.65931961, -0.43670437],
       [ 1.00869525, -0.65147523, -0.3855269 , ...,  5.78911531,
        -0.31897545, -0.43670437],
       [-0.31571017, -0.33321094, -0.37259611, ..., -0.35059613,
        -0.12779004,  0.14772887]])

**Tsne Check**

In [19]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.
    Only use this on files from a trusted source: unpickling can execute
    arbitrary code.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# t-SNE model objects and their embedding matrices, 2-D first and then 3-D.
# The load order matches the original cell.
tsne_model_2d_tfidf = _load_pickle('../data/pipeline_data/tsne_model_2d_tfidf.p')
tsne_matrix_2d_tfidf = _load_pickle('../data/pipeline_data/tsne_matrix_2d_tfidf.p')
tsne_model_3d_tfidf = _load_pickle('../data/pipeline_data/tsne_model_3d_tfidf.p')
tsne_matrix_3d_tfidf = _load_pickle('../data/pipeline_data/tsne_matrix_3d_tfidf.p')

In [20]:
# Display the 2-D t-SNE model's repr to review the parameters it was configured with.
tsne_model_2d_tfidf

TSNE(angle=0.5, early_exaggeration=4.0, init='random', learning_rate=1000.0,
   method='barnes_hut', metric='euclidean', min_grad_norm=1e-07,
   n_components=2, n_iter=1000, n_iter_without_progress=30,
   perplexity=30.0, random_state=0, verbose=0)

In [21]:
# Display the 2-D embedding array; NumPy abbreviates large arrays with "...".
tsne_matrix_2d_tfidf

array([[  2.6731425 ,  -1.93170056],
       [ -2.16763341,   9.29423999],
       [ -0.94516951,   3.85329969],
       ..., 
       [ -5.40176952,  -7.75070271],
       [ 10.51444543,  -4.51331548],
       [  4.19972232,   2.09218688]])

In [22]:
# Display the 3-D t-SNE model's repr to review the parameters it was configured with.
tsne_model_3d_tfidf

TSNE(angle=0.5, early_exaggeration=4.0, init='random', learning_rate=1000.0,
   method='barnes_hut', metric='euclidean', min_grad_norm=1e-07,
   n_components=3, n_iter=1000, n_iter_without_progress=30,
   perplexity=30.0, random_state=0, verbose=0)

In [23]:
# Display the 3-D embedding array; NumPy abbreviates large arrays with "...".
tsne_matrix_3d_tfidf

array([[-0.05245611,  2.47186519, -2.42409382],
       [ 1.09030163,  0.45468597, -4.05451731],
       [-3.22721421, -5.07296745, -2.83715782],
       ..., 
       [ 4.92467902,  5.08469332,  0.29310551],
       [-2.09705761,  4.04065517, -5.30997489],
       [ 1.02592699,  2.28198381,  4.15933009]])

In [24]:
# Load the pickled DataFrame holding the 2-D t-SNE results.
tsne_df_2d_tfidf = pd.read_pickle('../data/pipeline_data/tsne_df_2d_tfidf.pkl')

In [25]:
# Preview the first five rows (the pandas default, spelled out here).
tsne_df_2d_tfidf.head(n=5)

Unnamed: 0,X,Y,Year,Topic
0,2.673143,-1.931701,1987,5
1,-2.167633,9.29424,1987,1
2,-0.94517,3.8533,1987,1
3,-3.231773,11.287357,1987,0
4,-1.092127,6.978098,1987,1


In [26]:
# Load the pickled DataFrame holding the 3-D t-SNE results.
tsne_df_3d_tfidf = pd.read_pickle('../data/pipeline_data/tsne_df_3d_tfidf.pkl')

In [27]:
# Preview the first five rows (the pandas default, spelled out here).
tsne_df_3d_tfidf.head(n=5)

Unnamed: 0,X,Y,Z,Year,Topic
0,-0.052456,2.471865,-2.424094,1987,5
1,1.090302,0.454686,-4.054517,1987,1
2,-3.227214,-5.072967,-2.837158,1987,1
3,-0.558237,4.678076,0.095853,1987,0
4,1.758374,1.681996,-5.529217,1987,1
