# A Simple Notebook to Verify That the Luigi Pipeline Ran Correctly

In [16]:
# Import Relevant Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3
import re
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE

from nltk import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer, PorterStemmer

from gensim.matutils import Sparse2Corpus
from gensim import models

%matplotlib inline

**ImportData Check**

In [2]:
# Load the pickled DataFrame from the pipeline's data directory.
# Presumably this file is the output of the ImportData task named in the
# heading above -- confirm against the pipeline definition.
df = pd.read_pickle('../data/pipeline_data/df.pkl')

In [3]:
# Preview only the first row: enough to confirm the columns came through
# without dumping the full paper text of every record.
df.head(1)

Unnamed: 0,id,year,title,pdf_name,abstract,paper_text,author_id,author
0,1,1987,Self-Organization of Associative Database and ...,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,Hisashi Suzuki


**CleanData Check**

In [4]:
# Load the cleaned DataFrame and fail fast if the cleaned-text columns are
# absent, rather than relying only on eyeballing the one-row preview.
df_clean = pd.read_pickle('../data/pipeline_data/clean_df.pkl')

# Columns the cleaning step is expected to add (they appear in the preview
# output of this notebook).
_expected_clean_cols = {'paper_text_clean', 'title_clean', 'abstract_clean'}
_missing_clean_cols = _expected_clean_cols.difference(df_clean.columns)
assert not _missing_clean_cols, (
    'clean_df.pkl is missing cleaned columns: {}'.format(sorted(_missing_clean_cols))
)

In [5]:
# Preview a single row to confirm the *_clean columns sit alongside the
# original ones.
df_clean.head(1)

Unnamed: 0,id,year,title,pdf_name,abstract,paper_text,author_id,author,paper_text_clean,title_clean,abstract_clean
0,1,1987,Self-Organization of Associative Database and ...,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,Hisashi Suzuki,self organization of associative database and...,self organization of associative database and ...,abstract missing


**Vectorize Check**

In [6]:
# Unpickle the CountVectorizer stored by the pipeline.
# pickle.load can execute arbitrary code, so only point this at files
# produced by your own pipeline run.
# NOTE(review): the repr shows a custom tokenizer (tokenize_and_stem);
# unpickling presumably needs that function to be importable -- confirm.
with open('../data/pipeline_data/count_vectorizer.p', 'rb') as vectorizer_file:
    count_vectorizer = pickle.load(vectorizer_file)

In [7]:
# Display the vectorizer's repr so its settings (max_features, min_df,
# max_df, stop_words, tokenizer, ...) can be checked by eye.
count_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=1000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_and_stem at 0x146057840>,
        vocabulary=None)

In [8]:
# Unpickle the object stored in count_sparse.p (the cell output below shows
# it is a sparse matrix).
with open('../data/pipeline_data/count_sparse.p', 'rb') as matrix_file:
    count_sparse = pickle.load(matrix_file)

In [9]:
# Display the matrix summary (shape, dtype, stored-element count, format)
# as a quick sanity check on the loaded object.
count_sparse

<6557x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 2412571 stored elements in Compressed Sparse Row format>

In [10]:
# Unpickle the TfidfVectorizer stored by the pipeline.
# NOTE(review): like the count vectorizer, its repr shows the custom
# tokenizer tokenize_and_stem; presumably that must be importable -- confirm.
with open('../data/pipeline_data/tfidf_vectorizer.p', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

In [11]:
# Display the vectorizer's repr so its settings (norm, use_idf, smooth_idf,
# max_features, tokenizer, ...) can be checked by eye.
tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=1000, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_and_stem at 0x146057840>,
        use_idf=True, vocabulary=None)

In [12]:
# Unpickle the object stored in tfidf_sparse.p (the cell output below shows
# it is a sparse matrix).
with open('../data/pipeline_data/tfidf_sparse.p', 'rb') as matrix_file:
    tfidf_sparse = pickle.load(matrix_file)

In [13]:
# Display the matrix summary (shape, dtype, stored-element count, format)
# as a quick sanity check on the loaded object.
tfidf_sparse

<6557x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 2412571 stored elements in Compressed Sparse Row format>

**Nmf Check**

In [14]:
# Unpickle the NMF estimator stored by the pipeline.
# pickle.load can execute arbitrary code, so only point this at files
# produced by your own pipeline run.
with open('../data/pipeline_data/nmf_tfidf.p', 'rb') as model_file:
    nmf_tfidf = pickle.load(model_file)

In [15]:
# Display the estimator's repr so its hyperparameters (n_components, solver,
# alpha, l1_ratio, random_state, ...) can be checked by eye.
nmf_tfidf

NMF(alpha=0.1, beta=1, eta=0.1, init=None, l1_ratio=0.5, max_iter=200,
  n_components=10, nls_max_iter=2000, random_state=0, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)