In [47]:
cd '/home/jovyan/lib'

/home/jovyan/lib


In [48]:
import re
import requests
import db_helper as db
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import seaborn as sns
%matplotlib inline

In [49]:
connection, cursor = db.connect_to_db()

In [85]:
get_some_docs = """
with tb1 as( SELECT A.PAGEID, A.TITLE, A.PAGE_TEXT, C.CATEGORY_NAME FROM PAGE_DATA A, CATEGORY_DATA B, CATEGORY_INFO C
WHERE A.PAGEID = B.PAGEID AND B.MAINCATEGORYID = C.CATEGORYID AND C.CATEGORY_NAME = 'machine_learning' limit 500),
tb2 as 
(SELECT A.PAGEID, A.PAGE_TEXT, A.TITLE, C.CATEGORY_NAME FROM PAGE_DATA A, CATEGORY_DATA B, CATEGORY_INFO C
WHERE A.PAGEID = B.PAGEID AND B.MAINCATEGORYID = C.CATEGORYID AND C.CATEGORY_NAME = 'business_software' limit 500)
select * from tb1 union all select * from tb2;
"""

In [86]:
df = db.query_to_dataframe(get_some_docs)

In [87]:
df.shape

(1000, 4)

In [88]:
df.head()

Unnamed: 0,category_name,page_text,pageid,title
0,machine_learning,Data exploration is an approach similar to ini...,43385931,Data exploration
1,machine_learning,These datasets are used for machine learning r...,49082762,List of datasets for machine learning research
2,machine_learning,These datasets are used for machine learning r...,49082762,List of datasets for machine learning research
3,machine_learning,Machine learning is the subfield of computer s...,233488,Machine learning
4,machine_learning,The following outline is provided as an overvi...,53587467,Outline of machine learning


### Perform SVD

In [90]:
n_components = 500
algorithm = 'randomized'
random_state = 42
ngram_range=(1,2)
min_df = 1
max_df = .7

In [91]:
#make the pipeline
svd_pipe = Pipeline([
    ('tfidf_vec', TfidfVectorizer( ngram_range, max_df=max_df, min_df=min_df, stop_words = 'english')),
    ('trun_svd', TruncatedSVD(n_components=n_components, algorithm = algorithm, random_state = random_state)),
    ('normalizer', Normalizer(copy=False))
])

In [92]:
#fit and transform model
#after fit and transform the model is trained and ready to find related documents given search term.
svd_matrix = svd_pipe.fit_transform(df['page_text'])

In [93]:
svd_matrix.shape

(1000, 500)

In [94]:
#test the code
#make the search query into a query vector
query = ['Data exploration is an approach machine learning is a science']
query_vector = svd_pipe.transform(query)

In [95]:
query_vector.shape

(1, 500)

In [96]:
tmp = pd.DataFrame(np.dot(svd_matrix, query_vector.T))

In [97]:
df['cosine_distance'] = tmp[0]

In [107]:
df.sort_values(['cosine_distance'], ascending = False).head(5)

Unnamed: 0,category_name,page_text,pageid,title,cosine_distance,cos_distance
0,machine_learning,Data exploration is an approach similar to ini...,43385931,Data exploration,0.718926,0.314657
3,machine_learning,Machine learning is the subfield of computer s...,233488,Machine learning,0.643819,0.36673
4,machine_learning,The following outline is provided as an overvi...,53587467,Outline of machine learning,0.580284,0.408299
185,machine_learning,Meta learning is a subfield of Machine learnin...,4615464,Meta learning (computer science),0.570281,0.479387
73,machine_learning,Data pre processing is an important step in th...,12386904,Data pre-processing,0.5507,0.610937


In [109]:
return_df = pd.DataFrame(df.sort_values(['cosine_distance'], ascending = False).head(5))
return_df.drop(['category_name','page_text', 'cosine_distance'], axis = 1, inplace = True)
return_df

Unnamed: 0,pageid,title
0,43385931,Data exploration
3,233488,Machine learning
4,53587467,Outline of machine learning
185,4615464,Meta learning (computer science)
73,12386904,Data pre-processing


## Code Below is for NO Pipeline for my own learning/understanding