In [47]:
cd '/home/jovyan/lib'

/home/jovyan/lib


In [48]:
import re
import requests
import db_helper as db
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import seaborn as sns
%matplotlib inline

In [49]:
#connection, cursor = db.connect_to_db()

In [85]:
get_some_docs = """
with tb1 as( SELECT A.PAGEID, A.TITLE, A.PAGE_TEXT, C.CATEGORY_NAME FROM PAGE_DATA A, CATEGORY_DATA B, CATEGORY_INFO C
WHERE A.PAGEID = B.PAGEID AND B.MAINCATEGORYID = C.CATEGORYID AND C.CATEGORY_NAME = 'machine_learning' limit 500),
tb2 as 
(SELECT A.PAGEID, A.PAGE_TEXT, A.TITLE, C.CATEGORY_NAME FROM PAGE_DATA A, CATEGORY_DATA B, CATEGORY_INFO C
WHERE A.PAGEID = B.PAGEID AND B.MAINCATEGORYID = C.CATEGORYID AND C.CATEGORY_NAME = 'business_software' limit 500)
select * from tb1 union all select * from tb2;
"""

In [86]:
df = db.query_to_dataframe(get_some_docs)
df2 = df.copy(deep=True)

In [87]:
df.shape

(1000, 4)

In [88]:
df.head()

Unnamed: 0,category_name,page_text,pageid,title
0,machine_learning,Data exploration is an approach similar to ini...,43385931,Data exploration
1,machine_learning,These datasets are used for machine learning r...,49082762,List of datasets for machine learning research
2,machine_learning,These datasets are used for machine learning r...,49082762,List of datasets for machine learning research
3,machine_learning,Machine learning is the subfield of computer s...,233488,Machine learning
4,machine_learning,The following outline is provided as an overvi...,53587467,Outline of machine learning


In [89]:
df2.head()

Unnamed: 0,category_name,page_text,pageid,title
0,machine_learning,Data exploration is an approach similar to ini...,43385931,Data exploration
1,machine_learning,These datasets are used for machine learning r...,49082762,List of datasets for machine learning research
2,machine_learning,These datasets are used for machine learning r...,49082762,List of datasets for machine learning research
3,machine_learning,Machine learning is the subfield of computer s...,233488,Machine learning
4,machine_learning,The following outline is provided as an overvi...,53587467,Outline of machine learning


### Perform SVD

In [90]:
n_components = 500
algorithm = 'randomized'
random_state = 42
ngram_range=(1,2)
min_df = 1
max_df = .7

In [91]:
#make the pipeline
svd_pipe = Pipeline([
    ('tfidf_vec', TfidfVectorizer( ngram_range, max_df=max_df, min_df=min_df, stop_words = 'english')),
    ('trun_svd', TruncatedSVD(n_components=n_components, algorithm = algorithm, random_state = random_state)),
    ('normalizer', Normalizer(copy=False))
])

In [92]:
#fit and transform model
#after fit and transform the model is trained and ready to find related documents given search term.
svd_matrix = svd_pipe.fit_transform(df['page_text'])

In [93]:
svd_matrix.shape

(1000, 500)

In [94]:
#make the search query into a query vector
query = ['Data exploration is an approach machine learning is a science']
query_vector = svd_pipe.transform(query)

In [95]:
query_vector.shape

(1, 500)

In [96]:
tmp = pd.DataFrame(np.dot(svd_matrix, query_vector.T))

In [97]:
df['cosine_distance'] = tmp[0]

In [107]:
df.sort_values(['cosine_distance'], ascending = False).head(5)

Unnamed: 0,category_name,page_text,pageid,title,cosine_distance,cos_distance
0,machine_learning,Data exploration is an approach similar to ini...,43385931,Data exploration,0.718926,0.314657
3,machine_learning,Machine learning is the subfield of computer s...,233488,Machine learning,0.643819,0.36673
4,machine_learning,The following outline is provided as an overvi...,53587467,Outline of machine learning,0.580284,0.408299
185,machine_learning,Meta learning is a subfield of Machine learnin...,4615464,Meta learning (computer science),0.570281,0.479387
73,machine_learning,Data pre processing is an important step in th...,12386904,Data pre-processing,0.5507,0.610937


In [109]:
return_df = pd.DataFrame(df.sort_values(['cosine_distance'], ascending = False).head(5))
return_df.drop(['category_name','page_text', 'cosine_distance'], axis = 1, inplace = True)
return_df

Unnamed: 0,pageid,title
0,43385931,Data exploration
3,233488,Machine learning
4,53587467,Outline of machine learning
185,4615464,Meta learning (computer science)
73,12386904,Data pre-processing


In [64]:
#after transformation above, we just need some kind of similarity measure.  
#we will try cosine distance
distance_matrix = pairwise_distances(svd_matrix, 
                                     query_vector, 
                                     metric='cosine', 
                                     n_jobs=-1)

In [65]:
distance_matrix.shape

(1000, 1)

In [104]:
df['cos_distance'] = distance_matrix

In [67]:
df.sort_values(['cos_distance'],ascending= True).head(10)

Unnamed: 0,category_name,page_text,pageid,cosine_distance,cos_distance
0,machine_learning,Data exploration is an approach similar to ini...,43385931,0.685343,0.314657
3,machine_learning,Machine learning is the subfield of computer s...,233488,0.63327,0.36673
4,machine_learning,The following outline is provided as an overvi...,53587467,0.591701,0.408299
185,machine_learning,Meta learning is a subfield of Machine learnin...,4615464,0.520613,0.479387
8,machine_learning,Active learning is a special case of semi supe...,28801798,0.511588,0.488412
550,business_software,Data Preparation is the act of preparing or pr...,54133478,0.467157,0.532843
76,machine_learning,Deep Feature Synthesis is an algorithm develop...,48290454,0.420777,0.579223
175,machine_learning,Machine Learning a subfield of Computer Scienc...,53970843,0.392151,0.607849
211,machine_learning,In machine learning systems which employ offli...,10748030,0.389395,0.610605
73,machine_learning,Data pre processing is an important step in th...,12386904,0.389063,0.610937


In [105]:
return_df = pd.DataFrame(df.sort_values(['cosine_distance'],ascending= False).head(5))

In [106]:
return_df.drop(['category_name, page_text','cosine_distance', 'cos_distance'])

ValueError: labels ['category_name, page_text' 'cosine_distance' 'cos_distance'] not contained in axis

## Code Below is for NO Pipeline for my own learning/understanding

In [68]:
#df2['page_text'] is the name of dataframe where the data from postgres is stored


In [69]:
#Tokenize and vectorize using TfidfVectorizer
#ngram_range, max_df, min_df defined above in the pipeline implementation

tfidf_vec = TfidfVectorizer(ngram_range=ngram_range, max_df=max_df, min_df=min_df, stop_words='english')

In [70]:
#SVD 
#n_components, algorithm, random_state defined above in pipeline implementation
svd = TruncatedSVD(n_components = n_components, algorithm = algorithm ,random_state = random_state)

In [71]:
#normalize result
normalize = Normalizer(copy = False)

In [72]:
#now we try to plug in the data
document_term_matrix_sparse = tfidf_vec.fit_transform(df2['page_text'])

In [73]:
#we can create a dense dataframe(not needed) to check things out
document_term_matrix_dense = pd.DataFrame(document_term_matrix_sparse.toarray(),
                                          index = df2.index,
                                          columns=tfidf_vec.get_feature_names())

In [99]:
document_term_matrix_dense.shape

(1000, 241297)

In [74]:
#concatenate the dense document term matrix with df['page_text'] to see 
#the document term matrix for each page_text

pd.concat([df2['page_text'],document_term_matrix_dense],axis = 1).sample(1)

Unnamed: 0,page_text,aaa,aaa cvp,aaai,aaai classic,aaai conferences,aaai proceedings,aaai rosenfeld,aaajxb,aaajxb htm,...,zurich reception,zusammenarbeit,zusammenarbeit german,zwanziger,zwanziger assessment,zwanziger large,zwischen,zwischen moneybee,zx,zx learning
464,Learning rule or Learning process is a method ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
#now we can plug the SPARSE df from TfidfVectorizer into TruncatedSVD to fit_transform
svd_model= svd.fit_transform(document_term_matrix_sparse)

In [76]:
#now we nomalize
svd_normalized = normalize.fit_transform(svd_model)


In [77]:
#FIRST: We need to tokenize the query list using the tokenizer fit_transform from above. 
#Make sure do only 'TRANSFORM'
#SECOND: We need to perform SVD on the query list using the svd_model fit_transform from above.  
#Make sure do only "TRANSFORM"
query1 = ['Data exploration is an approach machine learning is a science']
query1_vector = tfidf_vec.transform(query1)
query1_matrix = svd.transform(query1_vector)

In [101]:
query1_matrix.shape

(1, 500)

In [78]:
tmp2 = pd.DataFrame(np.dot(svd_normalized,query1_matrix.T))

In [79]:
df2['cosine_distance'] = tmp2[0]

In [80]:
df2.sort_values(['cosine_distance'], ascending = False).head(10)

Unnamed: 0,category_name,page_text,pageid,cosine_distance
0,machine_learning,Data exploration is an approach similar to ini...,43385931,0.228816
3,machine_learning,Machine learning is the subfield of computer s...,233488,0.185774
4,machine_learning,The following outline is provided as an overvi...,53587467,0.169408
550,business_software,Data Preparation is the act of preparing or pr...,54133478,0.139975
175,machine_learning,Machine Learning a subfield of Computer Scienc...,53970843,0.124173
185,machine_learning,Meta learning is a subfield of Machine learnin...,4615464,0.117005
9,machine_learning,Adversarial machine learning is a research fie...,45049676,0.107293
8,machine_learning,Active learning is a special case of semi supe...,28801798,0.102264
118,machine_learning,In machine learning feature learning or repres...,38870173,0.094404
134,machine_learning,Supervised learning is the machine learning ta...,20926,0.081478


In [81]:
#we normalize the query_matrix just like we did before
query1_normalized = normalize.transform(query1_matrix)

In [82]:
#after we normalize everything, now we can find similar terms using cosine distance
distance_matrix2 = pairwise_distances(svd_normalized,
                                      query1_normalized,
                                     metric='cosine', 
                                     n_jobs=-1)

In [45]:
distance_matrix2.shape

(1000, 1)

In [84]:
#take the transpose of distance_matrix2 and merge with df2 dataframe to find 
#the pages that resembles the query text the most
df2['cos_distance']=  distance_matrix2
df2.sort_values(['cos_distance'], ascending = True).head(10)


Unnamed: 0,category_name,page_text,pageid,cosine_distance,cos_distance
0,machine_learning,Data exploration is an approach similar to ini...,43385931,0.228816,0.208849
3,machine_learning,Machine learning is the subfield of computer s...,233488,0.185774,0.357672
4,machine_learning,The following outline is provided as an overvi...,53587467,0.169408,0.414257
550,business_software,Data Preparation is the act of preparing or pr...,54133478,0.139975,0.516025
175,machine_learning,Machine Learning a subfield of Computer Scienc...,53970843,0.124173,0.570663
185,machine_learning,Meta learning is a subfield of Machine learnin...,4615464,0.117005,0.595447
9,machine_learning,Adversarial machine learning is a research fie...,45049676,0.107293,0.629027
8,machine_learning,Active learning is a special case of semi supe...,28801798,0.102264,0.646415
118,machine_learning,In machine learning feature learning or repres...,38870173,0.094404,0.67359
134,machine_learning,Supervised learning is the machine learning ta...,20926,0.081478,0.718283


In [None]:
plt.figure(figsize=(6,6))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

plt.scatter(pc_1, pc_2, c=df['category_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.1,.8)
plt.ylim(-.5,.8)

In [None]:
plt.figure(figsize=(6,6))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

strings = df['page_text'].values
for i, (x, y) in enumerate(zip(pc_1, pc_2)): 
    plt.text(x,y,strings[i][:10])

plt.scatter(pc_1, pc_2, c=df['category_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.1,.8)
plt.ylim(-.5,.8)