In [3]:
cd '/home/jovyan/lib'

/home/jovyan/lib


In [56]:
!pip install --upgrade pandas

Requirement already up-to-date: pandas in /opt/conda/lib/python3.6/site-packages
Requirement already up-to-date: python-dateutil>=2 in /opt/conda/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: numpy>=1.7.0 in /opt/conda/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: pytz>=2011k in /opt/conda/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil>=2->pandas)


In [4]:
import re
import requests
import db_helper as db
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.externals import joblib

%matplotlib inline

In [5]:
get_some_docs = """
with tb1 as( SELECT DISTINCT A.PAGEID, A.PAGE_TEXT, C.CATEGORY_NAME FROM PAGE_DATA A, CATEGORY_DATA B, CATEGORY_INFO C
WHERE A.PAGEID = B.PAGEID AND B.MAINCATEGORYID = C.CATEGORYID AND C.CATEGORY_NAME = 'machine_learning' limit 1000),
tb2 as 
(SELECT DISTINCT A.PAGEID, A.PAGE_TEXT, C.CATEGORY_NAME FROM PAGE_DATA A, CATEGORY_DATA B, CATEGORY_INFO C
WHERE A.PAGEID = B.PAGEID AND B.MAINCATEGORYID = C.CATEGORYID AND C.CATEGORY_NAME = 'business_software' limit 1000)
select * from tb1 union all select * from tb2;
"""

In [6]:
df = db.query_to_dataframe(get_some_docs)


In [13]:
n_components = 500
algorithm = 'randomized'
random_state = 42
ngram_range=(1,2)
min_df = 1
max_df = .7

In [14]:
tfidf_vec = TfidfVectorizer(ngram_range, max_df = max_df, min_df = min_df, stop_words = 'english')

In [15]:
X = tfidf_vec.fit_transform(df['page_text'])

In [7]:
type(df['page_text'])

pandas.core.series.Series

In [16]:
#make a new DENSE dataframe called text_df from SPARSE matrix X(from tfidf_vectorizer) and put the column headings
#on the dataframe and put the category as index so it can be used easily
text_df = pd.DataFrame(X.toarray(), columns=tfidf_vec.get_feature_names())
text_df.index = df['category_name']

In [17]:
text_df.head()

Unnamed: 0_level_0,aa,aaa,aaaa,aaabc,aaabcaaabcaaabcaaabc,aaai,aaas,aab,aachen,aaf,...,zuzax,zwischen,zx,zylberberg,zynga,zynx,zynxambulatory,zynxcare,zynxevidence,zynxorder
category_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
machine_learning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
machine_learning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
machine_learning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
machine_learning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
machine_learning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
pd.__version__

'0.20.3'

In [19]:
mnb = MultinomialNB()

In [20]:
model = mnb.fit(text_df, text_df.index)

In [21]:
model.classes_

array(['business_software', 'machine_learning'],
      dtype='<U17')

In [22]:
mnb.classes_[0]

'business_software'

In [72]:
predicted = model.predict_proba(text_df)

In [73]:
df[mnb.classes_[0] + '_proba']=predicted[:,0]

In [74]:
df[mnb.classes_[1] + '_proba']=predicted[:,1]

In [75]:
predicted

array([[ 0.00104207,  0.99895793],
       [ 0.00281419,  0.99718581],
       [ 0.00357713,  0.99642287],
       ..., 
       [ 0.99454731,  0.00545269],
       [ 0.99156283,  0.00843717],
       [ 0.90729955,  0.09270045]])

In [76]:
df.head()

Unnamed: 0,category_name,page_text,pageid,business_software_proba,machine_learning_proba
0,machine_learning,In statistics and data mining k medians cluste...,12069242,0.001042,0.998958
1,machine_learning,A multiple sequence alignment MSA is a sequenc...,4066308,0.002814,0.997186
2,machine_learning,The term hybrid neural network can have two me...,3907217,0.003577,0.996423
3,machine_learning,In applied probability a population process is...,239384,0.078872,0.921128
4,machine_learning,Yann LeCun born is a computer scientist wi...,23534873,0.007878,0.992122


In [77]:
df.tail()

Unnamed: 0,category_name,page_text,pageid,business_software_proba,machine_learning_proba
1995,business_software,TradeCard Inc was an American software company...,38134506,0.978811,0.021189
1996,business_software,Airbrite is an e commerce software company tha...,41053071,0.913394,0.086606
1997,business_software,A Mobile Content Management system MCMs is a t...,22464607,0.994547,0.005453
1998,business_software,GX WebManager Community Edition was a propriet...,17422853,0.991563,0.008437
1999,business_software,Business Process Modeling Language BPML is an ...,5938839,0.9073,0.0927


In [78]:
con, cur = db.connect_to_db()

In [79]:
get_test_docs = """
with tb1 as( SELECT DISTINCT A.PAGEID, A.PAGE_TEXT, C.CATEGORY_NAME FROM PAGE_DATA A, CATEGORY_DATA B, CATEGORY_INFO C
WHERE A.PAGEID = B.PAGEID AND B.MAINCATEGORYID = C.CATEGORYID AND C.CATEGORY_NAME = 'machine_learning' limit 1),
tb2 as 
(SELECT DISTINCT A.PAGEID, A.PAGE_TEXT, C.CATEGORY_NAME FROM PAGE_DATA A, CATEGORY_DATA B, CATEGORY_INFO C
WHERE A.PAGEID = B.PAGEID AND B.MAINCATEGORYID = C.CATEGORYID AND C.CATEGORY_NAME = 'business_software' limit 500)
select * from tb1 union all select * from tb2;
"""

In [80]:
cur.execute("SELECT PAGEID FROM PAGE_DATA ;")
pageid = cur.fetchall()
for i in pageid:
    print(i)

{'pageid': 43385931}
{'pageid': 49082762}
{'pageid': 233488}
{'pageid': 53587467}
{'pageid': 53198248}
{'pageid': 3771060}
{'pageid': 43808044}
{'pageid': 28801798}
{'pageid': 45049676}
{'pageid': 52642349}
{'pageid': 30511763}
{'pageid': 50773876}
{'pageid': 20890511}
{'pageid': 49242352}
{'pageid': 19463198}
{'pageid': 14003441}
{'pageid': 31877832}
{'pageid': 5210054}
{'pageid': 9732182}
{'pageid': 35867897}
{'pageid': 40973765}
{'pageid': 50211107}
{'pageid': 40678189}
{'pageid': 205393}
{'pageid': 50646178}
{'pageid': 52404701}
{'pageid': 31103500}
{'pageid': 1191936}
{'pageid': 44439173}
{'pageid': 53631046}
{'pageid': 39182554}
{'pageid': 8964665}
{'pageid': 17114678}
{'pageid': 22795783}
{'pageid': 3056879}
{'pageid': 4231161}
{'pageid': 2252278}
{'pageid': 28650287}
{'pageid': 2934910}
{'pageid': 9583985}
{'pageid': 387537}
{'pageid': 3118600}
{'pageid': 6968451}
{'pageid': 4118276}
{'pageid': 847558}
{'pageid': 54550729}
{'pageid': 28255458}
{'pageid': 34042707}
{'pageid': 57

{'pageid': 28475136}
{'pageid': 35685954}
{'pageid': 27292230}
{'pageid': 2938370}
{'pageid': 39269581}
{'pageid': 37196}
{'pageid': 3105999}
{'pageid': 39834}
{'pageid': 14659441}
{'pageid': 3404894}
{'pageid': 9649605}
{'pageid': 59861}
{'pageid': 1900609}
{'pageid': 1305037}
{'pageid': 45418224}
{'pageid': 25091359}
{'pageid': 28168154}
{'pageid': 14474114}
{'pageid': 16722927}
{'pageid': 18134289}
{'pageid': 2940730}
{'pageid': 163180}
{'pageid': 5700418}
{'pageid': 394392}
{'pageid': 46096}
{'pageid': 39177819}
{'pageid': 50073184}
{'pageid': 404084}
{'pageid': 47805}
{'pageid': 1037763}
{'pageid': 41270069}
{'pageid': 5211212}
{'pageid': 28502793}
{'pageid': 44133735}
{'pageid': 12715119}
{'pageid': 8648665}
{'pageid': 22847264}
{'pageid': 35959361}
{'pageid': 19657756}
{'pageid': 53113973}
{'pageid': 34026570}
{'pageid': 38722262}
{'pageid': 522230}
{'pageid': 11971726}
{'pageid': 53108275}
{'pageid': 41672405}
{'pageid': 49680032}
{'pageid': 25957629}
{'pageid': 32797209}
{'pag

{'pageid': 40831889}
{'pageid': 39503317}
{'pageid': 41858787}
{'pageid': 27619386}
{'pageid': 33691237}
{'pageid': 41228603}
{'pageid': 3253181}
{'pageid': 41779553}
{'pageid': 347756}
{'pageid': 50592835}
{'pageid': 50525634}
{'pageid': 1969357}
{'pageid': 1168160}
{'pageid': 384969}
{'pageid': 1665246}
{'pageid': 49932957}
{'pageid': 29675473}
{'pageid': 53197668}
{'pageid': 1429233}
{'pageid': 916858}
{'pageid': 439738}
{'pageid': 40049726}
{'pageid': 5547932}
{'pageid': 5637934}
{'pageid': 40940600}
{'pageid': 54190562}
{'pageid': 34422226}
{'pageid': 6257947}
{'pageid': 45239147}
{'pageid': 44614066}
{'pageid': 3638408}
{'pageid': 52912495}
{'pageid': 21526443}
{'pageid': 22361750}
{'pageid': 25386905}
{'pageid': 1029000}
{'pageid': 54731246}
{'pageid': 36493799}
{'pageid': 33731299}
{'pageid': 23328550}
{'pageid': 1493846}
{'pageid': 53714460}
{'pageid': 4882039}
{'pageid': 10026612}
{'pageid': 900118}
{'pageid': 52279612}
{'pageid': 12337441}
{'pageid': 20593805}
{'pageid': 429

{'pageid': 49946142}
{'pageid': 6911863}
{'pageid': 40868671}
{'pageid': 34634786}
{'pageid': 23734372}
{'pageid': 42003835}
{'pageid': 37782281}
{'pageid': 26596513}
{'pageid': 13378749}
{'pageid': 22307466}
{'pageid': 37551396}
{'pageid': 16706051}
{'pageid': 373790}
{'pageid': 29156200}
{'pageid': 32644119}
{'pageid': 43364188}
{'pageid': 29012707}
{'pageid': 23943805}
{'pageid': 7043619}
{'pageid': 30810114}
{'pageid': 52499445}
{'pageid': 43458776}
{'pageid': 11958900}
{'pageid': 27920400}
{'pageid': 43771984}
{'pageid': 47147050}
{'pageid': 3660182}
{'pageid': 16391524}
{'pageid': 29871283}
{'pageid': 25026795}
{'pageid': 16391127}
{'pageid': 12610483}
{'pageid': 31128982}
{'pageid': 37797484}
{'pageid': 28391921}
{'pageid': 16391201}
{'pageid': 16390592}
{'pageid': 3565161}
{'pageid': 744498}
{'pageid': 42968292}
{'pageid': 16390515}
{'pageid': 6052012}
{'pageid': 30297116}
{'pageid': 23805110}
{'pageid': 3161187}
{'pageid': 46191240}
{'pageid': 21401689}
{'pageid': 10062721}
{'

In [83]:
query = """
SELECT DISTINCT A.PAGEID, A.PAGE_TEXT, C.CATEGORY_NAME FROM PAGE_DATA A, CATEGORY_DATA B, CATEGORY_INFO C
WHERE A.PAGEID = B.PAGEID AND B.MAINCATEGORYID = C.CATEGORYID AND  A.PAGEID in (3139122, 37103476);
"""
test_df = db.query_to_dataframe(query)
test_df

Unnamed: 0,category_name,page_text,pageid
0,business_software,JetForm was the name of a Canadian software ma...,3139122
1,machine_learning,Causal inference is the process of drawing a c...,37103476


Unnamed: 0,category_name,page_text,pageid
0,machine_learning,Binary or binomial classification is the task ...,205393
1,machine_learning,The Bradley Terry model is a probability model...,44439173
2,machine_learning,Bing Predicts is a prediction engine developed...,50646178


In [84]:
X_pred = tfidf_vec.transform(test_df['page_text'])

In [85]:
pred_df = pd.DataFrame(X_pred.toarray(), columns=tfidf_vec.get_feature_names())


In [86]:
predicted = model.predict_proba(pred_df)
test_df[model.classes_[0] + '_proba']=predicted[:,0]
test_df[model.classes_[1] + '_proba']=predicted[:,1]

In [87]:
test_df

Unnamed: 0,category_name,page_text,pageid,business_software_proba,machine_learning_proba
0,business_software,JetForm was the name of a Canadian software ma...,3139122,0.975452,0.024548
1,machine_learning,Causal inference is the process of drawing a c...,37103476,0.004254,0.995746


In [1]:
cd '/home/jovyan/pkl'

/home/jovyan/pkl


In [2]:

joblib.dump(tfidf_vec, 'tfidf_vec.pkl') 
joblib.dump(model, 'prediction_model.pkl')
#tfidf_vec = joblib.load('tfidf_vec.pkl')
#prediction_model = joblib.load('prediction_model.pkl')

SyntaxError: invalid syntax (<ipython-input-2-7550e074508a>, line 1)