# Project 4


In [3]:
###########
# Imports #
###########

In [4]:
!pip install psycopg2



In [5]:
import requests
import pandas as pd
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
import re
import numpy as np

In [6]:
#######################################
# Regular Expression Cleaner Function #
#######################################

In [7]:
def cleaner(message):
    message = re.sub('\.+', ' ', message)
    message = re.sub('[^a-z0-9 ]','', message.lower())
    message = re.sub('\d+','NUMBER ',message)
    message = re.sub('\s+',' ',message)
    return message

In [8]:
########################
# Part 1 -- Collection #
########################

In [9]:
# The baseurl below is passed to the requests.get() method.
baseurl = 'http://en.wikipedia.org/w/api.php'

# All of the parameters for the .get() are put into a dictionary.
my_atts = {}
my_atts['action'] = 'query'  # action=query
my_atts['prop'] = 'info'     # prop=info
my_atts['format'] = 'json'   # format=json
my_atts['list'] = 'categorymembers'
my_atts['cmtitle'] = 'Category:Machine_learning'
my_atts['cmlimit'] = 'max'

# Here is teh response object
resp = requests.get(baseurl, params = my_atts)

# Putting the json into a variable
data = resp.json()

In [10]:
# Testing the .get()
resp.url

'https://en.wikipedia.org/w/api.php?action=query&prop=info&cmtitle=Category%3AMachine_learning&format=json&cmlimit=max&list=categorymembers'

In [12]:
# Stepping down into the dict to access the list of categories which contains the pageid, which will be used in the 
# next api call

data['query']['categorymembers']


In [19]:
# Reassigning the list back to data.
data = data['query']['categorymembers']


In [23]:
# Turning the dict into a df
pid_df = pd.DataFrame(data, columns=['pageid', 'title'])

In [24]:
# There are a bunch a sub-category pages listed, so this mask will remove them.
mask = pid_df['title'].str.contains("Category")
pid_df = pid_df[~mask]


In [26]:
# df with subcategories removed
pid_df.head(10)

Unnamed: 0,pageid,title
0,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...
1,43385931,Data exploration
2,49082762,List of datasets for machine learning research
3,233488,Machine learning
4,53587467,Outline of machine learning
5,3771060,Accuracy paradox
6,43808044,Action model learning
7,28801798,Active learning (machine learning)
8,45049676,Adversarial machine learning
9,52642349,AIVA


In [27]:
# Creates a list of page ids that will be used in the second call to collect the page content.
pageid = []

for ids in pid_df['pageid']:
    pageid.append(ids)

In [13]:
# Testing the list

pageid

In [13]:
# For loop will loop through the list of pageids and extract the content and put it into a dictionary.

baseurl = 'http://en.wikipedia.org/w/api.php'
count = 0

base_dict = {}
for i in pageid: 
    
    extract_atts = {}
    extract_atts['action'] = 'query'  # action=query
    extract_atts['prop'] = 'extracts'     # prop=info
    extract_atts['format'] = 'json'   # format=json
    extract_atts['explaintext'] = 'True'
    extract_atts['pageids'] = '{}'.format(i)

    extract_resp = requests.get(baseurl, params = extract_atts)
    
    json = extract_resp.json()
    base_dict[i] = json['query']['pages'][str(i)]
    count += 1
    
    if count % 4 == 0:
            print ('{:.1f}%'.format(count/int(len(pageid)) * 100),end=' - ')
    
    
#print(len(extract_resp.url))

2.0% - 4.1% - 6.1% - 8.1% - 10.2% - 12.2% - 14.2% - 16.2% - 18.3% - 20.3% - 22.3% - 24.4% - 26.4% - 28.4% - 30.5% - 32.5% - 34.5% - 36.5% - 38.6% - 40.6% - 42.6% - 44.7% - 46.7% - 48.7% - 50.8% - 52.8% - 54.8% - 56.9% - 58.9% - 60.9% - 62.9% - 65.0% - 67.0% - 69.0% - 71.1% - 73.1% - 75.1% - 77.2% - 79.2% - 81.2% - 83.2% - 85.3% - 87.3% - 89.3% - 91.4% - 93.4% - 95.4% - 97.5% - 99.5% - 

In [14]:
#Test to see if it capture all the content. 197 was the number of pages listed wikipedia, so it worked :)
count

197

In [15]:
# Another test
base_dict[43385931]['title']

'Data exploration'

In [16]:
# This loop pulls the relevant information from the json
extracts = []
titles = []
keys = []

for key_id, item_dict in base_dict.items():
    keys.append(key_id)
    extracts.append(item_dict['extract'])
    titles.append(item_dict['title'])
   

In [17]:
# Turn the lists into a df
new_df = pd.DataFrame({
    'title': titles,
    'extract': extracts,
    'id': keys
})

In [14]:
#df test

new_df

In [19]:
# Run the cleaner on the df
new_df['clean_extract'] = new_df['extract'].apply(cleaner)

In [20]:
clean_ml_df = new_df.drop('extract', axis=1)

In [21]:
clean_ml_df['category'] = 'machine learning'

In [15]:
clean_ml_df

In [23]:
# The above process is repeated for the for the business categories

baseurl = 'http://en.wikipedia.org/w/api.php'

my_atts = {}
my_atts['action'] = 'query'  # action=query
my_atts['prop'] = 'info'     # prop=info
my_atts['format'] = 'json'   # format=json
my_atts['list'] = 'categorymembers'
my_atts['cmtitle'] = 'Category:Business_software'
my_atts['cmlimit'] = 'max'


bus_resp = requests.get(baseurl, params = my_atts)
bus_data = bus_resp.json()

In [16]:
bus_data['query']['categorymembers']

In [25]:
bus_data = bus_data['query']['categorymembers']

In [26]:
bus_pid_df = pd.DataFrame(bus_data, columns=['pageid', 'title'])

In [27]:
biz_mask = bus_pid_df['title'].str.contains("Category")
bus_pid_df = bus_pid_df[~biz_mask]

In [28]:
bus_pid_df

Unnamed: 0,pageid,title
0,1037763,Business software
1,41270069,AccuSystems
2,5211212,Active policy management
3,28502793,Alexandria (library software)
4,44133735,Alteryx
5,12715119,Amadeus CRS
6,24061342,AMS Device Manager
7,54594603,Angelfish software
8,1762176,Applicant tracking system
9,22847264,Application retirement


In [29]:
biz_pageid = []

for ids in bus_pid_df['pageid']:
    biz_pageid.append(ids)

In [17]:
biz_pageid

In [31]:
baseurl = 'http://en.wikipedia.org/w/api.php'
count = 0

biz_dict = {}
for i in biz_pageid: 
    
    biz_atts = {}
    biz_atts['action'] = 'query'  # action=query
    biz_atts['prop'] = 'extracts'     # prop=info
    biz_atts['format'] = 'json'   # format=json
    biz_atts['explaintext'] = 'True'
    biz_atts['pageids'] = '{}'.format(i)

    biz_resp = requests.get(baseurl, params = biz_atts)
    
    json = biz_resp.json()
    biz_dict[i] = json['query']['pages'][str(i)]
    #print(base_dict[i])
    count += 1
    
    if count % 4 == 0:
            print ('{:.1f}%'.format(count/int(len(biz_pageid)) * 100),end=' - ')
            

1.3% - 2.7% - 4.0% - 5.4% - 6.7% - 8.1% - 9.4% - 10.7% - 12.1% - 13.4% - 14.8% - 16.1% - 17.4% - 18.8% - 20.1% - 21.5% - 22.8% - 24.2% - 25.5% - 26.8% - 28.2% - 29.5% - 30.9% - 32.2% - 33.6% - 34.9% - 36.2% - 37.6% - 38.9% - 40.3% - 41.6% - 43.0% - 44.3% - 45.6% - 47.0% - 48.3% - 49.7% - 51.0% - 52.3% - 53.7% - 55.0% - 56.4% - 57.7% - 59.1% - 60.4% - 61.7% - 63.1% - 64.4% - 65.8% - 67.1% - 68.5% - 69.8% - 71.1% - 72.5% - 73.8% - 75.2% - 76.5% - 77.9% - 79.2% - 80.5% - 81.9% - 83.2% - 84.6% - 85.9% - 87.2% - 88.6% - 89.9% - 91.3% - 92.6% - 94.0% - 95.3% - 96.6% - 98.0% - 99.3% - 

In [18]:
biz_dict

In [33]:
biz_extracts = []
biz_titles = []
biz_keys = []
for key_id, item_dict in biz_dict.items():
    biz_keys.append(key_id)
    biz_extracts.append(item_dict['extract'])
    biz_titles.append(item_dict['title'])
    #text = extracts.append(item_dict['query']['pages'][str(key_id)]['extract'])
    #base_dict[key_id]['text'] = text

In [34]:
biz_df = pd.DataFrame({
    'title': biz_titles,
    'extract': biz_extracts,
    'id': biz_keys
})

In [19]:
biz_df

In [36]:
biz_df['clean_extract'] = biz_df['extract'].apply(cleaner)

In [37]:
clean_biz_df = biz_df.drop('extract', axis=1)

In [20]:
clean_biz_df

In [39]:
clean_biz_df['category'] = 'business software'

In [125]:
clean_biz_df.tail()

Unnamed: 0,id,title,clean_extract,category
293,24310774,Enterprise forms automation,enterprise forms automation is a companywide c...,business software
294,34208511,Employee relationship management,an employee relationship management erm system...,business software
295,4996092,Laserfiche,laserfiche is a privately owned software devel...,business software
296,22350998,SpeedTax,speedtax is a sales tax automation saas produc...,business software
297,52529493,OpenProcurement,openprocurement is an open source procurement ...,business software


In [41]:
# This is where I combined the Machine Learning and Business Solutions dataframes
cl_ml_biz = clean_ml_df.append(clean_biz_df)

In [123]:
cl_ml_biz.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            288, 289, 290, 291, 292, 293, 294, 295, 296, 297],
           dtype='int64', length=495)

In [21]:
cursor = connection.cursor(cursor_factory=RealDictCursor)

In [47]:
# connection
from sqlalchemy import create_engine
engine = create_engine("postgresql://postgres@postgres/postgres")

In [22]:
# inserted the dataframes into the SQL database
cl_ml_biz.to_sql("test", con=engine, if_exists="replace", index=False)

In [23]:
# # Tested the insert
sql = """
SELECT * FROM test
"""

pd.read_sql(sql, con=engine)

In [None]:
####################
# Part 2 -- Search #
####################

In [None]:
###########
# Imports #
###########

In [50]:
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [71]:
## Instantiate the vectorizer and set the stop words to 'english'
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

## Fit and transform the articles from the dataframe above and assign it to X
document_term_matrix_sps = tfidf_vectorizer.fit_transform(cl_ml_biz['clean_extract'])

In [72]:
## Sparse matrix output
document_term_matrix_sps

<495x24726 sparse matrix of type '<class 'numpy.float64'>'
	with 118837 stored elements in Compressed Sparse Row format>

In [84]:
## Convert the sparse matrix to a dense matrix, in order to look at the data.
#
#         (1): Convert the sparse matrix into a dataframe 
#            
#              (1.1): data = use the 'toarray()' method on the sparse matrix to fill the null values with zeros
#              (1.2): index = use the 'index' attribute with the df to set the index of the resulting df
#              (1.3): columns = use 'get_feature_names()' method on the tfidf vectorizer to set the columns
#                     of the resulting df.

document_term_matrix_df = pd.DataFrame(document_term_matrix_sps.toarray(),
                                       index=cl_ml_biz.index,
                                       columns=tfidf_vectorizer.get_feature_names())

In [96]:
## df with the titles of each article concatenate

pd.concat([cl_ml_biz.title, document_term_matrix_df], axis=1).sample(4)

Unnamed: 0,title,aaai,aaainumber,aabar,aabright,aabsubset,aachen,aai,aaip,aais,...,zurich,zvfsizleq,zvfszivfsizileq,zvfszmathbb,zvfszvfsizleq,zvfz,zwanziger,zxrightarrow,zxtimes,zxy
10,Machine learning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
164,HP Cloud Service Automation Software,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152,Uncertain data,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,Conditional random field,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
## Assign the sparce matrix to X, b/c document_term_matrix_sps is a lot to look at
X = document_term_matrix_sps

In [99]:
## Time for dimensionality reduction
n_components = 500

In [100]:
## Instantiate TruncatedSVD and use the number of n_components defined above
SVD = TruncatedSVD(n_components)

In [101]:
## Fit and transform X on SVD
SVD_matrix = SVD.fit_transform(X)

In [104]:
## Search term must be put into a list before testing
search_term = ['The business data is a data from data data data machine learning']

In [106]:
search_term_vec = tfidf_vectorizer.transform(search_term)
search_term_lsa = SVD.transform(search_term_vec)

In [107]:
cosine_similarities = SVD_matrix.dot(search_term_lsa.T).ravel()

In [112]:
cosine_similarities.argsort()[:-6:-1]

array([ 32, 339,  37, 126,  10])

In [121]:
print(cl_ml_biz.loc[126]['clean_extract'][:500])

126     customer intelligence management customer int...
126    digital nervous system is a phrase popularly a...
Name: clean_extract, dtype: object


In [126]:
cl_ml_biz.index = cl_ml_biz['category']

In [130]:
cl_ml_biz.index

Index(['machine learning', 'machine learning', 'machine learning',
       'machine learning', 'machine learning', 'machine learning',
       'machine learning', 'machine learning', 'machine learning',
       'machine learning',
       ...
       'business software', 'business software', 'business software',
       'business software', 'business software', 'business software',
       'business software', 'business software', 'business software',
       'business software'],
      dtype='object', name='category', length=495)

In [131]:
X.shape

(495, 24726)

In [132]:
cl_ml_biz.shape

(495, 4)

In [31]:
##############################
# Part 3 -- Predictive Model # 
##############################

In [None]:
###########
# Imports #
###########

from sklearn.naive_bayes import MultinomialNB

In [133]:
mnb = MultinomialNB()
model = mnb.fit(X, cl_ml_biz.index)

In [135]:
predicted = model.predict_proba(X)

In [138]:
model.classes_

array(['business software', 'machine learning'], 
      dtype='<U17')

In [24]:
predicted

In [146]:
cl_ml_biz['proba_biz'] = predicted[:,0]
cl_ml_biz['proba_ml'] = predicted[:,1]
cl_ml_biz.drop('proba', axis=1, inplace=True)

In [25]:
cl_ml_biz

In [363]:
# This function, when given a url will return the proba of it either being a Machine Learning or
# a Business Solutions article

def cat_proba(url):

    url_list = url.split('/')
    url_list[-1]

#########################################################
    
    baseurl = 'http://en.wikipedia.org/w/api.php'

    func_atts = {}
    func_atts['action'] = 'query'  # action=query
    func_atts['prop'] = 'extracts' # prop=info
    func_atts['format'] = 'json'   # format=json
    func_atts['titles'] = '{}'.format(url_list[-1])
    func_atts['explaintext'] = 'True'

#########################################################

    resp = requests.get(baseurl, params = func_atts)
    data = resp.json()

    page_id = list(data['query']['pages'].keys())[0]
    data_dict = data['query']['pages'][page_id]['extract']
    
    matrix = tfidf_vectorizer.transform([cleaner(data_dict)])
    
    predict_result = model.predict_proba(matrix)[0]
    
    
    predict_dict = {}

    
    predict_result = list(predict_result) 
    model_result = list(model.classes_)
    
    for a,b in zip(model_result,predict_result):
        predict_dict[a] = round(b *100,1)
    
    return predict_dict


In [364]:
# This is a test - IT WORKED!

cat_proba('https://en.wikipedia.org/wiki/Machine_learning')

{'business software': 0.10000000000000001,
 'machine learning': 99.900000000000006}