In [1]:
# Use first query to capture relevant data on Machine Learning and Business software

In [2]:
# Grab all first and second layer articles for both

In [3]:
# Separate the sub-categories from the non-category pages on the first pass

Define helper functions for connecting to the database

In [5]:
!pip install psycopg2

Collecting psycopg2
  Using cached psycopg2-2.7.3.1-cp36-cp36m-manylinux1_x86_64.whl
Installing collected packages: psycopg2
Successfully installed psycopg2-2.7.3.1


In [6]:
import psycopg2 as pg2
from pandas import DataFrame
from psycopg2.extras import RealDictCursor
import pandas as pd
def connect_to_db():
    '''
    Open a new connection to the 'postgres' database on host 'postgres'.

    Returns
    -------
    tuple
        (connection, cursor). The cursor is created with RealDictCursor as
        its cursor factory. Nothing here closes the connection, so the
        caller has to do it.
    '''
    connection = pg2.connect(host='postgres', dbname='postgres', user='postgres')
    return connection, connection.cursor(cursor_factory=RealDictCursor)
    
def query_to_dictionary(query, fetch_res=True):
    '''
    Run a SQL statement on a fresh connection and optionally return its rows.

    Parameters
    ----------
    query : str
        SQL text handed unchanged to ``cursor.execute``.
    fetch_res : bool, default True
        When True the result of ``cursor.fetchall()`` is returned; when
        False nothing is fetched and None is returned (use this for
        statements that produce no result set).

    Returns
    -------
    The value of ``cursor.fetchall()``, or None when ``fetch_res`` is False.

    Notes
    -----
    This function never calls ``commit``; a statement that modifies data has
    to carry its own transaction control in the SQL text.
    '''
    con, cur = connect_to_db()
    try:
        cur.execute(query)
        return cur.fetchall() if fetch_res else None
    finally:
        # Close the connection even when execute/fetchall raises; otherwise
        # every failed query would leave a connection open.
        con.close()

def query_to_dataframe(query):
    '''
    Run `query` through query_to_dictionary and wrap the fetched rows in a
    pandas DataFrame.
    '''
    rows = query_to_dictionary(query)
    return DataFrame(rows)

First, grab all the articles in the "Machine learning" and "Business software" categories, including those one level down in their sub-categories.

In [7]:
!pip install wikipedia --quiet
import re
import requests
import pandas as pd
import wikipedia

In [8]:
def generate_category(category):
    '''
    Format a category (or page) title for insertion into a Wikipedia API URL.

    Each whitespace character becomes '+', exactly as before. In addition,
    characters that have a special meaning inside a query string (for
    example '&', '+', '#', '?') are percent-encoded, so a title such as
    'Q&A software' or 'C++' no longer corrupts the request URL.

    Parameters
    ----------
    category : str
        The raw title.

    Returns
    -------
    str
        The URL-safe form of the title.
    '''
    from urllib.parse import quote_plus

    # Turn every whitespace character into a plain space first: quote_plus
    # maps ' ' to '+', which keeps the original one-'+'-per-whitespace-char
    # behaviour, while a literal '+' in the title becomes '%2B'.
    return quote_plus(re.sub(r'\s', ' ', category))

In [9]:
def generate_query(category):
    '''
    Build the Wikipedia API URL that lists the members of a category.

    The category name is passed through generate_category and placed after
    'Category:' in the ``cmtitle`` parameter. The returned URL contains no
    whitespace.
    '''
    parameters = (
        'action=query',
        'format=json',
        'list=categorymembers',
        'cmtitle=Category:{}'.format(generate_category(category)),
        'cmlimit=max',
    )
    url = 'https://en.wikipedia.org/w/api.php?' + '&'.join(parameters)
    # Remove any whitespace that is still present in the assembled URL.
    return re.sub(r'\s', '', url)

In [10]:
generate_query('Machine learning')

'https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:Machine+learning&cmlimit=max'

In [11]:
def execute_category_query(category, timeout=30):
    '''
    Fetch the members of a Wikipedia category and return them as a DataFrame.

    The API returns category members in batches. Whenever a response
    carries a ``continue`` object, its contents are sent back as extra
    request parameters to obtain the next batch, so large categories are
    no longer silently truncated to the first batch.

    Parameters
    ----------
    category : str
        Category name, passed to generate_query.
    timeout : float, default 30
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per category member, built from the ``categorymembers``
        records of all batches. When the category has no members the
        frame is empty but still has 'pageid', 'ns' and 'title' columns.

    Raises
    ------
    requests.HTTPError
        If a request returns an HTTP error status.
    '''
    url = generate_query(category)
    members = []
    continuation = {}

    while True:
        r = requests.get(url, params=continuation, timeout=timeout)
        r.raise_for_status()
        response = r.json()
        members.extend(response['query']['categorymembers'])

        # No 'continue' object means this was the last batch.
        if 'continue' not in response:
            break
        continuation = response['continue']

    if not members:
        # Keep the expected columns so callers can still index ['title'].
        return pd.DataFrame(columns=['pageid', 'ns', 'title'])
    return pd.DataFrame(members)

In [12]:
execute_category_query('Machine learning')

Unnamed: 0,ns,pageid,title
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...
1,0,43385931,Data exploration
2,0,49082762,List of datasets for machine learning research
3,0,233488,Machine learning
4,0,53587467,Outline of machine learning
5,0,3771060,Accuracy paradox
6,0,43808044,Action model learning
7,0,28801798,Active learning (machine learning)
8,0,45049676,Adversarial machine learning
9,0,52642349,AIVA


In [13]:
def update_table(table_name, col_values):
    '''
    Insert one row into `table_name` inside an explicit BEGIN/COMMIT block.

    `col_values` is turned into the VALUES list by format_data_for_query
    (defined outside this cell), and the statement is executed through
    query_to_dictionary without fetching any result.

    NOTE(review): the table name and the formatted values are interpolated
    straight into the SQL text, so this must only be called with trusted
    input.
    '''
    statement = (
        '\n'
        '        BEGIN;\n'
        '        INSERT INTO {} VALUES ({});\n'
        '        COMMIT;\n'
        '        '
    ).format(table_name, format_data_for_query(col_values))
    query_to_dictionary(statement, fetch_res=False)


In [11]:
# def write_to_dbpage(title,text):
#     con, cur = connect_to_db()
#     query = """
#     BEGIN; 
#     INSERT INTO page (title, text) VALUES ('{}','{}'); 
#     COMMIT;
#     """.format(title,text)
#     cur.execute(query)

In [35]:
def put_data_to_dbpage(category):
    '''
    Store every non-category member of `category` via write_to_dbpage.

    Members whose title contains 'Category:' are skipped. For each
    remaining title the cleaned article text (get_clean_text) and the
    cleaned title (cleaner) are passed to write_to_dbpage.

    NOTE(review): no active definition of write_to_dbpage is visible in
    this part of the notebook (only a commented-out draft) -- confirm it
    is defined before this is run.
    '''
    members = execute_category_query(category)
    is_subcategory = members['title'].str.contains('Category:')

    for page_title in members.loc[~is_subcategory, 'title'].tolist():
        page_text = get_clean_text(page_title)
        write_to_dbpage(cleaner(page_title), page_text)
    

In [45]:
put_data_to_dbpage('Machine learning')

In [46]:
query = 'SELECT * FROM page'
query_to_dataframe(query)

Unnamed: 0,page_id,text,title
0,1,this is not a wikipedia article it is an indi...,user custintelmngt sandbox customer intelligen...
1,2,this article has multiple issues please help ...,data exploration
2,3,machine learning and data mining problems cla...,list of datasets for machine learning research
3,4,for the journal see machine learning journal m...,machine learning
4,5,the following outline is provided as an overvi...,outline of machine learning
5,6,the accuracy paradox for predictive analytics ...,accuracy paradox
6,7,machine learning and data mining problems cla...,action model learning
7,8,this article is about a machine learning metho...,active learning machine learning
8,9,adversarial machine learning is a research fie...,adversarial machine learning
9,10,aiva nationality luxembourgish style classica...,aiva


In [None]:
# Show the current contents of each table.
# query_to_dataframe opens and closes its own connection, so no separate
# connect_to_db() call is needed (the previous bare call left a connection
# open). Each frame is passed to display() explicitly because a cell only
# renders its last expression; display is the function the IPython kernel
# makes available in notebooks.
for table_name in ('page', 'category', 'page_category'):
    query = 'SELECT * FROM {}'.format(table_name)
    display(query_to_dataframe(query))

In [20]:
def get_all_pages_first_level(category):
    '''
    Collect the article listings of a category and of its direct
    sub-categories.

    Returns a list of DataFrames: the first holds the members of `category`
    whose title does not contain 'Category:'; each following frame holds
    the same kind of rows for one sub-category, in the order the
    sub-categories appear in the parent listing. Sub-categories of
    sub-categories are not followed.
    '''
    def articles_only(members):
        # Keep the rows whose title does not mention 'Category:'.
        return members[~members['title'].str.contains('Category:')]

    top_level = execute_category_query(category)

    # Names of the direct sub-categories, with the 'Category:' marker removed
    # so they can be fed back into execute_category_query.
    subcategory_rows = top_level[top_level['title'].str.contains('Category:')]
    subcategory_names = subcategory_rows['title']\
                            .str.replace('Category:', '').tolist()

    pages_list = [articles_only(top_level)]
    for subcategory in subcategory_names:
        pages_list.append(articles_only(execute_category_query(subcategory)))

    return pages_list
    

In [21]:
machine_learning_list = get_all_pages_first_level('Machine learning')

In [22]:
len(machine_learning_list)

31

In [23]:
business_soft_list = get_all_pages_first_level('Business software')

In [24]:
len(business_soft_list)

33

_____________

In [36]:
from bs4 import BeautifulSoup

In [37]:
def generate_parse(category):
    '''
    Build the Wikipedia API URL (action=parse) used to fetch one page.

    `category` is passed through generate_category and inserted as the
    ``page`` parameter, so despite its name it is the title of the page to
    parse. The returned URL contains no whitespace.
    '''
    parameters = [
        'action=parse',
        'format=json',
        'page={}'.format(generate_category(category)),
    ]
    url = 'https://en.wikipedia.org/w/api.php?' + '&'.join(parameters)
    # Remove any whitespace that is still present in the assembled URL.
    return re.sub(r'\s', '', url)

In [38]:
def execute_category_parse(category, timeout=30):
    '''
    Request the parsed form of a page and return the decoded JSON response.

    Parameters
    ----------
    category : str
        Title of the page to parse; it is passed to generate_parse to build
        the request URL.
    timeout : float, default 30
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    The decoded JSON body of the response (not a DataFrame).

    Raises
    ------
    requests.HTTPError
        If the request returns an HTTP error status.
    '''
    r = requests.get(generate_parse(category), timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    r.raise_for_status()
    return r.json()

In [39]:
def json_to_html(category):
    '''
    Return the HTML of a page, or '' when the API reports an error.

    The page is requested once through execute_category_parse. If the
    response has a truthy 'error' entry an empty string is returned;
    otherwise the HTML found under response['parse']['text']['*'] is
    returned.
    '''
    # Fetch once and reuse the response: requesting the page separately for
    # the error check and for the payload doubles the HTTP traffic and lets
    # the two answers disagree.
    response = execute_category_parse(category)
    if response.get('error'):
        return ''
    return response['parse']['text']['*']

In [40]:
def cleaner(message):
    '''
    Normalise a piece of text into lower-case word tokens.

    Steps, in order: runs of '.' become a space, apostrophes are removed,
    the text is lower-cased, every character other than a-z, 0-9 and space
    becomes a space, each run of digits becomes the token ' NUMBER ', and
    finally runs of whitespace collapse to a single space. Leading or
    trailing spaces are not stripped.
    '''
    text = re.sub(r"'+", '', re.sub(r'\.+', ' ', message)).lower()
    substitutions = (
        (r'[^a-z0-9 ]', ' '),
        (r'\d+', ' NUMBER '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [41]:
def make_beautiful_html(category):
    '''
    Fetch a page's HTML with json_to_html and return its plain text, as
    extracted by BeautifulSoup with the 'html.parser' parser.
    '''
    return BeautifulSoup(json_to_html(category), 'html.parser').get_text()

In [42]:
def get_clean_text(category):
    '''
    Return the text of a page after it has been extracted by
    make_beautiful_html and normalised by cleaner.
    '''
    return cleaner(make_beautiful_html(category))

____________________

Grab all the data for each category using the functions defined above.

In [None]:
len(machine_learning_list)

Combine the data into a single DataFrame

In [None]:
# Label each per-category frame *before* concatenating. Assigning
# df_tot['Category'] after every concat overwrote the whole column, so all
# rows (including the Machine Learning ones) ended up tagged
# 'Business Software'.
labelled_frames = []
for label, frames in (('Machine Learning', machine_learning_list),
                      ('Business Software', business_soft_list)):
    for df in frames:
        labelled_frames.append(df.assign(Category=label))

# Concatenate once instead of growing the frame inside the loop; fall back
# to an empty frame when there is nothing to combine.
df_tot = pd.concat(labelled_frames, axis=0) if labelled_frames else pd.DataFrame()

In [None]:
df_tot

In [None]:
# Fetch and clean the article text for every title; get_clean_text is
# called once per row.
df_tot['text'] = df_tot['title'].apply(get_clean_text)