In [1]:
!pip install wikipedia --quiet

In [2]:
import re
import requests
import pandas as pd
import wikipedia

In [3]:
def generate_category(category):
    '''
    format a category for insertion in to a wikipedia api call
    '''
    category = re.sub('\s','+',category)
    return category

In [4]:

generate_category('machine learning')

'machine+learning'

In [5]:
def generate_query(category):
    '''
    Format an api call for requests
    '''
    query = '''https://en.wikipedia.org/w/api.php?
            action=query&
            format=json&
            list=categorymembers&
            cmtitle=Category:{}&
            cmlimit=max
            '''.format(generate_category(category))
    query = re.sub('\s','',query)
    return query

In [6]:
generate_query('machine learning')

'https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:machine+learning&cmlimit=max'

In [7]:
def execute_category_query(category):
    '''
    Executes a category query and returns a DataFrame of the category members
    '''
    
    r = requests.get(generate_query(category))
    response = r.json()
    return pd.DataFrame(response['query']['categorymembers'])

In [8]:
test = execute_category_query('machine learning')

In [9]:
test

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model
5,0,3771060,Accuracy paradox
6,0,43808044,Action model learning
7,0,28801798,Active learning (machine learning)
8,0,45049676,Adversarial machine learning
9,0,52642349,AIVA


In [10]:
category_mask = test['title'].str.contains('Category:')

In [11]:
test[~category_mask]

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model
5,0,3771060,Accuracy paradox
6,0,43808044,Action model learning
7,0,28801798,Active learning (machine learning)
8,0,45049676,Adversarial machine learning
9,0,52642349,AIVA


In [12]:
test[category_mask]

Unnamed: 0,ns,pageid,title
198,14,33547387,Category:Applied machine learning
199,14,42936114,Category:Artificial neural networks
200,14,1718975,Category:Bayesian networks
201,14,1991254,Category:Classification algorithms
202,14,22532673,Category:Cluster analysis
203,14,34310097,Category:Computational learning theory
204,14,12932492,Category:Artificial intelligence conferences
205,14,33542714,Category:Data mining and machine learning soft...
206,14,42320378,Category:Datasets in machine learning
207,14,29549713,Category:Dimension reduction


In [13]:
subcat_df_list = []

In [14]:
def remove_category(category):
    category = re.sub('Category:','',category)
    return category

In [15]:
categories_to_query = test[category_mask]['title'].apply(remove_category).tolist()

In [16]:
categories_to_query

['Applied machine learning',
 'Artificial neural networks',
 'Bayesian networks',
 'Classification algorithms',
 'Cluster analysis',
 'Computational learning theory',
 'Artificial intelligence conferences',
 'Data mining and machine learning software',
 'Datasets in machine learning',
 'Dimension reduction',
 'Ensemble learning',
 'Evolutionary algorithms',
 'Genetic programming',
 'Inductive logic programming',
 'Kernel methods for machine learning',
 'Latent variable models',
 'Learning in computer vision',
 'Log-linear models',
 'Loss functions',
 'Machine learning algorithms',
 'Machine learning portal',
 'Machine learning task',
 'Markov models',
 'Machine learning researchers',
 'Semisupervised learning',
 'Statistical natural language processing',
 'Structured prediction',
 'Supervised learning',
 'Support vector machines',
 'Unsupervised learning']

In [17]:
for category in categories_to_query:
    subcat_df_list.append(execute_category_query(category))

In [18]:
subcat_df_list[1]

Unnamed: 0,ns,pageid,title
0,0,21523,Artificial neural network
1,0,28016652,Types of artificial neural networks
2,0,14179835,Activation function
3,0,8220913,ADALINE
4,0,31663887,Adaptive neuro fuzzy inference system
5,0,3056879,Adaptive resonance theory
6,0,4231161,ALOPEX
7,0,16167377,Artificial Intelligence System
8,0,349771,Artificial neuron
9,0,51404222,Artisto


In [19]:
def get_all_pages_rec(category):
    category_df = execute_category_query(category)
    pages_list = []
    category_mask = category_df['title'].str.contains('Category:')
    pages_df = category_df[~category_mask]
    pages_list.append(pages_df)
    categories = category_df[category_mask]['title']\
                            .str.replace('Category:','').tolist()
    if len(categories) > 0:
        for cat in categories:
            pages_list.append(get_all_pages_rec(cat))
    
    pages_df = pd.concat(pages_list)
    pages_df.reset_index()
    return pages_df

In [20]:
rec_test = get_all_pages_rec('machine learning')

In [21]:
rec_test.reset_index(drop=True).shape

(1606, 3)

In [22]:

rec_test = rec_test.drop_duplicates().reset_index(drop=True)

In [23]:
rec_test.head()

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model


In [24]:
def get_whole_category(category):
    df = get_all_pages_rec(category)
    df = df.drop_duplicates().reset_index(drop=True)
    df['category'] = category
    return df

In [25]:
gwc_test = get_whole_category('machine learning')

In [26]:
gwc_test.head()

Unnamed: 0,ns,pageid,title,category
0,0,43385931,Data exploration,machine learning
1,0,49082762,List of datasets for machine learning research,machine learning
2,0,233488,Machine learning,machine learning
3,0,53587467,Outline of machine learning,machine learning
4,0,53198248,Singular statistical model,machine learning


_______________________________

## Now for Business software

In [27]:
btest = execute_category_query('business software')

In [28]:
btest.sample(30)

Unnamed: 0,ns,pageid,title
87,0,52993539,Enterprise coexistence
88,0,24310774,Enterprise forms automation
312,14,23785395,Category:Human resource management software
154,0,12185719,KXEN Inc.
101,0,54135637,EZOfficeinventory
4,0,44133735,Alteryx
1,0,41270069,AccuSystems
71,0,17106978,DocPoint
300,14,14541812,Category:Business software companies
284,0,51567252,WeWorked


In [29]:
btest.iloc[303].title

'Category:Business software for Windows'

In [30]:
def get_all_pages_rec(category):
    category_df = execute_category_query(category)
    pages_list = []
    category_mask = category_df['title'].str.contains('Category:')
    pages_df = category_df[~category_mask]
    pages_list.append(pages_df)
    categories = category_df[category_mask]['title']\
                            .str.replace('Category:','').tolist()        
    count = 0
    if len(categories) > 0:
        count += 1
        print('entering level: {}'.format(count))
        for cat in categories:
            pages_list.append(get_all_pages_rec(cat))
    
    pages_df = pd.concat(pages_list)
    pages_df.reset_index()
    return pages_df

In [31]:
get_all_pages_rec('business software')

entering level: 1
entering level: 1


KeyError: 'title'

In [None]:
bcategory_df = execute_category_query('business software')
bpages_list = []
bcategory_mask = bcategory_df['title'].str.contains('Category:')
bpages_df = bcategory_df[~bcategory_mask]
bpages_list.append(bpages_df)
bcategories = bcategory_df[bcategory_mask]['title'].str.replace('Category:','').tolist()

In [None]:
bcategories

In [None]:
category_df

In [None]:
get_all_pages_rec(bcategories[0])

In [None]:
brec_test = get_all_pages_rec('business software')