In [1]:
import wikipediaapi

In [2]:
wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)


## Challenge categories

1. Economy
2. Regulation
3. Environment
4. Health related issue
5. Industry
6. Cultures and customs


In [4]:
# Manual research for Wikipedia categories
p_wiki = wiki_wiki.page('travels')
p_wiki.exists()

True

In [5]:
print(p_wiki.summary)
p_wiki.categories

Travel is the movement of people between distant geographical locations. Travel can be done by foot, bicycle, automobile, train, boat, bus, airplane, ship or other means, with or without luggage, and can be one way or round trip. Travel can also include relatively short stays between successive movements.


{'Category:Articles with Curlie links': Category:Articles with Curlie links (id: ??, ns: 14),
 'Category:Tourism': Category:Tourism (id: ??, ns: 14),
 'Category:Tourist activities': Category:Tourist activities (id: ??, ns: 14),
 'Category:Transport culture': Category:Transport culture (id: ??, ns: 14),
 'Category:Travel': Category:Travel (id: ??, ns: 14),
 'Category:Webarchive template wayback links': Category:Webarchive template wayback links (id: ??, ns: 14),
 'Category:Wikipedia articles with GND identifiers': Category:Wikipedia articles with GND identifiers (id: ??, ns: 14),
 'Category:Wikipedia articles with HDS identifiers': Category:Wikipedia articles with HDS identifiers (id: ??, ns: 14),
 'Category:Wikipedia articles with LCCN identifiers': Category:Wikipedia articles with LCCN identifiers (id: ??, ns: 14),
 'Category:Wikipedia articles with NARA identifiers': Category:Wikipedia articles with NARA identifiers (id: ??, ns: 14),
 'Category:Wikipedia articles with NDL identifiers

In [107]:
categories = ['Economy','Regulation','Environment','Health','Industry','Cultures']
wiki_scan = [["Category:Economy",'Category:Stock market','Category:Trade','Category:Monetary economics'],
             ['Category:Regulation','Category:Public policy','Category:Economics of regulation'],
             ['Category:Environmental science','Category:Ecology','Category:Environmentalism'],
             ['Category:Health','Category:Physical fitness','Category:Medicine'],
             ['Category:Industry','Category:Manufacturing','Category:Energy','Category:Technology'],
             ['Category:Culture','Category:Social concepts','Category:Tourism','Category:Tourist activities']]

In [108]:
def add_categorymembers(categorymembers, level=0, max_level=1,members=[]):
    for c in categorymembers.values():
        #print("%s: %s (ns: %d)" % ("*" * (level + 1), c.title, c.ns))
        if c.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
            #print("down cat - %s: %s (ns: %d)" % ("*" * (level + 1), c.title, c.ns))
            members = add_categorymembers(c.categorymembers, level=level + 1, max_level=max_level,members=members)
            #print("Down:",len(members))
        else:
            #print("append - %s: %s (ns: %d)" % ("*" * (level + 1), c.title, c.ns))
            #if len(c.text)>0:
            members.append(c.title)
            #else:
            #    print("ZERO - %s: %s (ns: %d)" % ("*" * (level + 1), c.title, c.ns))
            
            #print("A:",len(members))
    return members

def list_categorymembers(categories, level=0, max_level=1):
    l = []
    for cat_name in categories:
        cat = wiki_wiki.page(cat_name)
        l_cat = (add_categorymembers(cat.categorymembers, level=0, max_level=1,members=[]))
        print(f'{cat_name} : {len(l_cat)} elements')
        l.extend(l_cat)
    return l
                

In [109]:
wiki_pages = dict()
for c,l in zip(categories,wiki_scan):
    print(c,l)
    wiki_pages[c] = list_categorymembers(l)
    print(f'Pages to scan : {len(wiki_pages[c])}')

Economy ['Category:Economy', 'Category:Stock market', 'Category:Trade', 'Category:Monetary economics']
Category:Economy : 703 elements
Category:Stock market : 884 elements
Category:Trade : 804 elements
Category:Monetary economics : 575 elements
Pages to scan : 2966
Regulation ['Category:Regulation', 'Category:Public policy', 'Category:Economics of regulation']
Category:Regulation : 456 elements
Category:Public policy : 1076 elements
Category:Economics of regulation : 284 elements
Pages to scan : 1816
Environment ['Category:Environmental science', 'Category:Ecology', 'Category:Environmentalism']
Category:Environmental science : 1377 elements
Category:Ecology : 1822 elements
Category:Environmentalism : 603 elements
Pages to scan : 3802
Health ['Category:Health', 'Category:Physical fitness', 'Category:Medicine']
Category:Health : 2870 elements
Category:Physical fitness : 9 elements
Category:Medicine : 1492 elements
Pages to scan : 4371
Industry ['Category:Industry', 'Category:Manufacturin

In [110]:
sum([len(wiki_pages[c]) for c in wiki_pages])
    

25601

In [112]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
tf.__version__

'2.0.0-beta1'

In [115]:
from progress import progress

In [155]:
max_words = 20000
max_i     = 2000         # number of files scanned per category

In [156]:
t  = Tokenizer(num_words=max_words,oov_token='xxxunk')

for c in wiki_pages.keys():
    texts = []
    print(f'Scanning {max_i} articles out of {len(wiki_pages[c])} on {c}')
    for i,p in enumerate(wiki_pages[c]):
        progress(i, min(max_i,len(wiki_pages[c])), status=f'{i}')
        p_wiki = wiki_wiki.page(p)
        texts.append(p_wiki.text)
        if i>=max_i:
            break
    print(f'\nFitting {len(texts)} texts on {c}')
    t.fit_on_texts(texts)
    X_train.extend(t.texts_to_sequences(texts))
    y_train.extend([categories.index(c)]*len(X))


Scanning 2000 articles out of 2966 on Economy
Fitting 2001 texts on Economy
Scanning 2000 articles out of 1816 on Regulation
Fitting 1816 texts on Regulation
Scanning 2000 articles out of 3802 on Environment
Fitting 2001 texts on Environment
Scanning 2000 articles out of 4371 on Health
Fitting 2001 texts on Health
Scanning 2000 articles out of 6603 on Industry
Fitting 2001 texts on Industry
Scanning 2000 articles out of 6043 on Cultures
Fitting 2001 texts on Cultures


In [157]:
dataset = { 'max_i'     : max_i,
            'max_words' : max_words,
            'word_index': t.word_index,
            'X'         : X_train,
            'y'         : y_train }

In [158]:
from pathlib import Path
path = Path('dataset/wiki')

In [159]:
filename = f'dss_wiki_{max_i:05}_{max_words//1000}K.json'
filename

'dss_wiki_02000_20K.json'

In [160]:
import json
with open(path/filename, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)