In [1]:
import wikipediaapi

In [2]:
# Shared Wikipedia client used by every lookup below: English edition,
# extracts requested in the library's ExtractFormat.WIKI format.
wiki_client_options = {
    'language': 'en',
    'extract_format': wikipediaapi.ExtractFormat.WIKI,
}
wiki_wiki = wikipediaapi.Wikipedia(**wiki_client_options)


## Challenge categories

1. Economy
2. Regulation
3. Environment
4. Health-related issues
5. Industry
6. Cultures and customs


In [4]:
# Manual research for Wikipedia categories: look up one seed page ('travels')
# and check that it exists before inspecting the categories it belongs to.
p_wiki = wiki_wiki.page('travels')
p_wiki.exists()

True

In [5]:
# Print the page summary, then display the mapping of categories the page
# belongs to (the cell's last expression is rendered as its output).
print(p_wiki.summary)
p_wiki.categories

Travel is the movement of people between distant geographical locations. Travel can be done by foot, bicycle, automobile, train, boat, bus, airplane, ship or other means, with or without luggage, and can be one way or round trip. Travel can also include relatively short stays between successive movements.


{'Category:Articles with Curlie links': Category:Articles with Curlie links (id: ??, ns: 14),
 'Category:Tourism': Category:Tourism (id: ??, ns: 14),
 'Category:Tourist activities': Category:Tourist activities (id: ??, ns: 14),
 'Category:Transport culture': Category:Transport culture (id: ??, ns: 14),
 'Category:Travel': Category:Travel (id: ??, ns: 14),
 'Category:Webarchive template wayback links': Category:Webarchive template wayback links (id: ??, ns: 14),
 'Category:Wikipedia articles with GND identifiers': Category:Wikipedia articles with GND identifiers (id: ??, ns: 14),
 'Category:Wikipedia articles with HDS identifiers': Category:Wikipedia articles with HDS identifiers (id: ??, ns: 14),
 'Category:Wikipedia articles with LCCN identifiers': Category:Wikipedia articles with LCCN identifiers (id: ??, ns: 14),
 'Category:Wikipedia articles with NARA identifiers': Category:Wikipedia articles with NARA identifiers (id: ??, ns: 14),
 'Category:Wikipedia articles with NDL identifiers

In [107]:
# Challenge label -> Wikipedia categories whose members are scanned for that label.
# Kept in one ordered mapping so labels and category lists cannot drift apart;
# the two parallel lists used by the rest of the notebook are derived from it.
scan_plan = {
    'Economy': [
        'Category:Economy',
        'Category:Stock market',
        'Category:Trade',
        'Category:Monetary economics',
    ],
    'Regulation': [
        'Category:Regulation',
        'Category:Public policy',
        'Category:Economics of regulation',
    ],
    'Environment': [
        'Category:Environmental science',
        'Category:Ecology',
        'Category:Environmentalism',
    ],
    'Health': [
        'Category:Health',
        'Category:Physical fitness',
        'Category:Medicine',
    ],
    'Industry': [
        'Category:Industry',
        'Category:Manufacturing',
        'Category:Energy',
        'Category:Technology',
    ],
    'Cultures': [
        'Category:Culture',
        'Category:Social concepts',
        'Category:Tourism',
        'Category:Tourist activities',
    ],
}
categories = list(scan_plan.keys())
wiki_scan = list(scan_plan.values())

In [108]:
def add_categorymembers(categorymembers, level=0, max_level=1, members=None):
    """Collect the titles of a category's members, descending into sub-categories.

    Args:
        categorymembers: mapping whose values expose ``ns``, ``title`` and
            ``categorymembers`` attributes (e.g. ``page.categorymembers``).
        level: depth of ``categorymembers`` in the category tree.
        max_level: a sub-category is expanded only while ``level < max_level``;
            deeper sub-categories contribute their own title instead.
        members: list that titles are appended to. A new list is created
            when omitted.

    Returns:
        The list of collected titles (the same object as ``members`` when
        one was supplied).
    """
    # Create the accumulator per call: a mutable ``[]`` default is shared by
    # every call that omits the argument and would silently keep the titles
    # gathered by earlier scans.
    if members is None:
        members = []
    for member in categorymembers.values():
        if member.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
            # Sub-category within the depth budget: collect its members into
            # the same accumulator.
            add_categorymembers(member.categorymembers, level=level + 1,
                                max_level=max_level, members=members)
        else:
            # Regular page, or a sub-category beyond max_level: keep its title.
            members.append(member.title)
    return members

def list_categorymembers(categories, level=0, max_level=1):
    """Return the member titles of several Wikipedia categories as one flat list.

    Args:
        categories: iterable of category page names, e.g. ``'Category:Trade'``.
        level: starting depth passed on to ``add_categorymembers``.
        max_level: maximum sub-category depth passed on to
            ``add_categorymembers``.

    Returns:
        A list with the titles of every category, concatenated in the order of
        ``categories``. No de-duplication is done, so a title reachable from
        several categories appears several times.
    """
    titles = []
    for cat_name in categories:
        cat = wiki_wiki.page(cat_name)
        # Forward the caller's depth settings (they used to be ignored in
        # favour of hard-coded 0 / 1) and hand over an explicit fresh list so
        # one category's titles never leak into the next one.
        cat_titles = add_categorymembers(cat.categorymembers, level=level,
                                         max_level=max_level, members=[])
        print(f'{cat_name} : {len(cat_titles)} elements')
        titles.extend(cat_titles)
    return titles
                

In [109]:
# For every challenge label, gather the page titles found under its Wikipedia categories.
wiki_pages = {}
for label, scan_categories in zip(categories, wiki_scan):
    print(label, scan_categories)
    wiki_pages[label] = list_categorymembers(scan_categories)
    print(f'Pages to scan : {len(wiki_pages[label])}')

Economy ['Category:Economy', 'Category:Stock market', 'Category:Trade', 'Category:Monetary economics']
Category:Economy : 703 elements
Category:Stock market : 884 elements
Category:Trade : 804 elements
Category:Monetary economics : 575 elements
Pages to scan : 2966
Regulation ['Category:Regulation', 'Category:Public policy', 'Category:Economics of regulation']
Category:Regulation : 456 elements
Category:Public policy : 1076 elements
Category:Economics of regulation : 284 elements
Pages to scan : 1816
Environment ['Category:Environmental science', 'Category:Ecology', 'Category:Environmentalism']
Category:Environmental science : 1377 elements
Category:Ecology : 1822 elements
Category:Environmentalism : 603 elements
Pages to scan : 3802
Health ['Category:Health', 'Category:Physical fitness', 'Category:Medicine']
Category:Health : 2870 elements
Category:Physical fitness : 9 elements
Category:Medicine : 1492 elements
Pages to scan : 4371
Industry ['Category:Industry', 'Category:Manufacturin

In [110]:
# Total number of page titles collected over all labels.
sum(len(titles) for titles in wiki_pages.values())
    

25601

In [112]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
tf.__version__

'2.0.0-beta1'

In [115]:
from progress import progress

In [467]:
max_words       = 20000     # max number of words in the vocab
min_text_words  = 20        # don't store into the corpus pages with fewer than this number of words
# Index of the last page scanned per category: the scan loop stops after the
# page at index max_i, i.e. max_i + 1 pages are read. Earlier trial runs used 10 and 5.
max_i           = 200


In [468]:
# Download article texts label by label and build two parallel lists:
# corpus_texts[k] is an article text, corpus_cats[k] the index of its label in `categories`.
corpus_texts  =  []
corpus_cats   =  []
for c in wiki_pages.keys():
    # The scan covers indices 0..max_i, capped by the number of titles available,
    # so the message below reports the number of pages actually read.
    pages_to_scan = wiki_pages[c][:max_i + 1]
    texts = []
    print(f'\nScanning {len(pages_to_scan)} articles out of {len(wiki_pages[c])} on {c}')
    for i,p in enumerate(pages_to_scan):
        progress(i, min(max_i,len(wiki_pages[c])), status=f'{i}')
        # Read the text once and reuse it; a loop-local expression also avoids
        # overwriting the `p_wiki` page kept from the exploration cells.
        text = wiki_wiki.page(p).text
        # Keep only pages with more than min_text_words whitespace-separated words.
        if len(text.split())>min_text_words:
            texts.append(text)
    print(f'\nAdding {len(texts)} texts into corpus')
    corpus_texts.extend(texts)
    corpus_cats.extend([categories.index(c)]*len(texts))



Scanning 201 articles out of 2966 on Economy
Adding 116 texts into corpus

Scanning 201 articles out of 1816 on Regulation
Adding 184 texts into corpus

Scanning 201 articles out of 3802 on Environment
Adding 201 texts into corpus

Scanning 201 articles out of 4371 on Health
Adding 64 texts into corpus

Scanning 201 articles out of 6603 on Industry
Adding 76 texts into corpus

Scanning 201 articles out of 6043 on Cultures
Adding 80 texts into corpus


In [469]:
# Fit a tokenizer on the whole corpus, then encode every text as a list of word ids.
# NOTE(review): the run recorded in this notebook reports a word_index of 47380 words
# although max_words is 20000, and decoded texts contain 'xxxunk' — presumably num_words
# only caps the ids emitted by texts_to_sequences, the remaining words being mapped to
# the oov_token; confirm against the Keras Tokenizer documentation.
t  = Tokenizer(num_words=max_words,oov_token='xxxunk')
t.fit_on_texts(corpus_texts)
X_corpus = t.texts_to_sequences(corpus_texts)


In [490]:
t

<keras_preprocessing.text.Tokenizer at 0x1856631a240>

In [470]:
# Bundle everything that gets saved with the dataset: the scan settings, the
# tokenizer vocabulary, the label names, and the encoded texts (X) with their
# label indices (y).
dss = dict(
    max_i=max_i,
    max_words=max_words,
    word_index=t.word_index,
    categories=categories,
    X=X_corpus,
    y=corpus_cats,
)

In [471]:
from pathlib import Path
path = Path('dataset/wiki')

In [472]:
# The dataset file name encodes the scan settings: max_i zero-padded to 5 digits
# and max_words expressed in thousands, e.g. 'dss_wiki_00200_20K1.json'.
filename = f'dss_wiki_{max_i:05}_{max_words//1000}K1.json'
filename

'dss_wiki_00200_20K1.json'

In [473]:
import json
# open() does not create missing directories, so make sure the target folder exists first.
path.mkdir(parents=True, exist_ok=True)
# Write the dataset as indented UTF-8 JSON; ensure_ascii=False stores non-ASCII
# characters as-is instead of \u escapes.
with open(path/filename, 'w', encoding='utf-8') as f:
    json.dump(dss, f, ensure_ascii=False, indent=4)

In [474]:
path/filename

WindowsPath('dataset/wiki/dss_wiki_00200_20K1.json')

In [492]:
# The tokenizer pickle follows the dataset naming scheme (max_i zero-padded,
# max_words in thousands) with a '_tokenizer.pkl' suffix.
tokenizer_filename = f'dss_wiki_{max_i:05}_{max_words//1000}K1_tokenizer.pkl'
tokenizer_filename

'dss_wiki_00200_20K1_tokenizer.pkl'

In [491]:
import pickle

In [495]:

# Persist the fitted tokenizer in the same folder as the dataset file.
# pickle.dump() returns None, so its result is not bound to a name.
# NOTE: only unpickle this file from a trusted source — pickle.load can execute arbitrary code.
with open(path/tokenizer_filename, 'wb') as f:
    pickle.dump(t,f)



In [475]:
# Sanity check of the assembled dataset: number of texts, number of distinct
# labels actually present, vocabulary size, then the label names.
print(f"Dataset of {len(dss['X'])} texts of {len(set(dss['y']))} classes - vocabulary of {len(dss['word_index'])} words")
print(dss['categories'])

Dataset of 721 texts of 6 classes - vocabulary of 47380 words
['Economy', 'Regulation', 'Environment', 'Health', 'Industry', 'Cultures']


In [476]:
# Number of entries in the tokenizer's word_index.
number_of_words = len(dss['word_index'])
# NOTE(review): max_len is not used in the visible cells — presumably a maximum
# sequence length for a later padding step; confirm or remove.
max_len = 100

In [477]:
import numpy as np

In [478]:
# The encoded texts have different lengths. Build an explicit object array:
# recent NumPy versions raise ValueError when asked to create a ragged array
# implicitly, while older ones fell back to dtype=object on their own.
X_arr = np.array(dss['X'], dtype=object)
y_arr = np.array(dss['y'])
# Reverse vocabulary lookup (word id -> word), used to decode sequences back to text.
id_to_word = {word_id: word for word, word_id in dss['word_index'].items()}

In [489]:
# textwrap is used below but no import of it is visible elsewhere in the notebook,
# so import it here to keep the cell runnable on a fresh kernel.
import textwrap

# Decode one randomly chosen text back to words as a visual check of the encoding.
i = np.random.randint(len(X_arr))

print(f'Text #{i} - {dss["categories"][y_arr[i]]}')
# Loop variable named `word_id` rather than `id`, which would shadow the builtin.
s = ' '.join(id_to_word[word_id] for word_id in X_arr[i])
print('\n'.join(textwrap.wrap(s, width=120, replace_whitespace=False)))
print()

Text #374 - Environment
in situ chemical reduction iscr is a new type of environmental remediation technique used for soil and or groundwater
remediation to reduce the concentrations of targeted environmental contaminants to acceptable levels it is the mirror
process of in situ chemical oxidation isco iscr is usually applied in the environment by xxxunk chemically reductive
additives in liquid form into the contaminated area or placing a solid medium of chemical reductants in the path of a
contaminant plume it can be used to remediate a variety of organic compounds including some that are resistant to
natural degradation the in situ in iscr is just latin for in place signifying that iscr is a chemical reduction reaction
that occurs at the site of the contamination like isco it is able to xxxunk many compounds and in theory iscr could be
more effective in ground water remediation than isco chemical reduction is one half of a redox reaction which results in
the gain of electrons one of t

In [485]:
import urllib.request
import json
import requests

In [486]:
# Published copies of the dataset, keyed by the max_i value used to build them.
# Every entry uses Google Drive's direct-download form (uc?export=download&id=<file id>).
# The 2000 entry used to be a '.../file/d/<file id>/view?usp=sharing' link, which serves
# an HTML viewer page instead of the file and makes json.load fail with
# "Expecting value: line 1 column 1 (char 0)".
# NOTE(review): for large files Drive may still interpose a confirmation page —
# verify that the 2000 link really returns JSON.
wiki_dataset_urls = {
    5:    'https://drive.google.com/uc?export=download&id=1iteaiSPd1OLJdKZitcv76LzrD_mAC8S8',
    200:  'https://drive.google.com/uc?export=download&id=15hhpN2EszdRx7-PN43yGFsoFPuZzwFO1',
    2000: 'https://drive.google.com/uc?export=download&id=1HZxMVthzG-tboMXO0qsPkeH2Cuq1YCbi',
}
url_wiki = wiki_dataset_urls[2000]


In [484]:

# Download and parse the published dataset. The context manager closes the HTTP
# response even when parsing fails. Note that this rebinds `dss` to the downloaded data.
with urllib.request.urlopen(url_wiki) as f:
    dss = json.load(f)
print(f"Dataset of {len(dss['X'])} texts of {len(set(dss['y']))} classes - vocabulary of {len(dss['word_index'])} words")
print(dss['categories'])
      
      



JSONDecodeError: Expecting value: line 1 column 1 (char 0)