In [1]:
import json
import os
from glob import glob
from pprint import pprint
import requests
import urllib.parse as urlparse

# Enumerate JSON Files

In [16]:
PATH = "/Users/allen/Projects/wikiextractor/extracted"
def files():
    return sorted(glob(PATH + "/*/*"))


print("Total Files: {}".format(len(files())))
files()[:5]    

Total Files: 12697


['/Users/allen/Projects/wikiextractor/extracted/AA/wiki_00',
 '/Users/allen/Projects/wikiextractor/extracted/AA/wiki_01',
 '/Users/allen/Projects/wikiextractor/extracted/AA/wiki_02',
 '/Users/allen/Projects/wikiextractor/extracted/AA/wiki_03',
 '/Users/allen/Projects/wikiextractor/extracted/AA/wiki_04']

In [3]:
def doc_iterator():
    for path in files()[:2]:
        with open(path, "r") as f:
            print(path)
            for line in f:
                item = json.loads(line)
                yield item
#             for item in json.loads(f.read()):
#                 yield item
            

In [4]:
stuff = list(get_objects())
len(stuff)

/Users/allen/Projects/wikiextractor/extracted/AA/wiki_00
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_01


82

In [21]:
stuff[33]

{'id': '615',
 'url': 'https://en.wikipedia.org/wiki?curid=615',
 'title': 'American Football Conference',
 'text': 'American Football Conference\n\nThe American Football Conference (AFC) is one of the two conferences of the National Football League (NFL), the highest professional level of American football in the United States. This conference and its counterpart, the National Football Conference (NFC), currently contain 16 teams each, making up the 32 teams of the NFL. Both conferences were created as part of the 1970 merger with the rival American Football League (AFL), with all ten of the former AFL teams and three NFL teams forming the AFC, and the remaining thirteen NFL clubs forming the NFC. A series of league expansions and division realignments have occurred since the merger, thus making the current total 16 clubs per each conference.\n\nCurrent teams.\nSince 2002, like the NFC, the AFC has 16 teams, organized into four divisions each with four teams: East, North, South and We

# Get Section Title Only 
Based on length of paragraph.

In [10]:
# split text into list of paragraphs based on line break
text = [ii for ii in stuff[0]["text"].strip().split("\n") if ii.strip()]


# find section title 
MIN_PARA_LEN = 10
section_titles = [ii for ii in text if len(ii.split()) <= MIN_PARA_LEN]
section_titles

['Anarchism',
 'Etymology and terminology.',
 'History.',
 'Origins.',
 'First International and the Paris Commune.',
 'Organised labour.',
 'Propaganda of the deed and illegalism.',
 'Russian Revolution and other uprisings of the 1910s.',
 'Conflicts with European fascist regimes.',
 'Spanish Revolution.',
 'Post-war years.',
 'Contemporary anarchism.',
 'Anarchist schools of thought.',
 'Mutualism.',
 'Social anarchism.',
 'Collectivist anarchism.',
 'Anarcho-communism.',
 'Anarcho-syndicalism.',
 'Individualist anarchism.',
 'Post-classical anarchist schools of thought.',
 'Internal issues and debates.',
 'Topics of interest.',
 'Free love.',
 'Libertarian education and freethought.',
 'Criticisms.']

# Create iter for paragraphs in Text

In [48]:
def wikidata_iterator(path, output="text"):

    def doc_iterator(path):
        files = sorted(glob(path + "/*/*"))

        for path in files[:2]:
            with open(path, "r") as f:
                print(path)
                for line in f:
                    item = json.loads(line)
                    yield item

    def para_iterator(doc):
        MIN_LENGTH = 300
        text = [ii for ii in doc.strip().split("\n") if ii.strip()]

        too_small = ""
        for para in text:
            if len(para) < MIN_LENGTH:
                too_small = too_small + " " + para
                continue

            yield (too_small + " " + para).strip()
            too_small = ""

    def fetch_title(url):
        api_url = "https://en.wikipedia.org/w/api.php?action=query&prop=info&pageids={}&inprop=url&format=json"

        parsed = urlparse.urlparse(url)
        curid = urlparse.parse_qs(parsed.query)["curid"][0]

        response = requests.get(api_url.format(curid))
        data = response.json()
        canonicalurl = data["query"]["pages"][curid]["canonicalurl"]
        return canonicalurl.split("/")[-1]

    for doc in doc_iterator(path):
        try:
            ans = fetch_title(doc["url"])
        except Exception as e:
            print(e)
            continue

        for para in para_iterator(doc["text"]):
            if output == "text": 
                yield para
                
            if output == "ans":
                yield ans
            
            if output == "both":
                yield ans, para
    



/Users/allen/Projects/wikiextractor/extracted/AA/wiki_00
Anarchism Anarchism is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical or free associations. Anarchism holds the state to be undesirable, unnecessary and harmful.
While opposition to the state is central, anarchism specifically entails opposing authority or hierarchical organisation in the conduct of all human relations. Anarchism is usually considered a far-left ideology and much of anarchist economics and anarchist legal philosophy reflects anti-authoritarian interpretations of communism, collectivism, syndicalism, mutualism or participatory economics.
Anarchism does not offer a fixed body of doctrine from a single particular world view, instead fluxing and flowing as a philosophy. Many types and traditions of anarchism exist, not 

# Let's try something fancy

In [17]:
import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
class WikidataIterator(object):
    def __init__(self, path):
        self.path = path
        self.i_to_ans = {}

    def _doc_iterator(self, path):
        files = sorted(glob(path + "/*/*"))

        for path in files[:10]:
            with open(path, "r") as f:
                print(path)
                for line in f:
                    item = json.loads(line)
                    yield item

    def _para_iterator(self, doc):
        MIN_LENGTH = 300
        text = [ii for ii in doc.strip().split("\n") if ii.strip()]

        too_small = ""
        for para in text:
            if len(para) < MIN_LENGTH:
                too_small = too_small + " " + para
                continue

            yield (too_small + " " + para).strip()
            too_small = ""

    def _fetch_title(self, url):
        api_url = "https://en.wikipedia.org/w/api.php?action=query&prop=info&pageids={}&inprop=url&format=json"

        parsed = urlparse.urlparse(url)
        curid = urlparse.parse_qs(parsed.query)["curid"][0]

        response = requests.get(api_url.format(curid))
        data = response.json()
        canonicalurl = data["query"]["pages"][curid]["canonicalurl"]
        return canonicalurl.split("/")[-1]
        
        
    @property
    def docs(self):
        counter = 0
        for doc in self._doc_iterator(self.path):
            try:
                ans = self._fetch_title(doc["url"])
            except Exception as e:
                print(e)
                continue

            for para in self._para_iterator(doc["text"]):
                self.i_to_ans[counter] = ans
                counter += 1
                yield para

    
    

In [33]:
class TFIDF():
    
    def init(self):
        self.i_to_ans = None
    
    def train(self, path, ngram_range=(1, 1), min_df=1, max_df=.95):
        wikidata = WikidataIterator(path)
        
        vectorizer_kwargs = {
            'ngram_range': ngram_range,
            'min_df': min_df,
            'max_df': max_df
        }
        start = time.time()
        self.tfidf_vectorizer = TfidfVectorizer(**vectorizer_kwargs).fit(wikidata.docs)
        elapsed = int(time.time() - start)
        print("INFO: fit completed in {} seconds".format(elapsed))
        
        start = time.time()
        self.tfidf_matrix = self.tfidf_vectorizer.transform(wikidata.docs)
        elapsed = int(time.time() - start)
        print("INFO: transform completed in {} seconds".format(elapsed))
        
        self.i_to_ans = wikidata.i_to_ans

    def guess(self, questions, max_n_guesses=2):
        representations = self.tfidf_vectorizer.transform(questions)
        guess_matrix = self.tfidf_matrix.dot(representations.T).T
        guess_indices = (-guess_matrix).toarray().argsort(axis=1)[:, 0:max_n_guesses]
        guesses = []
        for i in range(len(questions)):
            idxs = guess_indices[i]
            guesses.append([(self.i_to_ans[j], guess_matrix[i, j]) for j in idxs])

        return guesses


In [34]:
model = TFIDF()
model.train(PATH)

/Users/allen/Projects/wikiextractor/extracted/AA/wiki_00
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_01
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_02
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_03
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_04
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_05
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_06
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_07
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_08
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_09
INFO: fit completed in 95 seconds
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_00
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_01
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_02
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_03
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_04
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_05
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_

In [21]:
len(files()) * 8 / 60 / 60

28.215555555555557

In [29]:
num_files = len(files())
print("Files: {}".format(num_files))

Files: 12697


In [24]:
len(model.tfidf_vectorizer.vocabulary_.keys())

26529

In [30]:
len(model.i_to_ans.keys())

2808

In [31]:
wikidata = WikidataIterator(PATH)
docs = [1 for _ in wikidata.docs]
len(docs), len(wikidata.i_to_ans.keys())

/Users/allen/Projects/wikiextractor/extracted/AA/wiki_00
/Users/allen/Projects/wikiextractor/extracted/AA/wiki_01


(2808, 2808)

In [35]:
12697 / 60 / 60

3.5269444444444447