# Getting the data

In [86]:
import requests
import json

First find the article ID for the Harry Potter page.

In [96]:
# Ask the wiki's API for its "top articles" list, then pick the entry titled 'Harry Potter'.
r = requests.get('http://harrypotter.wikia.com/api/v1/Articles/Top')
r.raise_for_status()  # surface HTTP errors here instead of as a confusing JSON/KeyError below
hp_matches = [it['id'] for it in r.json()['items'] if it['title'] == 'Harry Potter']
if not hp_matches:
    # Same exception type the bare `[0]` lookup would raise, but with a message that says why.
    raise IndexError("no article titled 'Harry Potter' in the Articles/Top response")
hp_id = hp_matches[0]

Fetch the content of the page as JSON and save it to a file.

In [97]:
# Fetch the simplified-JSON rendering of the article identified by hp_id.
r = requests.get('http://harrypotter.wikia.com/api/v1/Articles/AsSimpleJson', params={'id': hp_id})
r.raise_for_status()  # don't save an error page as if it were the article
# Dump the parsed document, not the raw body: json.dump(r.content, ...) would write the
# whole response as a single quoted, escaped JSON string. The with-block closes the file.
with open('HarryPotterWikia.json', 'w') as json_file:
    json.dump(r.json(), json_file)

## Pulling out the text

In [100]:
# Parse the body of the AsSimpleJson response (still held in `r`) into Python objects;
# the next cell walks its 'sections' list.
cont = json.loads(r.content)

In [123]:
# Flatten the article into plain text: one line per section title, followed by
# one line per paragraph of that section. Non-paragraph content is skipped.
with open('HarryPotterWikia.txt', 'w') as txt_out:
    for sec in cont['sections']:
        txt_out.write(sec['title'].encode('utf8') + '\n')
        for block in sec['content']:
            if block['type'] != 'paragraph':
                continue
            txt_out.write(block['text'].encode('utf8') + '\n')

# Playing with NLP on the text

In [98]:
import textacy

In [155]:
# Read the plain-text dump back in and decode it from UTF-8; the with-block makes
# sure the file handle is closed instead of being left to the garbage collector.
with open('HarryPotterWikia.txt') as txt_in:
    raw_text = txt_in.read().decode('utf8')
# Replace non-ASCII characters with ASCII approximations before parsing.
text = textacy.preprocess.transliterate_unicode(raw_text)
# NOTE(review): the extra .decode presumably exists because transliterate_unicode
# returns a byte string under Python 2 -- confirm before porting to Python 3.
doc = textacy.Doc(text.decode('utf8'), lang=u'en')

In [206]:
# Pull out statements about the entity 'Harry' whose cue verb is a form of 'be';
# each result prints as an (entity, cue, fragment) triple in the next cell.
statements = textacy.extract.semistructured_statements(doc, 'Harry', cue='be')

In [207]:
# Preview only the first 20 statements; stop as soon as the limit is reached
# rather than looping over every remaining statement just to discard it.
for i, s in enumerate(statements):
    if i >= 20:
        break
    print(s)

(Harry, being, known as the "Boy Who Lived)
(Harry, was, already famous before he arrived at Hogwarts School of Witchcraft and Wizardry.
)
(Harry, was, under increasing pressure to show that he was not just a famous name)
(Harry, was, once again able to control his broom)
(Harry, was, to be replaced amusing, and so reverted to teasing Harry about having to stay at Hogwarts for the holidays)
(Harry, was, scared to hear Filch's voice approaching, and horrified when he realised who Filch was talking to: Snape)
(Harry, was, ready to stay there all night, staring at the family he lost)
(Harry, was, worried that it would be impossible with Snape as a referee)
(Harry, was, in possession of a dragon)
(Harry, was, Slytherin's heir)
(Harry, was, worried of the trouble he'll probably go through in order to persuade his aunt or uncle to sign it)
(Harry, were, furious with Hermione and they stopped speaking to her)
(Harry, was, sceptical until Black and Lupin forced Pettigrew back into his human fo

In [208]:
# Extract subject-verb-object triples from the whole parsed document;
# the next cell prints the first few.
sovs = textacy.extract.subject_verb_object_triples(doc)

In [209]:
# Preview only the first 20 triples; stop as soon as the limit is reached
# rather than looping over every remaining triple just to discard it.
for i, sov in enumerate(sovs):
    if i >= 20:
        break
    print(sov)

(b., was, blood wizard)
(what, proved, to be)
(Lord Voldemort, tried, to murder)
(he, was, year)
(Voldemort, murdered, parents)
(they, tried, to protect)
(son, had, Muggle aunt)
(son, had, to be raised)
(charm, would protect, him)
(he, was, wizard)
(He, began, attending)
(Harry, became, friends)
(Harry, won, Tournament)
(he, lost, godfather)
(Harry, played, role)
(Battle, saw, deaths)
(He, encountered, Voldemort)
(doing, was, way)
(Voldemort, cursed, him)
(Albus Dumbledore, gave, Harry advice)


In [210]:
# Rank key terms of the document; left as the cell's last expression so the
# resulting (term, score) pairs are displayed as the cell output.
textacy.keyterms.key_terms_from_semantic_network(doc)

[(u'harry', 0.0469206846336103),
 (u'hermione', 0.010608326944389412),
 (u'ron', 0.009826473350165532),
 (u'voldemort', 0.008733421363855971),
 (u'dumbledore', 0.008457090221454067),
 (u'hagrid', 0.005796885020150041),
 (u'hogwarts', 0.005646176666194813),
 (u'school', 0.0052157088355882075),
 (u'snape', 0.004358693575276469),
 (u'time', 0.004329328087827489)]