In [1]:
# need the punkt package from nltk downloaded
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.tree import DecisionTreeClassifier
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import requests

## Tutorial

#### Requests

In [2]:
# example urls
url = 'http://www.google.com'
url_bad = 'http://www.google.com/asdfoiaseadco.com'

# Requests Package
page = requests.get(url)
print(page)
page = requests.get(url_bad)
print(page)

<Response [200]>
<Response [404]>


#### Beautiful Soup

In [3]:
# basic commands
url = 'http://mlp.wikia.com/wiki/Friendship_is_Magic_animated_media'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>
<html class="" dir="ltr" lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1.0, user-scalable=yes" name="viewport"/>
<meta content="MediaWiki 1.19.24" name="generator"/>
<meta content="My Little Pony Friendship is Magic Wiki,mlp,Friendship is Magic animated media,My Little Pony Equestria Girls (franchise),Equestria Girls animated media,List of episodes,My Little Pony Friendship is Magic,Friendship is Magic, part 1,Transcripts/Friendship is Magic, part 1,Friendship is Magic, part 1/Gallery,Friendship is Magic, part 2,Transcripts/Friendship is Magic, part 2,Friendship is Magic, part 2/Gallery" name="keywords"/>
<meta content="This is a sortable list of My Little Pony Friendship is Magic animated media – including..." name="description"/>
<meta content="summary" name="twitter:card"/>
<meta content="@getfandom" name="twitter:site"/>
<meta content="http://mlp.w

#### Searching the webpage

In [4]:
soup.find('table')

<table cellspacing="1px" class="table-dotted-rows sortable" style="width:100%; line-height:1.5em; text-align:center;">
<tr style="background-color:#EE3F96; color:white;">
<th style="width:4.5em;"> <span style="cursor:help; border-bottom:1px dotted" title="Episode number by season and (overall)">№</span>
</th><th style="width:20em; text-align:left; padding-left:2em;"> Title
</th><th style="width:12em;"> Writer
</th><th style="width:10em; font-size:70%;"> Original airdate
</th><th class="unsortable" style="width:7em; font-size:70%;"> Transcript
</th><th class="unsortable" style="width:6em; font-size:70%;"> Gallery
</th></tr>
<tr>
<td> 01 (01)
</td><td style="text-align:left; padding-left:1em;"> <a href="/wiki/Friendship_is_Magic,_part_1" title="Friendship is Magic, part 1">Friendship is Magic, part 1</a>
</td><td> <a href="/wiki/Lauren_Faust" title="Lauren Faust">Lauren Faust</a>
</td><td> 2010-10-10
</td><td style="font-size:70%;"> <a href="/wiki/Transcripts/Friendship_is_Magic,_part_1"

In [5]:
res_table = soup.find_all('table')
print(len(res_table))

7


In [6]:
soup.find(class_ = 'mw-headline')

<span class="mw-headline" id="Episodes">Episodes</span>

In [7]:
soup.find('div', class_ = 'skiplinkcontainer')

<div class="skiplinkcontainer">
<a class="skiplink" href="#WikiaArticle" rel="nofollow">Skip to Content</a>
<a class="skiplink wikinav" href="#WikiHeader" rel="nofollow">Skip to Wiki Navigation</a>
<a class="skiplink sitenav" href="#GlobalNavigation" rel="nofollow">Skip to Site Navigation</a>
</div>

## Example

In [8]:
# find the links to the transcripts
url = 'http://mlp.wikia.com/wiki/Friendship_is_Magic_animated_media'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

res = soup.find_all('td', style='font-size:70%;')
res = [r for r in res if r.text.strip() == 'Transcript']

In [9]:
res[0].find('a')
res[0].find('a').get('href')

'/wiki/Transcripts/Friendship_is_Magic,_part_1'

In [40]:
# loop through the transcripts
res_text = []
for r in res:
    url_new = r.find('a').get('href')
    url_new = 'http://mlp.wikia.com' + url_new
    page = requests.get(url_new)
    soup = BeautifulSoup(page.text, 'lxml')
    res_dd = soup.find_all('dd')
    
    for line in res_dd:
        try:
            character = line.find('b').text
            text = line.text[len(character) + 2:]

            res_text.append([character, text])
        except:
            pass
        
    print(url_new)

http://mlp.wikia.com/wiki/Transcripts/Friendship_is_Magic,_part_1
http://mlp.wikia.com/wiki/Transcripts/Friendship_is_Magic,_part_2
http://mlp.wikia.com/wiki/Transcripts/The_Ticket_Master
http://mlp.wikia.com/wiki/Transcripts/Applebuck_Season
http://mlp.wikia.com/wiki/Transcripts/Griffon_the_Brush_Off
http://mlp.wikia.com/wiki/Transcripts/Boast_Busters
http://mlp.wikia.com/wiki/Transcripts/Dragonshy
http://mlp.wikia.com/wiki/Transcripts/Look_Before_You_Sleep
http://mlp.wikia.com/wiki/Transcripts/Bridle_Gossip
http://mlp.wikia.com/wiki/Transcripts/Swarm_of_the_Century
http://mlp.wikia.com/wiki/Transcripts/Winter_Wrap_Up
http://mlp.wikia.com/wiki/Transcripts/Call_of_the_Cutie
http://mlp.wikia.com/wiki/Transcripts/Fall_Weather_Friends
http://mlp.wikia.com/wiki/Transcripts/Suited_For_Success
http://mlp.wikia.com/wiki/Transcripts/Feeling_Pinkie_Keen
http://mlp.wikia.com/wiki/Transcripts/Sonic_Rainboom
http://mlp.wikia.com/wiki/Transcripts/Stare_Master
http://mlp.wikia.com/wiki/Transcripts/T

In [41]:
characters = ['Twilight Sparkle', 'Rarity']
res_text = [l for l in res_text if l[0] in characters]
len(res_text)

6025

In [42]:
df_temp = pd.DataFrame(res_text, columns = ['character', 'text'])
df_temp.head()
# df_temp.to_csv('/home/matt/res.csv', index = False)

Unnamed: 0,character,text
0,Twilight Sparkle,...and harmony has been maintained in Equestri...
1,Twilight Sparkle,"Oh, sorry, girls... I've got a lot of studying..."
2,Twilight Sparkle,I know I've heard of the Elements of Harmony.\n
3,Twilight Sparkle,Spike! Spi-ike! Spike?\n
4,Twilight Sparkle,"There you are. Quick, find me that old copy of..."


## Making the data sturctured

In [43]:
# first need to tokenize the words the words
words_token = nltk.word_tokenize('The quick brown fox jumps over the lazy dog')
words_token

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [44]:
# stemmer only works on a word level
stemmer = PorterStemmer()
print(stemmer.stem('running'))

run


In [45]:
# back to the data
# tokenize each line
res_text = [[r[0], nltk.word_tokenize(r[1])] for r in res_text]
res_text[0]

['Twilight Sparkle',
 ['...',
  'and',
  'harmony',
  'has',
  'been',
  'maintained',
  'in',
  'Equestria',
  'for',
  'generations',
  'since',
  '.',
  'Hmm',
  '...',
  'Elements',
  'of',
  'Harmony',
  '.',
  'I',
  'know',
  'I',
  "'ve",
  'heard',
  'of',
  'those',
  'before',
  '...',
  'but',
  'where',
  '?']]

In [46]:
# stem each line
# saddly we need to combine all the words back to gether
for i in range(len(res_text)):
    res_text[i][1] = ' '.join([stemmer.stem(w) for w in res_text[i][1]])
res_text[0][1]

"... and harmoni ha been maintain in Equestria for gener sinc . Hmm ... Element of Harmoni . I know I 've heard of those befor ... but where ?"

In [47]:
# now that the data is tokenized and stemmed let's through it into a data frame
df = pd.DataFrame(res_text, columns = ['character', 'text'])
print(df.shape)
df.head()

(6025, 2)


Unnamed: 0,character,text
0,Twilight Sparkle,... and harmoni ha been maintain in Equestria ...
1,Twilight Sparkle,"Oh , sorri , girl ... I 've got a lot of studi..."
2,Twilight Sparkle,I know I 've heard of the Element of Harmoni .
3,Twilight Sparkle,Spike ! Spi-ik ! Spike ?
4,Twilight Sparkle,"There you are . Quick , find me that old copi ..."


In [48]:
# turning it data into a tfidf
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
df_tfidf = tfidf.fit_transform(df['text'])
df_tfidf = pd.DataFrame(df_tfidf.toarray())
df_tfidf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# adding the feature names
df_tfidf.columns = tfidf.get_feature_names()
df_tfidf.head()

Unnamed: 0,aah,abl,absolut,accent,accept,accord,acr,act,action,actual,...,yakyakistan,yawn,ye,yeah,year,yelp,yesterday,yourselv,zecora,zesti
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
df_temp = df_tfidf
df_temp['character_name'] = df['character']
df_temp = df_temp.ix[:, ['character_name'] + tfidf.get_feature_names()]
df_temp.head(20)

Unnamed: 0,character_name,aah,abl,absolut,accent,accept,accord,acr,act,action,...,yakyakistan,yawn,ye,yeah,year,yelp,yesterday,yourselv,zecora,zesti
0,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.171714,0.0,0.0,0.0,0.0,0.0
9,Twilight Sparkle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
### Doing some analysis

In [52]:
df_tfidf = df_tfidf.drop('character_name', 1)

In [53]:
# throwing the data into a classifier
clf = DecisionTreeClassifier()
clf.fit(df_tfidf, df['character'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [54]:
# looking at terms by importance
importances = clf.feature_importances_
features = tfidf.get_feature_names()
important_features = [(features[i], importances[i]) for i in range(len(importances))]
important_features

[('aah', 0.0011141911861400323),
 ('abl', 0.0),
 ('absolut', 0.0042026245579353545),
 ('accent', 0.0),
 ('accept', 0.0),
 ('accord', 0.00072319351394912947),
 ('acr', 0.0),
 ('act', 0.0),
 ('action', 0.0),
 ('actual', 0.0010867903801660293),
 ('add', 0.0),
 ('admit', 0.00068825964314367535),
 ('ador', 0.0005377087456739523),
 ('adventur', 0.0),
 ('afraid', 0.00072752637364669479),
 ('afternoon', 0.00040861166587748956),
 ('age', 0.0),
 ('ago', 0.0),
 ('agre', 0.00095801166342078569),
 ('ah', 0.0026730061118588767),
 ('aha', 0.0),
 ('ahead', 0.0),
 ('ahem', 0.0030460763260173754),
 ('ahh', 0.0016920888642720637),
 ('air', 0.0013581544319964509),
 ('alicorn', 9.8700019435440115e-05),
 ('allow', 0.00053218141916469607),
 ('alon', 0.0010161633658447976),
 ('alreadi', 0.0016006415262439777),
 ('alright', 0.00069816925820716117),
 ('alway', 0.0024325712607971878),
 ('amaz', 0.001600909525392862),
 ('amor', 0.0),
 ('amulet', 0.0),
 ('ancient', 0.00054191094762457304),
 ('ani', 0.0015458219559

In [55]:
# what are each character's most important terms
sum_t = df_tfidf.ix[df['character'] == 'Twilight Sparkle', :].sum(0)
sum_t = list(sum_t)
sum_r = df_tfidf.ix[df['character'] == 'Rarity', :].sum(0)
sum_r = list(sum_r)
df_importance = pd.DataFrame(important_features, columns = ['feature', 'importance'])
df_importance['twilight'] = sum_t
df_importance['rarity'] = sum_r
df_importance.head()

Unnamed: 0,feature,importance,twilight,rarity
0,aah,0.001114,2.213577,4.419687
1,abl,0.0,5.404675,3.894968
2,absolut,0.004203,3.424941,11.5054
3,accent,0.0,0.0,5.01626
4,accept,0.0,2.675668,1.747964


In [56]:
# split the data set and see what the top words are for each character
df_twilight = df_importance.ix[df_importance['twilight'] > df_importance['rarity'], :]
df_twilight = df_twilight.sort_values('importance', ascending = False)
df_twilight.head(10)

Unnamed: 0,feature,importance,twilight,rarity
812,spike,0.017499,110.686597,24.728711
667,princess,0.011625,68.328798,9.425105
876,thi,0.009588,104.354359,61.802883
570,na,0.009204,32.661983,1.935559
498,like,0.007967,52.424255,35.259977
932,ve,0.007635,62.651859,28.541338
350,gasp,0.007575,52.295029,46.714352
468,just,0.007141,85.976762,56.137966
853,sure,0.006543,41.109918,17.21372
508,look,0.006476,47.183481,29.991363


In [57]:
# split the data set and see what the top words are for each character
df_rarity = df_importance.ix[df_importance['rarity'] > df_importance['twilight'], :]
df_rarity = df_rarity.sort_values('importance', ascending = False)
df_rarity.head(10)

Unnamed: 0,feature,importance,twilight,rarity
594,oh,0.050076,70.665313,114.501212
203,darl,0.01634,0.296038,30.021748
767,simpli,0.010766,1.372903,17.614686
245,dress,0.01038,6.478522,17.182841
915,twilight,0.00971,12.654583,26.627628
179,cours,0.007811,20.902949,26.82917
992,ye,0.007807,36.827993,38.185875
601,ooh,0.007022,9.654423,19.55867
293,fabul,0.006878,0.121343,13.810117
513,love,0.006878,13.396082,22.605948
