In [1]:
# need the punkt package from nltk downloaded
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.tree import DecisionTreeClassifier
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import requests

## Tutorial

#### Requests

In [2]:
# example urls
url = 'http://www.google.com'
url_bad = 'http://www.google.com/asdfoiaseadco.com'

# Requests Package
page = requests.get(url)
print(page)
page = requests.get(url_bad)
print(page)

<Response [200]>
<Response [404]>


#### Beautiful Soup

In [3]:
# basic commands
url = 'http://mlp.wikia.com/wiki/Friendship_is_Magic_animated_media'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>
<html class="" dir="ltr" lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1.0, user-scalable=yes" name="viewport"/>
<meta content="MediaWiki 1.19.24" name="generator"/>
<meta content="My Little Pony Friendship is Magic Wiki,mlp,Friendship is Magic animated media,My Little Pony Equestria Girls (franchise),Equestria Girls animated media,List of episodes,My Little Pony Friendship is Magic,Friendship is Magic, part 1,Transcripts/Friendship is Magic, part 1,Friendship is Magic, part 1/Gallery,Friendship is Magic, part 2,Transcripts/Friendship is Magic, part 2,Friendship is Magic, part 2/Gallery" name="keywords"/>
<meta content="This is a sortable list of My Little Pony Friendship is Magic animated media – including..." name="description"/>
<meta content="summary" name="twitter:card"/>
<meta content="@getfandom" name="twitter:site"/>
<meta content="http://mlp.w

#### Searching the webpage

In [12]:
soup.find('table')

<table cellspacing="1px" class="table-dotted-rows sortable" style="width:100%; line-height:1.5em; text-align:center;">
<tr style="background-color:#EE3F96; color:white;">
<th style="width:4.5em;"> <span style="cursor:help; border-bottom:1px dotted" title="Episode number by season and (overall)">№</span>
</th><th style="width:20em; text-align:left; padding-left:2em;"> Title
</th><th style="width:12em;"> Writer
</th><th style="width:10em; font-size:70%;"> Original airdate
</th><th class="unsortable" style="width:7em; font-size:70%;"> Transcript
</th><th class="unsortable" style="width:6em; font-size:70%;"> Gallery
</th></tr>
<tr>
<td> 01 (01)
</td><td style="text-align:left; padding-left:1em;"> <a href="/wiki/Friendship_is_Magic,_part_1" title="Friendship is Magic, part 1">Friendship is Magic, part 1</a>
</td><td> <a href="/wiki/Lauren_Faust" title="Lauren Faust">Lauren Faust</a>
</td><td> 2010-10-10
</td><td style="font-size:70%;"> <a href="/wiki/Transcripts/Friendship_is_Magic,_part_1"

In [13]:
res_table = soup.find_all('table')
print(len(res_table))

7


In [14]:
soup.find(class_ = 'mw-headline')

<span class="mw-headline" id="Episodes">Episodes</span>

In [15]:
soup.find('div', class_ = 'skiplinkcontainer')

<div class="skiplinkcontainer">
<a class="skiplink" href="#WikiaArticle" rel="nofollow">Skip to Content</a>
<a class="skiplink wikinav" href="#WikiHeader" rel="nofollow">Skip to Wiki Navigation</a>
<a class="skiplink sitenav" href="#GlobalNavigation" rel="nofollow">Skip to Site Navigation</a>
</div>

## Example

In [16]:
# find the links to the transcripts
url = 'http://mlp.wikia.com/wiki/Friendship_is_Magic_animated_media'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

res = soup.find_all('td', style='font-size:70%;')
res = [r for r in res if r.text.strip() == 'Transcript']

In [17]:
res[0].find('a')
res[0].find('a').get('href')

'/wiki/Transcripts/Friendship_is_Magic,_part_1'

In [18]:
# loop through the transcripts
res_text = []
for r in res:
    url_new = r.find('a').get('href')
    url_new = 'http://mlp.wikia.com' + url_new
    page = requests.get(url_new)
    soup = BeautifulSoup(page.text, 'lxml')
    res_dd = soup.find_all('dd')
    
    for line in res_dd:
        try:
            character = line.find('b').text
            text = line.text[len(character) + 2:]

            res_text.append([character, text])
        except:
            pass
        
    print(url_new)

http://mlp.wikia.com/wiki/Transcripts/Friendship_is_Magic,_part_1
http://mlp.wikia.com/wiki/Transcripts/Friendship_is_Magic,_part_2
http://mlp.wikia.com/wiki/Transcripts/The_Ticket_Master
http://mlp.wikia.com/wiki/Transcripts/Applebuck_Season
http://mlp.wikia.com/wiki/Transcripts/Griffon_the_Brush_Off
http://mlp.wikia.com/wiki/Transcripts/Boast_Busters
http://mlp.wikia.com/wiki/Transcripts/Dragonshy
http://mlp.wikia.com/wiki/Transcripts/Look_Before_You_Sleep
http://mlp.wikia.com/wiki/Transcripts/Bridle_Gossip
http://mlp.wikia.com/wiki/Transcripts/Swarm_of_the_Century
http://mlp.wikia.com/wiki/Transcripts/Winter_Wrap_Up
http://mlp.wikia.com/wiki/Transcripts/Call_of_the_Cutie
http://mlp.wikia.com/wiki/Transcripts/Fall_Weather_Friends
http://mlp.wikia.com/wiki/Transcripts/Suited_For_Success
http://mlp.wikia.com/wiki/Transcripts/Feeling_Pinkie_Keen
http://mlp.wikia.com/wiki/Transcripts/Sonic_Rainboom
http://mlp.wikia.com/wiki/Transcripts/Stare_Master
http://mlp.wikia.com/wiki/Transcripts/T

In [19]:
characters = ['Twilight Sparkle', 'Rarity']
res_text = [l for l in res_text if l[0] in characters]
len(res_text)

6025

In [20]:
df_temp = pd.DataFrame(res_text, columns = ['character', 'text'])
df_temp.head()
df_temp.to_csv('/home/matt/res.csv', index = False)

## Making the data sturctured

In [21]:
# first need to tokenize the words the words
words_token = nltk.word_tokenize('The quick brown fox jumps over the lazy dog')
nltk.word_tokenize('The quick brown fox jumps over the lazy dog')

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [22]:
# stemmer only works on a word level
stemmer = PorterStemmer()
print(stemmer.stem('running'))

run


In [23]:
# back to the data
# tokenize each line
res_text = [[r[0], nltk.word_tokenize(r[1])] for r in res_text]
res_text[0]

['Twilight Sparkle',
 ['...',
  'and',
  'harmony',
  'has',
  'been',
  'maintained',
  'in',
  'Equestria',
  'for',
  'generations',
  'since',
  '.',
  'Hmm',
  '...',
  'Elements',
  'of',
  'Harmony',
  '.',
  'I',
  'know',
  'I',
  "'ve",
  'heard',
  'of',
  'those',
  'before',
  '...',
  'but',
  'where',
  '?']]

In [None]:
# stem each line
# saddly we need to combine all the words back to gether
for i in range(len(res_text)):
    res_text[i][1] = ' '.join([stemmer.stem(w) for w in res_text[i][1]])
res_text[0][1]

In [None]:
# now that the data is tokenized and stemmed let's through it into a data frame
df = pd.DataFrame(res_text, columns = ['character', 'text'])
print(df.shape)
df.head()

In [None]:
# turning it data into a tfidf
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
df_tfidf = tfidf.fit_transform(df['text'])
df_tfidf = pd.DataFrame(df_tfidf.toarray())
df_tfidf.head()

In [None]:
# adding the feature names
df_tfidf.columns = tfidf.get_feature_names()
df_tfidf.head()

### Doing some analysis

In [None]:
# throwing the data into a classifier
clf = DecisionTreeClassifier()
clf.fit(df_tfidf, df['character'])

In [None]:
# looking at terms by importance
importances = clf.feature_importances_
features = tfidf.get_feature_names()
important_features = [(features[i], importances[i]) for i in range(len(importances))]
important_features

In [11]:
# what are each character's most important terms

In [31]:
text = 'The quick brown fox jumps over the lazy dog'
text = nltk.word_tokenize('The quick brown fox jumps over the lazy dog')
text = [stemmer.stem(w) for w in text]
text = ' '.join(text)
text

'The quick brown fox jump over the lazi dog'

In [34]:
# turning it data into a tfidf
tfidf = TfidfVectorizer(max_features=1000)
df_tfidf = tfidf.fit_transform([text])
df_tfidf = pd.DataFrame(df_tfidf.toarray())
df_tfidf.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.301511,0.301511,0.301511,0.301511,0.301511,0.301511,0.301511,0.603023
