In [3]:
!ls week/8

Australian_Broadcasting_Commission_2006.zip  Inaugural_Speeches.zip
George_Eliot.zip			     Jane_Austen.zip
Herman_Melville.zip			     Joseph_Conrad.zip


In [None]:
# First, import the packages we will use below.

import os
from textblob import TextBlob
import random
from pprint import pprint

In [None]:
# Download zipped texts from GitHub, then unzip the directories.

os.chdir('/sharedfolder/')

!wget -N https://github.com/pcda17/pcda17.github.io/blob/master/week/5/Emerson.zip?raw=true -O Emerson.zip
!unzip -o Emerson.zip

!wget -N https://github.com/pcda17/pcda17.github.io/blob/master/week/5/Wilde.zip?raw=true -O Wilde.zip
!unzip -o Wilde.zip

In [None]:
## First, load each author’s works as a list of strings.

corpus_1_dir = "/sharedfolder/Emerson/"
corpus_2_dir = "/sharedfolder/Wilde/"


def _read_corpus(corpus_dir):
    """Return (filenames, texts) for the regular files directly inside corpus_dir.

    Filenames are sorted so list positions are the same on every run.
    Sub-directories are skipped. Newline characters in each text are
    replaced with spaces.
    """
    filenames = sorted(
        name for name in os.listdir(corpus_dir)
        if os.path.isfile(os.path.join(corpus_dir, name))
    )
    texts = []
    for name in filenames:
        # 'with' closes each file handle as soon as it has been read.
        with open(os.path.join(corpus_dir, name)) as handle:
            texts.append(handle.read().replace("\n", " "))
    return filenames, texts


corpus_1_filenames, corpus_1_texts = _read_corpus(corpus_1_dir)
wilde_filenames, wilde_texts = _read_corpus(corpus_2_dir)

# Leave the working directory at the second corpus directory.
os.chdir(corpus_2_dir)

In [None]:
# Wrap every loaded text in a TextBlob, keeping one list per author.
corpus_1_blobs = list(map(TextBlob, corpus_1_texts))
wilde_blobs = list(map(TextBlob, wilde_texts))

In [None]:
# Recall that 'blob.words' is a list of words.

# Pick one of the corpus_1 blobs at random (a different one may be chosen on each run).
blob = random.choice(corpus_1_blobs)

# Show the first 100 words of the chosen blob.
blob.words[:100]

In [None]:
# And 'blob.sentences' is a list of Sentence objects.

# Pick one of the corpus_1 blobs at random (a different one may be chosen on each run).
blob = random.choice(corpus_1_blobs)

# Show the first 5 sentences of the chosen blob.
blob.sentences[:5]

### ▷ Simple sentiment analysis with TextBlob

In [None]:
# Negative polarity example
from textblob import TextBlob

text="This is a very mean and nasty sentence."

blob = TextBlob(text)

# Polarity is a result between -1 and +1.
sentiment_score=blob.sentiment.polarity  # <-- the attribute this example demonstrates

print(sentiment_score)

In [None]:
# Positive polarity example

text="This is a nice and positive sentence."

blob = TextBlob(text)

# Polarity is a result between -1 and +1.
sentiment_score=blob.sentiment.polarity  # <-- the attribute this example demonstrates

print(sentiment_score)

In [None]:
# High subjectivity example

text="This is a very mean and nasty sentence."

# Build the blob straight from the text, exactly as the other examples do.
blob = TextBlob(text)

# result between 0 and +1
sentiment_score=blob.sentiment.subjectivity  # <--

print(sentiment_score)

In [None]:
# Low subjectivity example

text="This sentence states a fact."

blob = TextBlob(text)

# result between 0 and +1 (subjectivity, unlike polarity, is never negative)
sentiment_score=blob.sentiment.subjectivity  # <--

print(sentiment_score)

### ▷ Plotting Sentiment Values

In [None]:
# Let's map sentiment ratings across the course of a full book.


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pprint import pprint

# Viewing available plot styles and selecting one to use.

pprint(plt.style.available)

plt.style.use('ggplot')

In [None]:
# Picks one of the already-built Emerson TextBlob objects at random
# (no new TextBlob is created here).

emerson_blob = random.choice(corpus_1_blobs)

# Pick a random sentence from that blob.
random_emerson_sentence = random.choice(emerson_blob.sentences)

# Print the sentence, a blank line, then the sentence's polarity score.
print(random_emerson_sentence)
print()
print(random_emerson_sentence.sentiment.polarity)

In [None]:
# Polarity score of every sentence in the chosen Emerson blob, in document order.
emerson_sentiments=[item.sentiment.polarity for item in emerson_blob.sentences]

# Preview the first ten scores.
emerson_sentiments[:10]

In [None]:
# Raw (unsmoothed) polarity of each sentence, in reading order.
plt.figure(figsize=(18,8))
plt.plot(emerson_sentiments)
# Label the figure so it can be read on its own.
plt.xlabel("Sentence index")
plt.ylabel("Polarity")
plt.title("Sentence polarity (unsmoothed)");

In [None]:
# Smoothing our data before plotting

emerson_window = 100  # number of consecutive sentences averaged into each smoothed value

emerson_sentiments_pd = pd.Series(emerson_sentiments)
emerson_sentiments_smooth = emerson_sentiments_pd.rolling(window=emerson_window).mean()

# The first (emerson_window - 1) smoothed values are NaN because their window is
# incomplete. Print the stretch where the first real values appear, derived from the
# window size rather than a hard-coded offset.
print(emerson_sentiments_smooth[emerson_window - 5:emerson_window + 10])

In [None]:
# Rolling-mean polarity across the text.
plt.figure(figsize=(18,8))
plt.plot(emerson_sentiments_smooth)
# Label the figure so it can be read on its own.
plt.xlabel("Sentence index")
plt.ylabel("Polarity (rolling mean)")
plt.title("Sentence polarity (smoothed)");

In [None]:
# Series.max() skips the leading NaN values produced by the rolling window, so no
# hard-coded slice offset (which has to match the window size) is needed.
max_sentiment = emerson_sentiments_smooth.max()

print(max_sentiment) # max sentiment polarity value

# idxmax() gives the index position of the first occurrence of the 'max_sentiment' value
max_sent_index = emerson_sentiments_smooth.idxmax()

# Each smoothed value averages the window ending at that sentence, so this prints the
# last sentence of the highest-scoring window.
print(emerson_blob.sentences[max_sent_index])

In [None]:
# Series.min() skips the leading NaN values produced by the rolling window, so no
# hard-coded slice offset (which has to match the window size) is needed.
min_sentiment = emerson_sentiments_smooth.min()

print(min_sentiment) # min sentiment polarity value

# idxmin() gives the index position of the first occurrence of the 'min_sentiment' value
min_sent_index = emerson_sentiments_smooth.idxmin()

# Each smoothed value averages the window ending at that sentence, so this prints the
# last sentence of the lowest-scoring window.
print(emerson_blob.sentences[min_sent_index])

In [None]:
# 'austen_blob' is otherwise only created much further down the notebook, so it is
# built here to keep the notebook runnable from top to bottom.
from urllib.request import urlopen

austen_url = "http://principalhand.org/workshop-data/Austen_Persuasion.txt"
austen_blob = TextBlob(urlopen(austen_url).read().decode("utf8"))

# Polarity of every sentence, then a 200-sentence rolling mean.
austen_sentiments=[sentence.sentiment.polarity for sentence in austen_blob.sentences]
#print(austen_sentiments[:10])
austen_sentiments_pd=pd.Series(austen_sentiments)
austen_sentiments_smooth=austen_sentiments_pd.rolling(window=200).mean()
#print(austen_sentiments_smooth[190:210])

plt.figure(figsize=(18,8))
plt.plot(austen_sentiments_smooth)

In [None]:
# Series.max() skips the leading NaN values produced by the rolling window.
max_sentiment=austen_sentiments_smooth.max()
print(max_sentiment) # max sentiment polarity value

# idxmax() gives the index position of the first occurrence of the 'max_sentiment' value
max_sent_index=austen_sentiments_smooth.idxmax()
print(austen_blob.sentences[max_sent_index])

In [None]:
# Series.min() skips the leading NaN values produced by the rolling window.
min_sentiment=austen_sentiments_smooth.min()
print(min_sentiment) # min sentiment polarity value

# idxmin() gives the index position of the first occurrence of the 'min_sentiment' value
min_sent_index=austen_sentiments_smooth.idxmin()
print(austen_blob.sentences[min_sent_index])

In [None]:
# Functions that bundle the steps put together above: score each sentence, smooth, plot.
# Both accept an optional second argument: the smoothing window, measured in sentences.
# The default window is 200.

def _plot_smoothed_sentiment(text_in, attribute, window):
    """Plot the rolling mean of one per-sentence sentiment attribute.

    text_in   -- the text to analyse; it is passed to TextBlob
    attribute -- name of the sentiment field to read ('polarity' or 'subjectivity')
    window    -- number of consecutive sentences averaged into each plotted value
    """
    blob = TextBlob(text_in)
    scores = pd.Series([getattr(sentence.sentiment, attribute) for sentence in blob.sentences])
    plt.figure(figsize=(18,8))
    plt.plot(scores.rolling(window).mean())

def plot_polarity(text_in,window=200):
    """Plot the smoothed polarity of the sentences in text_in, in order."""
    _plot_smoothed_sentiment(text_in, "polarity", window)

def plot_subjectivity(text_in,window=200):
    """Plot the smoothed subjectivity of the sentences in text_in, in order."""
    _plot_smoothed_sentiment(text_in, "subjectivity", window)



In [None]:
# Persuasion Subjectivity

# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen
url="http://principalhand.org/workshop-data/Austen_Persuasion.txt"
# read() returns bytes, so decode to text before the string replacements below.
temp_string=urlopen(url).read().decode("utf8")
temp_string=temp_string.replace("\r"," ").replace("\n"," ").replace("  "," ")


#plot_polarity(temp_string)
plot_subjectivity(temp_string)

In [None]:
# Same text with a much smaller smoothing window (10 instead of the default 200).
plot_subjectivity(temp_string,10)

In [None]:
# Pride and Prejudice Subjectivity

# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen
url="http://www.gutenberg.org/cache/epub/1342/pg1342.txt"
# read() returns bytes, so decode to text before the string replacements below.
temp_string=urlopen(url).read().decode("utf8")
temp_string=temp_string.replace("\r"," ").replace("\n"," ").replace("  "," ")

# No plt.figure() call here: plot_subjectivity() opens its own figure, so an extra
# call would only leave an empty figure in the output.
#plot_polarity(temp_string)
plot_subjectivity(temp_string)


In [None]:
# Emma Subjectivity

from urllib.request import urlopen

url = "https://www.gutenberg.org/files/158/158-0.txt"

# Fetch and decode the book, then apply the whitespace replacements one at a time,
# in the same order as the other download cells.
temp_string = urlopen(url).read().decode('utf8')
for old, new in (("\r", " "), ("\n", " "), ("  ", " ")):
    temp_string = temp_string.replace(old, new)

# plot_polarity(temp_string) is the alternative view.
plot_subjectivity(temp_string)

In [None]:
# Sense and Sensibility Subjectivity

# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen
url="http://www.gutenberg.org/cache/epub/161/pg161.txt"
# read() returns bytes, so decode to text before the string replacements below.
temp_string=urlopen(url).read().decode("utf8")
temp_string=temp_string.replace("\r"," ").replace("\n"," ").replace("  "," ")


#plot_polarity(temp_string)
plot_subjectivity(temp_string)

In [None]:
# Subjectivity: New York Times Current History; The European War, Vol 2, No. 3, June, 1915

# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen
url="http://www.gutenberg.org/cache/epub/15480/pg15480.txt"
# read() returns bytes, so decode to text before the string replacements below.
temp_string=urlopen(url).read().decode("utf8")
temp_string=temp_string.replace("\r"," ").replace("\n"," ").replace("  "," ")

#plot_polarity(temp_string)
plot_subjectivity(temp_string)

In [None]:
# Huckleberry Finn Polarity

# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen
url="https://www.gutenberg.org/files/76/76-0.txt"
# read() returns bytes, so decode to text before the string replacements below.
temp_string=urlopen(url).read().decode("utf8")
temp_string=temp_string.replace("\r"," ").replace("\n"," ").replace("  "," ")


plot_polarity(temp_string)
#plot_subjectivity(temp_string)

### ▷ Plotting smoothed random data (for comparison)

In [None]:
## Plotting completely random data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 4000 random values; no seed is set, so the values differ on every run.
random_vals=np.random.rand(4000)

# Smooth with a 200-value rolling mean, the same default window the plot_* functions use.
vals_pd=pd.Series(random_vals)
vals_smooth=vals_pd.rolling(window=200).mean()

plt.figure(figsize=(18,8))
plt.plot(vals_smooth)


### ▷ Sentiment Histograms

In [None]:
def hist_polarity(text_in):
    """Draw a histogram of the polarity score of every sentence in text_in.

    text_in -- the text to analyse; it is passed to TextBlob
    """
    blob = TextBlob(text_in)
    sentiments=[sentence.sentiment.polarity for sentence in blob.sentences]
    plt.figure(figsize=(20,10))
    # Plot the list built above; no smoothed series exists inside this function.
    plt.hist(sentiments)

def hist_subjectivity(text_in):
    """Draw a histogram of the subjectivity score of every sentence in text_in."""
    subjectivity_scores = []
    for sentence in TextBlob(text_in).sentences:
        subjectivity_scores.append(sentence.sentiment.subjectivity)
    plt.figure(figsize=(20,10))
    plt.hist(subjectivity_scores)

In [None]:
#import urllib2
#url="http://principalhand.org/workshop-data/Austen_Persuasion.txt"
#temp_string=urllib2.urlopen(url).read()

In [None]:
# Use the fifth loaded corpus_1 text (index 4) as the sample.
# NOTE(review): assumes at least five texts were loaded — confirm against the corpus.
temp_string = corpus_1_texts[4]

# Histogram of the per-sentence subjectivity scores for that text.
hist_subjectivity(temp_string)

In [None]:
# These functions remove zero values before plotting.

def hist_polarity_filtered(text_in):
    """Draw a histogram of the non-zero sentence polarity scores in text_in.

    text_in -- the text to analyse, either a str or UTF-8 encoded bytes
    """
    # Only bytes have decode(); text that is already a str is used as-is.
    if isinstance(text_in, bytes):
        text_in = text_in.decode("utf8")
    blob = TextBlob(text_in)
    sentiments=[sentence.sentiment.polarity for sentence in blob.sentences]
    # Drop scores of exactly zero before plotting.
    sentiments=[x for x in sentiments if x != 0]
    plt.figure(figsize=(15,8))
    plt.hist(sentiments)

def hist_subjectivity_filtered(text_in):
    """Draw a histogram of the non-zero sentence subjectivity scores in text_in.

    text_in -- the text to analyse, either a str or UTF-8 encoded bytes
    """
    # Only bytes have decode(); text that is already a str is used as-is.
    if isinstance(text_in, bytes):
        text_in = text_in.decode("utf8")
    blob = TextBlob(text_in)
    sentiments=[sentence.sentiment.subjectivity for sentence in blob.sentences]
    # Drop scores of exactly zero before plotting.
    sentiments=[x for x in sentiments if x != 0]
    plt.figure(figsize=(15,8))
    plt.hist(sentiments)


In [None]:
# Histogram of the non-zero polarity scores for the current 'temp_string'.
hist_polarity_filtered(temp_string)

In [None]:
# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen
url="http://principalhand.org/workshop-data/Melville_Moby-Dick.txt"
# Kept as raw bytes: hist_polarity_filtered() decodes its argument itself.
melville_string=urlopen(url).read()

In [None]:
# Histogram of the non-zero polarity scores for the downloaded Moby-Dick text.
hist_polarity_filtered(melville_string)

### ▷ Descriptive Stats

In [None]:
# Join every corpus_1 text into one blob and average the per-sentence subjectivity.
# NOTE(review): corpus_1_texts is loaded from the Emerson directory, yet the scores are
# stored as 'melville_sentiments'. The name is kept because later cells read it — confirm
# which author is intended.
blob = TextBlob(' '.join(corpus_1_texts))
melville_sentiments=[sentence.sentiment.subjectivity for sentence in blob.sentences]
np.mean(melville_sentiments)

In [None]:
# Join every second-corpus text into one blob and average the per-sentence subjectivity.
# The second corpus is loaded into 'wilde_texts'; no 'corpus_2_texts' list exists.
# NOTE(review): the result name 'austen_sentiments' is kept because later cells read it,
# although these texts come from the Wilde directory — confirm which author is intended.
blob = TextBlob(' '.join(wilde_texts))
austen_sentiments=[sentence.sentiment.subjectivity for sentence in blob.sentences]
np.mean(austen_sentiments)

In [None]:
# Average per-sentence subjectivity for the Gilman text.
# NOTE(review): 'gilman_blob' is only assigned in a later cell ("Yet another word
# frequency list"), so this cell fails on a fresh top-to-bottom run — confirm cell order.
blob=gilman_blob
gilman_sentiments=[sentence.sentiment.subjectivity for sentence in blob.sentences]
np.mean(gilman_sentiments)

### ▷ Statistical Tests

In [None]:
# T-test of independent values

# Inappropriate in this case because zeroes in data make distribution non-normal.

from scipy import stats

# Compares the two lists of per-sentence subjectivity scores computed above.
print(stats.ttest_ind(melville_sentiments,austen_sentiments))

#print(stats.ttest_ind(melville_sentiments,gilman_sentiments))

#print(stats.ttest_ind(austen_sentiments,gilman_sentiments))

In [None]:
# Mann-Whitney U test

# Designed to work for non-normally distributed data.

from scipy import stats

# One test per pair of score lists.
print(stats.mannwhitneyu(melville_sentiments,austen_sentiments))

print(stats.mannwhitneyu(melville_sentiments,gilman_sentiments))

print(stats.mannwhitneyu(austen_sentiments,gilman_sentiments))

### ▷ POS tagging

You can find a list of POS tags here: http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [None]:
# 'blob_1.tags' is a list of NLTK's best guess for each word's part of speech (POS).
# The following prints the first 20 word-tag pairs in our text.
# NOTE(review): 'blob_1' is not assigned in any visible cell — confirm where it is created.

from pprint import pprint

pprint(blob_1.tags[:20])

In [None]:
# Print the noun phrases of 'blob_1'.
print(blob_1.noun_phrases)

In [None]:
# Parse a sentence's grammar in tree form.

blob_1.parse()

### ▷ Now let's work with a longer text.

In [None]:
# Downloading Melville's _Moby Dick_

# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen

url="http://principalhand.org/workshop-data/Melville_Moby-Dick.txt"

# read() returns bytes; decode so the result can be passed to TextBlob as text.
melville_string=urlopen(url).read().decode("utf8")

In [None]:
# Create a TextBlob object and print a random sentence.

from textblob import TextBlob
import random

melville_blob = TextBlob(melville_string)
# random.choice returns the sentence itself rather than a one-element list.
print(random.choice(melville_blob.sentences))

In [None]:
# Return the number of times a given word appears in a text.

print(melville_blob.words.count('the'))

In [None]:
# View the most frequently occurring words in a text. Note that this approach is
# case-sensitive.

from collections import Counter

print(Counter(melville_blob.words).most_common(25))

In [None]:
# Here's a non-case-sensitive version of the command above, which works by converting the
# full text to lowercase before calculating string frequencies.

print(Counter(melville_blob.words.lower()).most_common(25))

## Removing Stop Words

Now let's view the most frequent words in our corpus with stopwords removed.

The first time you run the cell below, uncomment the second line to download all nltk corpora and packages.


In [None]:
import nltk
# nltk.download('all')

In [None]:
# Loading stop word list

from operator import itemgetter
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word

stopwords_eng=stopwords.words('english')+["'s"] ## Adding "'s"  as a stop word
print(sorted(stopwords_eng))

In [None]:
# Creates a new Moby Dick TextBlob object (just for convenience)

# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen
url="http://principalhand.org/workshop-data/Melville_Moby-Dick.txt"
# read() returns bytes; decode so the result can be passed to TextBlob as text.
melville_string=urlopen(url).read().decode("utf8")


# Create a TextBlob object and print a random sentence.
from textblob import TextBlob
import random

melville_blob = TextBlob(melville_string)
# random.choice returns the sentence itself rather than a one-element list.
print(random.choice(melville_blob.sentences))

In [None]:
# Creates a copy of our word tally list with stopwords removed.
from collections import Counter
from textblob import Word
from pprint import pprint

most_freq=Counter(melville_blob.words.lower()).most_common()

most_freq_ns=[]

for pair in most_freq:
    word=pair[0].lower()
    # Text before the first apostrophe, e.g. "it" for "it's". The [0] must index the
    # result of split(), not the "'" literal.
    pre_apostrophe=word.split("'")[0]
    # Keep the pair only when neither the word nor its pre-apostrophe stem is a stop word.
    if not (word in stopwords_eng or pre_apostrophe in stopwords_eng):
        most_freq_ns.append(pair)


print(len(most_freq_ns))
pprint(most_freq_ns[:25])

In [None]:
# Creating a function that applies the process above to any TextBlob object.

def most_freq_no_stop(blob, stop_words=None):
    """Return (word, count) pairs for blob, most frequent first, without stop words.

    blob       -- an object whose 'words' attribute has a lower() method returning the
                  lowercased words (a TextBlob provides this)
    stop_words -- optional iterable of lowercase words to exclude; when omitted, NLTK's
                  English stop word list plus "'s" is used

    A word is excluded when the word itself, or the part before its first apostrophe
    (e.g. "it" for "it's"), is a stop word.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')+["'s"]
    # A set makes each membership test constant-time.
    stop_set = set(stop_words)

    kept = []
    for word, count in Counter(blob.words.lower()).most_common():
        lowered = word.lower()
        # The [0] indexes the result of split(), giving the text before the apostrophe.
        pre_apostrophe = lowered.split("'")[0]
        if lowered not in stop_set and pre_apostrophe not in stop_set:
            kept.append((word, count))

    return kept

In [None]:
# Show the first 25 (word, count) pairs that remain after stop words are removed.
pprint(most_freq_no_stop(melville_blob)[:25])

#### ▷ Let's load another text for comparison.

In [None]:
# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen

url="http://principalhand.org/workshop-data/Austen_Persuasion.txt"

# Deliberately left undecoded: the next cell shows what happens without decode().
austen_string=urlopen(url).read()

In [None]:
#### This cell demonstrates an error. Don't panic! ####

try:
    austen_blob = TextBlob(austen_string)
    pprint(most_freq_no_stop(austen_blob)[:30])
except (TypeError, UnicodeDecodeError) as err:
    # Display the error instead of halting, so "Run All" continues past this cell.
    print(repr(err))

#### ▷ The cell above will produce a 'UnicodeDecodeError.' To fix the problem, we can apply the "decode()" function to our string before passing it to the TextBlob constructor.


In [None]:
# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen

url="http://principalhand.org/workshop-data/Austen_Persuasion.txt"

# decode() turns the downloaded bytes into text that TextBlob accepts.
austen_string=urlopen(url).read().decode("utf8")

austen_blob = TextBlob(austen_string)

pprint(most_freq_no_stop(austen_blob)[:25])

### ▷ Yet another word frequency list

In [None]:
# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen

url="http://principalhand.org/workshop-data/Gilman_Yellow-Wallpaper.txt"

# decode() turns the downloaded bytes into text that TextBlob accepts.
gilman_string=urlopen(url).read().decode("utf8")

gilman_blob = TextBlob(gilman_string)

pprint(most_freq_no_stop(gilman_blob)[:25])

### ▷ Creating a concordance with NLTK

In [None]:
import nltk
# urllib2 exists only on Python 2; urllib.request is its Python 3 replacement.
from urllib.request import urlopen

url="http://principalhand.org/workshop-data/Stein_Three-Lives.txt"
temp_string=urlopen(url).read().decode('utf8')

# The decoded text is used directly; no 'sanitize' helper is defined in this notebook.
raw=temp_string

# Tokenize the text and wrap the tokens in an nltk.Text, which provides concordance().
tokens = nltk.word_tokenize(raw)
nltk_text = nltk.Text(tokens)

# concordance() prints its matches itself and returns None, so it is not wrapped in print().
nltk_text.concordance('blood')


<a rel="license"
     href="http://creativecommons.org/publicdomain/zero/1.0/">
    <img src="http://i.creativecommons.org/p/zero/1.0/88x31.png" style="border-style: none;" alt="CC0" />
  </a>