In [1]:
import numpy as np  #load up the libraries and object defs. we need
import pandas as pd
from pandas import DataFrame, Series

# load up my visualization system, and call the object plt
import matplotlib.pyplot as plt

# tell ipython notebook to print visualizations inline
%pylab
%matplotlib inline

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


# NLTK

Import the nltk (Natural Language Toolkit) package.

In [2]:
import nltk

NLTK doesn't contain many language packages by default, so you'll need to install a few. If you haven't installed the `punkt` tokenizer package, you'll need to run the line below to launch the NLTK downloader.

Once it's loaded, enter `d` and then download the `punkt` package.

While you're there, also download the `averaged_perceptron_tagger`, `stopwords`, and `vader_lexicon` packages. (We'll use them later.)

In [None]:
nltk.download_shell()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt


    Downloading package punkt to /Users/wjwillett/nltk_data...
      Package punkt is already up-to-date!



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> averaged_perceptron_tagger
Command 'averaged_perceptron_tagger' unrecognized

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> averaged_perceptron_tagger


    Downloading package averaged_perceptron_tagger to
        /Users/wjwillett/nltk_data...
      Package averaged_perceptron_tagger is already up-to-date!



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> stopwords


    Downloading package stopwords to /Users/wjwillett/nltk_data...
      Unzipping corpora/stopwords.zip.



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> vader_lexicon


    Downloading package vader_lexicon to /Users/wjwillett/nltk_data...
      Package vader_lexicon is already up-to-date!



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


## Text Splitting Basics

In [None]:
s = "This is a short, simple piece of text, so let's try a short, simple analysis."

Breaking it up with `string.split()`

In [None]:
s.lower()

In [None]:
s.split()

## Tokenizing
Using NLTK, we can split ("tokenize") more intelligently.

In [None]:
nltk.wordpunct_tokenize(s)

We can also count and visualize the distribution of terms.

In [None]:
tokens = nltk.wordpunct_tokenize(s)
nltk.probability.FreqDist(tokens)

In [None]:
fd = nltk.probability.FreqDist(tokens)
fd.plot()

### Removing stop words
NLTK provides corpuses of common stop words that we can use to remove common terms from text.

In [None]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
stopwords

Here's a slow way to remove stop words using a for loop.

In [None]:
# Lowercase first so every token can be compared directly with the stopword list.
lowered = s.lower()
tokens = nltk.tokenize.wordpunct_tokenize(lowered)

# Walk the tokens one at a time, skipping stopwords and keeping everything else.
no_stopwords = []
for word in tokens:
    if word in stopwords:
        continue
    no_stopwords.append(word)

print(no_stopwords)

#### A nice Python shortcut
You can actually do this kind of transformation more concisely by using list comprehensions.

In [None]:
tokens = nltk.wordpunct_tokenize(s)
print(tokens)   # non-lowercase version

In [None]:
# Build a lowercased copy of every token; `tokens` itself is left unchanged.
lower_tokens = [t.lower() for t in tokens]

# ^ this list comprehension is basically a shorthand for:
#lower_tokens = []
#for t in tokens:
#    lower_tokens.append(t.lower())
print(lower_tokens)

So to remove stopwords, we can nest an if statement inside the list comprehension.

In [None]:
# Test the *lowercased* token against the stopword list. `tokens` still has its
# original capitalisation here, so comparing the raw token would let
# capitalised stopwords such as "This" slip through the filter.
tokens_nostop = [t.lower() for t in tokens if t.lower() not in stopwords]
print(tokens_nostop)

You can remove punctuation the same way.

In [None]:
import string
string.punctuation

In [None]:
# Keep (lowercased) every token that is not found in string.punctuation.
# NOTE: string.punctuation is a single str, so `in` is a substring test. That is
# fine for one-character tokens like "," or ".", but a multi-character
# punctuation token (e.g. "...") would not be removed by this check.
tokens_nopunct = [t.lower() for t in tokens if t not in string.punctuation]
print(tokens_nopunct)

Or we can do both.

In [None]:
# Drop punctuation and stopwords in a single pass. The stopword test uses the
# lowercased token because `tokens` keeps its original capitalisation, and a
# capitalised stopword such as "This" would otherwise survive the filter.
tokens_nostop_nopunct = [
    t.lower()
    for t in tokens
    if (t not in string.punctuation and t.lower() not in stopwords)
]
print(tokens_nostop_nopunct)

### Stemming

In [None]:
stemmer = nltk.stem.snowball.EnglishStemmer()  # grab me a "snowball stemmer"

In [None]:
stemmer.stem("runs")

In [None]:
stemmer.stem("applicable")

Lowercase, remove stopwords, and stem all in one line.

In [None]:
# For each token: skip it if it is punctuation or (once lowercased) a stopword;
# otherwise lowercase it and pass it through the stemmer.
filtered = [stemmer.stem(t.lower()) for t in tokens if (t not in string.punctuation and t.lower() not in stopwords)]
print(filtered)

In [None]:
print(s)
nltk.probability.FreqDist(filtered).plot()

### Parts of Speech
Labels each term in the text with its part of speech (noun, verb, adjective, etc.). 

Parts of speech are labeled using the codes from the UPENN Treebank project: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [None]:
nltk.pos_tag(tokens_nopunct)

In [None]:
# Keep only the tag from each (word, tag) pair, then chart how often each tag occurs.
tagged_tokens = nltk.pos_tag(tokens_nopunct)
pos_list = [tag for _word, tag in tagged_tokens]
nltk.probability.FreqDist(pos_list).plot()

### n-grams

In [None]:
n = 3  # number of consecutive tokens in each n-gram
list(nltk.ngrams(tokens_nopunct, n))

### Sentiment
Sentiment analysis can be interesting, but it's also **very** inexact. You should always take it with a grain of salt, especially on small corpuses of text.

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
print(s)
sid.polarity_scores(s)

^ The compound score here combines the others; it will be negative for sentences/phrases deemed to have negative sentiment and positive for text deemed to have positive sentiment.

In [None]:
sid.polarity_scores("The movie was atrocious with horrible acting and a deeply flawed plot.")

In [None]:
sid.polarity_scores("The movie was amazing with stellar acting and an incredible plot.")

## Tweet data
Let's try this on a small sample of tweets from @nenshi.

**Save the Raw Data sheet from your analysis last week as a CSV and load it here.**

In [None]:
df = pd.read_csv("V&A Qualitative Datathon - Twitter @nenshi - Raw_Data.csv")

In [None]:
df.head(10)

### Tweet length

In [None]:
# Character count per tweet via the vectorised string accessor. Unlike calling
# len() on each value, a missing tweet text yields NaN instead of raising.
df['tweet-length'] = df['tweet-text'].str.len()
print(df.head(5))
df['tweet-length'].hist()

### Sentiment

In [None]:
# Score every tweet with the analyzer and keep only its "compound" value.
df['sentiment'] = df['tweet-text'].map(lambda x: sid.polarity_scores(x)["compound"])
# Show text and score side by side for a quick sanity check, then plot the distribution.
print(df[['tweet-text','sentiment']].head(5))
df['sentiment'].hist()

### Word frequency
Tokenize the tweet text and remove punctuation and stopwords.

In [None]:
# Tokenize every tweet; the tokenizer is passed directly instead of via a wrapper lambda.
df['tweet-tokens'] = df['tweet-text'].map(nltk.wordpunct_tokenize)
df[['tweet-text', 'tweet-tokens']].head(5)

In [None]:
def remove_stop_punct(token_list, stop_words=None):
    """Return the lowercased tokens that are neither stopwords nor punctuation.

    Parameters
    ----------
    token_list : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Lowercase words to drop. Defaults to the notebook-level ``stopwords``
        list, so existing one-argument calls behave as before.

    Returns
    -------
    list of str
        The surviving tokens, lowercased, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords

    punct_chars = set(string.punctuation)

    # A token counts as punctuation when *every* character is a punctuation
    # mark. Testing `t in string.punctuation` is only a substring check, so
    # multi-character tokens such as "..." or "://" would not be removed.
    return [
        t.lower()
        for t in token_list
        if t.lower() not in stop_words
        and not all(ch in punct_chars for ch in t)
    ]

# Tokenize each tweet again, then strip stopwords and punctuation from every token list.
df['tweet-tokens'] = df['tweet-text'].map(nltk.wordpunct_tokenize)
df['tweet-tokens-filtered'] = [remove_stop_punct(tokens_in_tweet) for tokens_in_tweet in df['tweet-tokens']]
df.head(5)


Get a list of all of the tokens in the whole corpus

In [None]:
# Flatten the per-tweet token lists into one corpus-wide list of tokens.
tweet_tokens_filtered_list = []
for tweet_tokens in df["tweet-tokens-filtered"]:
    tweet_tokens_filtered_list.extend(tweet_tokens)

# Plot only the 30 most frequent tokens.
nltk.FreqDist(tweet_tokens_filtered_list).plot(30)

# ToDo
Are there terms that should be removed from this frequency distribution? Think about how you'd remove them.

*Hint: Think about how we removed punctuation and stop words earlier.*

Try this analysis on one of the other Twitter samples in the dataset.
> @CoachsCornerCBC  
> @BoredElonMusk  
> @DungeonsDonald  
> @georgetakei  
> @neiltyson  