# DS-SF-30 | Codealong 18: Natural Language Processing

## >>> One-time setup

In [1]:
import nltk
nltk.download()

pass

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


## <<< One-time setup

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import unicodedata
from nltk import tokenize, corpus, stem

from sklearn import feature_extraction, linear_model, ensemble, cross_validation, metrics, decomposition

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Tokenization

In [5]:
def tokenize_text(document):
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Remove punctuation in tokens and then remove empty tokens
    tokens = [token.translate(None, string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token]

    # Remove stop words
    tokens = [token for token in tokens if not token in corpus.stopwords.words('english')]

    return tokens

In [6]:
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [7]:
class Stemmer:
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        return [Stemmer.stemmer.stem(token) for token in tokens]

In [8]:
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', u'wait', u'anoth', u'third']

## Book reviews

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)

Our dataset is a subset of http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books_5.json.gz.

In [9]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-18-reviews.csv'))

In [10]:
df

Unnamed: 0,date,id,author,title,body,star_rating
0,2016-12-11,R3SH1N77GNTD9K,Stefi,Great read,Very moving story. Great effortless writing wh...,5.0
1,2016-12-11,RVOEQK3JK4LY2,Amazon Customer,Great book! Does not disappoint,Great book! Does not disappoint. Wonderful c...,5.0
2,2016-12-11,RCU7OTNRDJBOS,Priscilla Seaton,Disturbing in its accurate reflection of human...,A very absorbing book. Not at all what I expec...,4.0
3,2016-12-10,R257OLQTPXYQ82,J,Superb,"Lives intertwined, humor,sadness, superior sto...",5.0
4,2016-12-10,R1LNKO30KAXCUM,Roberta L. Sherrill,One Star,Disappointing..... finally quit reading it. S...,1.0
...,...,...,...,...,...,...
5796,2012-09-27,RT2TE0W92SL67,Tricia K.,Seriously? $17 bucks for a computer file??? ...,Premise sounds dull as dirt. For $17 for a co...,1.0
5797,2012-09-27,R14ZGYPSP9H0Y7,Pretzel,A must read,The depth of character development and storyli...,5.0
5798,2012-09-27,R1913ISIDAGQ1A,Prodigy,I love it,The book was great and I will love to re-read ...,5.0
5799,2012-09-27,R2JY771IW7RI3R,David Katz,Kendle price too expensive,I started to order the kindle edition and than...,5.0


In [11]:
df.drop(['date', 'id', 'author', 'title'],
    axis = 1,
    inplace = True)

In [12]:
df

Unnamed: 0,body,star_rating
0,Very moving story. Great effortless writing wh...,5.0
1,Great book! Does not disappoint. Wonderful c...,5.0
2,A very absorbing book. Not at all what I expec...,4.0
3,"Lives intertwined, humor,sadness, superior sto...",5.0
4,Disappointing..... finally quit reading it. S...,1.0
...,...,...
5796,Premise sounds dull as dirt. For $17 for a co...,1.0
5797,The depth of character development and storyli...,5.0
5798,The book was great and I will love to re-read ...,5.0
5799,I started to order the kindle edition and than...,5.0


### `NaN`

In [23]:
df.dropna(inplace=True)

### Positive, neutral, and negatives reviews

In [24]:
df.polarity = df.star_rating.map(lambda x: -1 if x <= 2 else (0 if x == 3.0 else 1))

In [62]:
polarities = df.polarity.value_counts()
minimum = min(polarities)

#sampled = df.copy().drop(df.index)
#for polarity in polarities.keys():
#    subset = df[df.polarity == polarity].sample(minimum)
#    sampled.append(subset)
       

sampled = df[df.polarity == 1].sample(minimum).append(df[df.polarity == 0].sample(minimum)).append(df[df.polarity == -1].sample(minimum))
sampled

Unnamed: 0,body,star_rating
3977,I wondered about JK Rowling's ability to pen a...,4.0
4386,This is a fabulous read. If you're looking for...,4.0
5755,"Through the special argot Rowling utilizes, th...",5.0
1002,"In ""The Casual Vacancy"" J.K. Rowling has broug...",4.0
4857,The Casual Vacancy is a searing portrait of a ...,5.0
...,...,...
3141,tHE BOOK A CASUAL VACANCY I WAS DISAPPOINTED i...,2.0
4605,I've just finished reading this book and was l...,2.0
2084,"Tedious, boring - and a real chore to finish, ...",2.0
2171,"I don't know, but it doesn't seem like she wro...",2.0


### Feature matrix and response vector

### Train/test sets

In [None]:
train_X, test_X, train_c, test_c = cross_validation.train_test_split(X, c, stratify = c, train_size = .6, random_state = 0)

### TF-IDF and `TfidfVectorizer`

In [65]:
#vectorizer = feature_extraction.text.TfidfVectorizer(stop_words='english')

class CustomTokenizer(object):
    def __init__(self):
        self.stemmer = stem.porter.PorterStemmer()
        
    def __call__(self, document):
        tokens = tokenize_text(document)
        tokens = self.stemmer()

IndentationError: expected an indented block (<ipython-input-65-06675c8a2cc2>, line 7)

In [None]:
vectorizer.fit(train_X)

### Bag-of-words

In [None]:
vectorizer.get_feature_names()

### Transformed feature matrix `X`

In [None]:
# TODO

### Machine Learning Modeling

> # TODO...