# Natural Language Processing - Sentiment Analysis of Rotten Tomatoes quotes

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
# load data from rt_critics.csv in the data folder of our DAT2 repo
# at '../data/rt_critics.csv'
url = '../data/rt_critics.csv'
tomato = pd.read_csv(url)

In [4]:
# look at first 5 rows
tomato.head()

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
0,Derek Adams,fresh,114709.0,Time Out,"So ingenious in concept, design and execution ...",2009-10-04,9559.0,Toy story
1,Richard Corliss,fresh,114709.0,TIME Magazine,The year's most inventive comedy.,2008-08-31,9559.0,Toy story
2,David Ansen,fresh,114709.0,Newsweek,A winning animated feature that has something ...,2008-08-18,9559.0,Toy story
3,Leonard Klady,fresh,114709.0,Variety,The film sports a provocative and appealing st...,2008-06-09,9559.0,Toy story
4,Jonathan Rosenbaum,fresh,114709.0,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10,9559.0,Toy story


In [5]:
# Check the shape of dataframe
tomato.shape

(14072, 8)

In [6]:
# Fresh is the column with ratings.  Count the number of each value in column 'fresh'
tomato['fresh'].value_counts()

fresh     8613
rotten    5436
none        23
Name: fresh, dtype: int64

In [7]:
# vectorize the quotes and store it on a variable names Xcv
cv = CountVectorizer()
Xcv = cv.fit_transform(tomato['quote'])

In [8]:
# Check the shape of dataframe Xcv
Xcv.shape

(14072, 21544)

But wait! We have more features than samples. This would ensure overfitting. Let's trim that number down to the top 5000, ranked by the term frequency across all documents.

In [9]:
# Create an vectorizer object as a variable named vectorizer that includes just the top 5000
# Hint: check the documentation for CountVectorizer if needed
cv = CountVectorizer(max_features=5000)
Xcv = cv.fit_transform(tomato['quote'])

In [10]:
#  Create a new vectorized feature matix named Xcv with the new vectorizer
Xcv.shape

(14072, 5000)

In [11]:
# Create the response vector y where the value is 1 if "fresh" and 0 if any other value than fresh
tomato['result'] = tomato['fresh'].map({'fresh':1,'rotten':0,'none':0})

# OR
# tomato['result'] = np.where(tomato['fresh']=='fresh',1,0)

In [12]:
# Determine the null accuracy
tomato['result'].value_counts()/len(tomato)

In [18]:
# split the data into training and test sets
X = tomato['quote']
y = tomato['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
# Evaluate performance of models using test train split or cross_validation

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
X_train_dtm = cv.fit_transform(X_train)
X_test_dtm = cv.transform(X_test)

# use Naive Bayes to predict the star rating
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# calculate accuracy
print (accuracy_score(y_test, y_pred_class))

0.749569336779


In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [36]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('NB', MultinomialNB()),
])
cross_val_score(pipeline, X, y, cv=5, scoring='accuracy').mean()

0.75973708057401268

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
# Tune the logistic Regression regularization parameter "C" to improve performance.
# Evaluate performance of models using test train split
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('lg', LogisticRegression(C=10)),
])
cross_val_score(pipeline, X, y, cv=5, scoring='accuracy').mean()

0.74516886514339598

In [41]:
from sklearn.model_selection import GridSearchCV

In [50]:
#Bonus: Create a for loop to find the C value
# that produces the most accurate model 
param_grid = {'lg__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('lg', LogisticRegression()),
])
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid.fit(X,y)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'lg__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [53]:
print (grid.best_score_)
print (grid.best_params_)

0.756964184196
{'lg__C': 1}


# Stop Words

The performance isn't bad, but it's not great. Let's see if we can improve things by [using stop words](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer)?

In [49]:
# Modify your vectorizer to also remove stop words (still allow only 5000 features)

# create a new vectorizer object that only allows 5000 features
cv = CountVectorizer(max_features=5000,stop_words='english')

In [16]:
# Create a new X called Xcvs
Xcvs = tomato['quote']

In [55]:
# split the converted data (Xcvs) into training and test sets
X_train, X_test, y_train, y_test = train_test_split(Xcvs, y, test_size=0.33, random_state=42)

In [66]:
# Evaluate performance of models using the test data
# Tune the regularization parameter, C, to improve performance.
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('lg', LogisticRegression(C=10)),
])
cross_val_score(pipeline, Xcvs, y, cv=5, scoring='accuracy').mean()

0.72619448950087695

In [58]:
# Tune the regularization parameter, C, to improve performance.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('lg', LogisticRegression(C=100)),
])
cross_val_score(pipeline, Xcvs, y, cv=5, scoring='accuracy').mean()

0.73266167192861675

In [61]:
#Alternate tuning of C using for loop
all_vals =[]
for param in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('lg', LogisticRegression(C=param)),
])
    all_vals.append(cross_val_score(pipeline, Xcvs, y, cv=5, scoring='accuracy').mean())
best_val = max(all_vals)
print(best_val)

0.756965376746


In [65]:
params = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
print(params[all_vals.index(best_val)])

1


In [19]:
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english',ngram_range=(1, 2),  max_features=30000, min_df=3)),
    ('lg', LogisticRegression(C=10)),
])
cross_val_score(pipeline, Xcvs, y, cv=5, scoring='accuracy').mean()

0.72157551376691187

# tf-idf

If that didn't work, how about using tf-idf weighting?

http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

In [24]:
# edit this cell to create a TfidfVectorizer instead of a simple CountVectorizer
# or start with your own model with CountVectorizer from the cells above

# create vectorizer object
# vectorizer = CountVectorizer(max_features=5000)
vect = TfidfVectorizer()

# Create Xti and y
Xti = vect.fit_transform(tomato['quote'])
# Y = (df['fresh'] == 'fresh').values.astype(np.int8)

# split the converted data into training and test sets
# xtrainti, xtestti, ytrainti, ytestti = train_test_split(Xti, y)

In [25]:
xtrainti, xtestti, ytrainti, ytestti = train_test_split(Xti, y,test_size=0.33, random_state=42)

In [26]:
# Evaluate performance of the new model
lg = LogisticRegression()
lg.fit(xtrainti,ytrainti)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
predict_tf = lg.predict(xtestti)

In [28]:
from sklearn import metrics

In [30]:
print(metrics.accuracy_score(predict_tf,ytestti))

0.747846683893


In [32]:
print(metrics.classification_report(predict_tf,ytestti))

             precision    recall  f1-score   support

          0       0.51      0.75      0.61      1196
          1       0.90      0.75      0.81      3448

avg / total       0.80      0.75      0.76      4644



In [31]:
# Tune the regularization parameter, C, to improve performance.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lg', LogisticRegression(C=10)),
])
cross_val_score(pipeline, Xcvs, y, cv=5, scoring='accuracy').mean()

0.75980838099373638

In [None]:
#Bonus: if you have time find the best value of C using a for loop


# tf-idf and stop words

Do both together help?

In [35]:
# edit this cell to create a TfidfVectorizer that uses stop words
vect = TfidfVectorizer(stop_words='english')
Xti = vect.fit_transform(tomato['quote'])
xtrainti, xtestti, ytrainti, ytestti = train_test_split(Xti, y,test_size=0.33, random_state=42)
lg = LogisticRegression()
lg.fit(xtrainti,ytrainti)
predict_tf = lg.predict(xtestti)

print(metrics.accuracy_score(predict_tf,ytestti))
print(metrics.classification_report(predict_tf,ytestti))

0.742463393626
             precision    recall  f1-score   support

          0       0.46      0.77      0.58      1063
          1       0.92      0.73      0.81      3581

avg / total       0.81      0.74      0.76      4644



In [None]:
# Evaluate performance of models
# Tune the regularization parameter, C, to improve performance.


In [None]:
# Tune the regularization parameter, C, to improve performance.


# Next steps

Are you satisfied with these results? Why might you be less than satisfied? How can you explain the observed behavior? What are the next steps you would need to do to improve this classifier? If you have time remaining, try a few strategies out below.

In [36]:
from sklearn.feature_extraction.text import TfidfTransformer

In [38]:
# continue playing here
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('lg', LogisticRegression()),
])
cross_val_score(pipeline, Xcvs, y, cv=5, scoring='accuracy').mean()
# Use pipeline to evaluate accuracy with cross validation

0.74261149240177038

# More Next Steps

The hardest part of creating a sentiment model is finding good training data. Googling 'sentiment analysis training data' or 'sentiment analysis test data' turns up a few freely available sources. Most of them are hosted by universities.

But notice, determining the judgment of a movie review isn't the same task as determining the emotional content of a tweet. And yet, it kind of is. The computer doesn't know anything about nature of the text. All it knows is that there are documents with one label (fresh/happy) and documents with another label (rotten/sad) and it needs to fit a model to discriminate between the two. This can be extended to more classes (look into the 20 newsgroups dataset in sci-kit learn) and to proprietary corpora.

One application you might use at work is classifying support emails from users. The classes may be 'ranting', 'mischarge', 'lost order', 'gushing'. Or whatever is common. Even if the classifier isn't perfect, it could help streamline the process of getting the right emails to the right support personnel.

In [None]:
from IPython.display import HTML
HTML('''
<style>
.text_cell_render {
  background-color: silver
}
</style>
''')