In [None]:
!wget https://raw.githubusercontent.com/perryism/machine_learning_101/master/data/imdb_labelled.txt -O /tmp/imdb.csv

# Reference

[sklearn doc](https://scikit-learn.org/stable/modules/classes.html)

# Load data

In [None]:
import pandas as pd
from IPython.display import Image
from IPython.core.display import HTML 
import sys
sys.path.append("..")
from lib import is_rotten, predict, plot_confusion_matrix, Sentiment

import warnings
warnings.filterwarnings('ignore')

# The file is tab-separated and has no header row, so columns are named after loading.
reviews = pd.read_csv(
    '/tmp/imdb.csv',
    sep="\t",
    header=None,
)
reviews.columns = ['review', 'like']

# Split data
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_raw_train, X_raw_test, y_train, y_test = train_test_split(
    reviews['review'],
    reviews['like'],
    test_size=0.33,
    random_state=42,
)




# Explore

Real-world data is rarely clean: it is messy and noisy. For example, transcripts produced by VoiceBase are not always accurate.

In [None]:
from IPython.display import display

# Show the full review text instead of truncating long cells. `None` means
# "no limit" (pandas >= 1.0); the old `-1` sentinel is deprecated and is
# rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# Lift the row limit only for this display so every review is rendered.
with pd.option_context("display.max_rows", reviews.shape[0]):
    display(reviews)

In [None]:
%matplotlib inline
reviews['like'].hist()

# How would you do it?

In [None]:
from sklearn.metrics import f1_score, accuracy_score, classification_report
from toolz import curry

@curry
def is_good(positive_words, review):
    """Return 1 if `review` contains any of `positive_words`, otherwise 0.

    The check is a case-insensitive substring test, so a word also matches
    when it appears inside a longer word. Because the function is curried,
    calling it with only `positive_words` yields a one-argument function
    of `review`.
    """
    text = review.lower()
    matched = any(word.lower() in text for word in positive_words)
    return 1 if matched else 0

def predict(df, positive_words):
    """Label every entry of `df` with `is_good` for the given word list.

    `df` must offer `.apply(func)`; the notebook passes a pandas Series of
    review strings. Returns whatever `df.apply` returns.

    NOTE: this definition replaces the `predict` imported from `lib` at the
    top of the notebook.
    """
    return df.apply(is_good(positive_words))

# Try adding 'excellent' to the list.
positive_words = ['good']

# Score the hand-written keyword rule against the training labels.
y_predict = predict(X_raw_train, positive_words)
accuracy_score(y_true=y_train, y_pred=y_predict)

# Machine learning approach



In [None]:
# DEMO
# Obtain a model through the project's `Sentiment.demo()` helper and run it on
# a single sentence. How the demo model is built is defined in `lib`, not in
# this notebook — presumably a pre-trained sentiment classifier; confirm there.

s = Sentiment.demo()
s.predict("This movie is boring")

In [None]:
# bag of words: each document becomes a vector of per-word counts

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# `toarray()` yields a plain ndarray; scipy's `todense()` returns np.matrix.
m = vectorizer.fit_transform(["The quick brown fox jumps over the lazy dog"]).toarray()
# `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is its replacement (available since 1.0).
f = vectorizer.get_feature_names_out()
# One row of counts, each column labelled with the vocabulary word it counts.
pd.DataFrame(m, columns=f)

In [None]:
# Words missing from the fitted vocabulary ("is", "in", "box") are dropped;
# known words are counted. `toarray()` returns an ndarray rather than np.matrix.
vectorizer.transform(["The fox is in the box"]).toarray()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Labels come straight from the `like` column.
y = reviews['like']

# Features: bag-of-words counts with a vocabulary learned from every review.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews['review'])

# Common Terminologies
- `X`: features (the inputs the model learns from)

- `y`: labels (the values the model learns to predict)

In [None]:
from sklearn.linear_model import LogisticRegression

# LogisticRegression is a classifier; see
# https://en.wikipedia.org/wiki/Logistic_regression for more details.

clf = LogisticRegression()
clf.fit(X=X, y=y)

In [None]:
# Score the model on the very same rows it was fitted on.
y_predict = clf.predict(X)
print(accuracy_score(y_true=y, y_pred=y_predict))

## It looks so good

## Can we say our model is very accurate?  

## Or is it really? What is wrong?

# Memorization vs Learning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

X_raw_train, X_raw_test, y_train, y_test = train_test_split(
    reviews['review'],
    reviews['like'],
    test_size=0.33,
    random_state=42,
)

# Refit on the training rows only, then score on rows the classifier has not
# been fitted on. The vectorizer keeps the vocabulary it learned earlier.
clf.fit(vectorizer.transform(X_raw_train), y_train)
y_predict = clf.predict(vectorizer.transform(X_raw_test))

print(accuracy_score(y_test, y_predict))

# Find the right measurement

In [None]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Held-out accuracy once more, as the starting point for the metrics below.
test_features = vectorizer.transform(X_raw_test)
y_predict = clf.predict(test_features)
print(accuracy_score(y_test, y_predict))

In [None]:
# Imagine you are writing a spam filter, and the samples you collected are mostly ham.

y_true = [0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1]

# A "model" that ALWAYS returns 0 never flags any spam ...
y_predict = [0] * len(y_true)

# ... and is still right on every ham sample.
print(accuracy_score(y_true, y_predict))

In [None]:
# What is precision? What is recall?

precision_recall_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Precisionrecall.svg/700px-Precisionrecall.svg.png"
Image(url=precision_recall_url)

In [None]:
# The F1 score takes both precision and recall into consideration.

f1_formula_url = "https://wikimedia.org/api/rest_v1/media/math/render/svg/057ffc6b4fa80dc1c0e1f2f1f6b598c38cdd7c23"
Image(url=f1_formula_url)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

y_predict = clf.predict(vectorizer.transform(X_raw_test))

# Same held-out predictions, three views: accuracy, F1, and the per-class report.
for metric in (accuracy_score, f1_score, classification_report):
    print(metric(y_test, y_predict))

# Confusion matrix

Precision/recall trade-off

E.g., if you are writing a parental control filter for children, you probably want high precision.

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
# Use pyplot directly: the `pylab` shim is discouraged by matplotlib, and the
# rest of the notebook already imports `matplotlib.pyplot as plt`.
import matplotlib.pyplot as plt

# Rows are the actual labels, columns the predicted labels, on the held-out set.
cm = confusion_matrix(y_test, y_predict)
plt.figure(figsize=(10,5))
plot_confusion_matrix(cm)

In [None]:
import numpy as np

# Line up each held-out review with its predicted and its actual label.
result = pd.DataFrame({
    'predict': np.asarray(y_predict),
    'actual': np.asarray(y_test),
})
# Drop the shuffled index so the review text aligns positionally with `result`.
compare = pd.concat([result, X_raw_test.reset_index(drop=True)], axis=1)

compare.query("actual == 1").head(50)

In [None]:
# test it yourself: edit the sentence below and re-run the cell

my_review = "This is so boring."
sentiment = Sentiment(vectorizer, clf)
sentiment.predict(my_review)

In [None]:
# word cloud

# Pick the words whose learned weight is clearly positive or clearly negative.
coef = np.array(clf.coef_)
positive_idx = np.where( coef > .4 )[1]
negative_idx = np.where( coef < -0.4 )[1]
# `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is its replacement (available since 1.0).
features = vectorizer.get_feature_names_out()

# Word -> weight maps used as word-cloud "frequencies".
pos_freq = {features[idx]: float(coef[0, idx]) for idx in positive_idx}
# WordCloud scales every value by the largest one, so it needs positive
# numbers: use the magnitude of the negative weights, otherwise the most
# negative words are not the most prominent ones.
neg_freq = {features[idx]: float(abs(coef[0, idx])) for idx in negative_idx}

from wordcloud import WordCloud
wordcloud = WordCloud().generate_from_frequencies(pos_freq)

import matplotlib.pyplot as plt
plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis("off")

In [None]:
import matplotlib.pyplot as plt

# Same rendering as the positive cloud, fed with the negative-weight words.
wordcloud = WordCloud().generate_from_frequencies(neg_freq)

plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis("off")