In [1]:
import pandas as pd
import re
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def tokenize_review(review):
    review = review.lower().replace("'", "")
    review = review.replace('"', '')
    review = re.sub('[\[\]\(\)\:\;\.,]',' ',review)
    review = re.split('[\/<\?!\s]',review)
    return [token.strip() for token in review if token.strip() != '']

def preprocess_review(review):
    review = review.lower()#.replace("'", "")
    review = review.replace('"', '')
    review = re.sub('[\[\]\(\)\:\;,]',' ',review)
    review = re.sub('[\/<\?!\.]','',review)
    return review

def build_evaluate_model(x, y, print_stats):
    data = x
    target = y
    # Instantiate our model and store it in a new variable.
    bnb = BernoulliNB()

    # Fit our model to the data.
    bnb.fit(data, target)

    # Classify, storing the result in a new variable.
    #y_pred = bnb.predict(data)
    y_pred = cross_val_predict(bnb, data, target)

    n = data.shape[0]
    # Display our results.
    if print_stats:
        print("Number of mislabeled points out of a total {} points : {}".format(
            n,
            (target != y_pred).sum()
        ))

    correct = (n - (target != y_pred).sum())/n * 100
    if print_stats:
        print("Accuracy: {}%".format(correct))
    
    
    cv = cross_val_score(bnb, data, target, cv=5)
    
    if print_stats:
        print(cv)
        print("cv average is = {:.2f}%".format(cv.mean()*100))

        cm = confusion_matrix(target, y_pred)
        print(cm)
        print("True Positive = {:.2f}%".format(cm[1,1]/10))
        print("False Positive = {:.2f}%".format(cm[0,1]/10))
        print("True Negative = {:.2f}%".format(cm[0,0]/10))
        print("False Negative = {:.2f}%".format(cm[1,0]/10))
        print("sensitivity or hit rate is {:.2f}%".format(cm[1,1]/(cm[1,0]+cm[1,1])*100))
        # sensitivity (recall) is the percentage of positives identified or TP/FN+TP
        print("specificity or True Negative rate is {:.2f}%".format(cm[0,0]/(cm[0,0]+cm[0,1])*100))
    return bnb, cv.mean()

In [3]:
# Load Data
reviews = pd.read_csv('./amazon_cells_labelled.txt', sep='\t', header=None)
reviews.columns = ['review', 'score']
# Let's preprocess the review text
# remove special characters
# replace punctuation with a space
reviews["review_processed"] = reviews.review.apply(preprocess_review)

In [4]:
reviews.head()

Unnamed: 0,review,score,review_processed
0,So there is no way for me to plug it in here i...,0,so there is no way for me to plug it in here i...
1,"Good case, Excellent value.",1,good case excellent value
2,Great for the jawbone.,1,great for the jawbone
3,Tied to charger for conversations lasting more...,0,tied to charger for conversations lasting more...
4,The mic is great.,1,the mic is great


In [5]:
# Split into train and test sets (70/30 split)
X_train, X_test, y_train, y_test = train_test_split(reviews.review_processed, reviews.score, test_size=0.3, random_state=42)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [6]:
# Now tokenize each review to generate an overall list of tokens unique to each class
# get words unique to negative reviews
p = reviews.query("score == 1").review_processed.str.split(expand=True).stack()
n = p = reviews.query("score == 0").review_processed.str.split(expand=True).stack()
tokens_positive = set(p.values)
tokens_negative = set(n.values)

tokens_positive = []
tokens_negative = []
reviews.query("score == 1").review_processed.apply(lambda x: tokens_positive.extend(tokenize_review(x)))
reviews.query("score == 0").review_processed.apply(lambda x: tokens_negative.extend(tokenize_review(x)))

x = list(set(tokens_negative).difference(set(tokens_positive)))
y = list(set(tokens_positive).difference(set(tokens_negative)))
# now want to get a count of word (in x) in  tokens_negative
# replace all numbers (ex 45, 11) with _number token
tokens_negative = []
tokens_positive = []
for token in x:
    if len(token) == 1:
        continue
    if token.isdigit():
        token = "NUMBER"
    if re.match('[^a-z]+[0-9]+', token):
        token = "NUMCHAR"
    if re.match('#[0-9]+', token):
        token = "RANK"
    if re.match('[a-z]\*+', token):
        token = "EXPLITIVE"
    tokens_negative.append(token)


for token in y:
    if len(token) == 1:
        continue
    if token.isdigit():
        token = "NUMBER"
    if re.match('[^a-z]+[0-9]+', token):
        token = "NUMCHAR"
    if re.match('#[0-9]+', token):
        token = "RANK"
    if re.match('[a-z]\*+', token):
        token = "EXPLITIVE"
    tokens_positive.append(token)    
    
# NUMBER is pretty balanced (12 positive, 13 negative), so lets remove that one
tokens_negative = set(tokens_negative)
tokens_positive = set(tokens_positive)
tokens_negative.remove("NUMBER")
tokens_positive.remove("NUMBER")
print("Tokens (negative): {} (positive): {}".format(len(tokens_negative), len(tokens_positive)))
print("Saturated model will have {} features".format(len(tokens_negative)+len(tokens_positive)))

Tokens (negative): 758 (positive): 598
Saturated model will have 1356 features


Now that we have our data in a form that is better suited for analysis, let's build some models!

# Models

## Model 1
We are going to manually specify some keywords that we believe are good indicator of review sentiment

In [7]:
# Let's add some features that I suspect will help
tokens_my = ['back', 'best', 'easy', 'cool', "doesn't",
             'fine', 'good', 'great', 'happy', 'love',
             'money', 'never', 'not', 'recommend', 
             'very', 'well', 'worst']

In [8]:
for word in tokens_my:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    X_train[str(word)] = X_train.review_processed.str.contains(str(word), case=False)
    X_test[str(word)] = X_test.review_processed.str.contains(str(word), case=False)

In [9]:
x2 = X_train.copy()
x2["score"] = y_train
means = x2.groupby("score").mean()
means

Unnamed: 0_level_0,back,best,easy,cool,doesn't,fine,good,great,happy,love,money,never,not,recommend,very,well,worst
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,0.014124,0.002825,0.008475,0.002825,0.025424,0.002825,0.025424,0.00565,0.008475,0.0,0.036723,0.011299,0.225989,0.00565,0.081921,0.011299,0.031073
1,0.00289,0.046243,0.026012,0.017341,0.00289,0.020231,0.109827,0.182081,0.028902,0.043353,0.00289,0.00578,0.034682,0.043353,0.176301,0.072254,0.0


In [10]:
(means.iloc[0,:] / means.iloc[1,:]).sort_values()

love          0.000000
great         0.031029
best          0.061088
recommend     0.130320
fine          0.139629
well          0.156384
cool          0.162900
good          0.231490
happy         0.293220
easy          0.325800
very          0.464666
never         1.954802
back          4.887006
not           6.516008
doesn't       8.796610
money        12.706215
worst              inf
dtype: float64

*Those token ratios near 1, or inf are questionable, see if any of these need to be included.*

  1. sad
  2. avoid
  3. awful
  4. negative
  5. satisfactory
  6. quality
  7. problem
  

In [11]:
model1, cv_mean = build_evaluate_model(X_train[tokens_my], y_train, print_stats=True)

Number of mislabeled points out of a total 700 points : 176
Accuracy: 74.85714285714286%
[0.74468085 0.75714286 0.75       0.7        0.79136691]
cv average is = 74.86%
[[314  40]
 [136 210]]
True Positive = 21.00%
False Positive = 4.00%
True Negative = 31.40%
False Negative = 13.60%
sensitivity or hit rate is 60.69%
specificity or True Negative rate is 88.70%


In [12]:
# Now let's see how it performs on the test set!
y_pred = model1.predict(X_test.iloc[:,1:])

n = 300
# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
        n,
        (y_test != y_pred).sum()
    ))
correct = (n - (y_test != y_pred).sum())/n * 100
print("Accuracy: {}%".format(correct))

Number of mislabeled points out of a total 300 points : 77
Accuracy: 74.33333333333333%


Conclusion: this model has an accuracy of **74.86%** on both the train and test sets, and is **74.33%** accurate on the test set.

## Model 2
Now we are going to randomly select 50 features (from tokens); we sample from negative and positive tokens, and depending on the objective we vary the sampling rates. 

In [14]:
for word in list(tokens_negative) + list(tokens_positive):
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    X_train[str(word)] = X_train.review_processed.str.contains(str(word), case=False)
    X_test[str(word)] = X_test.review_processed.str.contains(str(word), case=False)

In [15]:
# repeat this process until you get a good set of features
all_features = pd.Series(X_train.columns[1:])
features = []
model2 = None
cv_max = 0.0
for i in range(1,1000):
    featuresset = list(all_features.sample(50)) + tokens_my
    new_model, new_cv = build_evaluate_model(X_train[featuresset], y_train, print_stats=False)
    if new_cv > cv_max:
        cv_max = new_cv
        features = featuresset
        model2 = new_model

In [17]:
model2, cv2 = build_evaluate_model(X_train[features], y_train, print_stats=True)

Number of mislabeled points out of a total 700 points : 156
Accuracy: 77.71428571428571%
[0.76595745 0.80714286 0.78571429 0.72142857 0.82014388]
cv average is = 78.01%
[[323  31]
 [125 221]]
True Positive = 22.10%
False Positive = 3.10%
True Negative = 32.30%
False Negative = 12.50%
sensitivity or hit rate is 63.87%
specificity or True Negative rate is 91.24%


In [18]:
# Now let's see how it performs on the test set!
y_pred2 = model2.predict(X_test[features])

n = 300
# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
        n,
        (y_test != y_pred2).sum()
    ))
correct = (n - (y_test != y_pred2).sum())/n * 100
print("Accuracy: {}%".format(correct))

Number of mislabeled points out of a total 300 points : 72
Accuracy: 76.0%


We sampled from the entire set of tokens (50 tokens each) 1000 times, built a Naive Bayes model for each set of features and got our best performing set of features (k = 50). This model was **78.01**% accurate on the train set, and **76%** on the test set.

## Model 3

Let's manually remove some features from model 2 that do not contribute a lot.

In [27]:
x3 = X_train.copy()
x3["score"] = y_train
means = x3.groupby("score")[features].mean()
means

Unnamed: 0_level_0,point,buyers,sound-wise,handsfree,explain,NUMCHAR,unintelligible,compromise,hot,cutouts,...,great,happy,love,money,never,not,recommend,very,well,worst
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.062147,0.002825,0.002825,0.0,0.002825,0.0,0.0,0.0,0.00565,0.0,...,0.00565,0.008475,0.0,0.036723,0.011299,0.225989,0.00565,0.081921,0.011299,0.031073
1,0.0,0.0,0.0,0.00578,0.0,0.0,0.0,0.00289,0.00289,0.0,...,0.182081,0.028902,0.043353,0.00289,0.00578,0.034682,0.043353,0.176301,0.072254,0.0


In [52]:
ratios = (means.iloc[0,:] / means.iloc[1,:]).sort_values()
mask = (means.iloc[0,:]==0) & (means.iloc[1,:]==0)
ign1 = set(means.columns[mask])
ign2 = set(means.columns[ratios.between(0.6,1.4)])
ign3 = set(means.columns[(ratios>0) & (ratios<0.3)])

In [57]:
features3 = list(set(means.columns).difference(ign1).difference(ign2).difference(ign3))

In [58]:
model3, cv3 = build_evaluate_model(X_train[features3], y_train, print_stats=True)

Number of mislabeled points out of a total 700 points : 159
Accuracy: 77.28571428571429%
[0.76595745 0.80714286 0.77857143 0.72142857 0.8057554 ]
cv average is = 77.58%
[[326  28]
 [131 215]]
True Positive = 21.50%
False Positive = 2.80%
True Negative = 32.60%
False Negative = 13.10%
sensitivity or hit rate is 62.14%
specificity or True Negative rate is 92.09%


In [59]:
# Now let's see how it performs on the test set!
y_pred3 = model3.predict(X_test[features3])

n = 300
# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
        n,
        (y_test != y_pred3).sum()
    ))
correct = (n - (y_test != y_pred3).sum())/n * 100
print("Accuracy: {}%".format(correct))

Number of mislabeled points out of a total 300 points : 74
Accuracy: 75.33333333333333%


So this model's performance was slightly worse, at **77.58%** on the train set and **75.33%** on the test set.

## Model 4
We will oversample our negative tokens for this model.

In [64]:
# repeat this process until you get a good set of features
features4 = []
model4 = None
cv_max = 0.0
for i in range(1,1000):
    featuresset = random.sample(tokens_negative, 35) + random.sample(tokens_positive, 15) + tokens_my
    new_model, new_cv = build_evaluate_model(X_train[featuresset], y_train, print_stats=False)
    if new_cv > cv_max:
        cv_max = new_cv
        features4 = featuresset
        model4 = new_model

In [65]:
build_evaluate_model(X_train[features4], y_train, print_stats=True)

Number of mislabeled points out of a total 700 points : 155
Accuracy: 77.85714285714286%
[0.78014184 0.77857143 0.79285714 0.71428571 0.82014388]
cv average is = 77.72%
[[328  26]
 [129 217]]
True Positive = 21.70%
False Positive = 2.60%
True Negative = 32.80%
False Negative = 12.90%
sensitivity or hit rate is 62.72%
specificity or True Negative rate is 92.66%


(BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
 0.7772000029156005)

In [66]:
# Now let's see how it performs on the test set!
y_pred4 = model4.predict(X_test[features4])

n = 300
# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
        n,
        (y_test != y_pred4).sum()
    ))
correct = (n - (y_test != y_pred4).sum())/n * 100
print("Accuracy: {}%".format(correct))

Number of mislabeled points out of a total 300 points : 76
Accuracy: 74.66666666666667%


So when we oversample negative tokens, our model is slightly more accurate on the train set at **77.85%** but less on the test set at **74.66%**, however the specificity (true negative rate) increases (as expected), but not by very much. 

## Model 5

*VADER*

In [68]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
def nltk_sentiment(sentence, return_list = True):
    nltk_sentiment = SentimentIntensityAnalyzer()
    score = nltk_sentiment.polarity_scores(sentence)
    # score will look like:
    # {'neg': 0.0, 'neu': 0.667, 'pos': 0.333, 'compound': 0.3612}
    if return_list:
        # [0.0, 0.667, 0.333, 0.3612]
        score = [ item[1] for item in score.items() ]
        return score[0], score[1], score[2], score[3]
    else:
        return score

In [77]:
scores = X_train.review_processed.apply(nltk_sentiment)
X_train["neg"] = [score[0] for score in scores]
X_train["neu"] = [score[1] for score in scores]
X_train["pos"] = [score[2] for score in scores]
X_train["compound"] = [score[3] for score in scores]

In [78]:
features5 = ["neg","neu","pos","compound"]
model5, cv5 = build_evaluate_model(X_train[features5], y_train, print_stats=True)

Number of mislabeled points out of a total 700 points : 112
Accuracy: 84.0%
[0.87234043 0.81428571 0.85       0.82857143 0.83453237]
cv average is = 83.99%
[[297  57]
 [ 55 291]]
True Positive = 29.10%
False Positive = 5.70%
True Negative = 29.70%
False Negative = 5.50%
sensitivity or hit rate is 84.10%
specificity or True Negative rate is 83.90%


In [79]:
scores2 = X_test.review_processed.apply(nltk_sentiment)
X_test["neg"] = [score[0] for score in scores2]
X_test["neu"] = [score[1] for score in scores2]
X_test["pos"] = [score[2] for score in scores2]
X_test["compound"] = [score[3] for score in scores2]

In [80]:
# Now let's see how it performs on the test set!
y_pred5 = model5.predict(X_test[features5])

n = 300
# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
        n,
        (y_test != y_pred5).sum()
    ))
correct = (n - (y_test != y_pred5).sum())/n * 100
print("Accuracy: {}%".format(correct))

Number of mislabeled points out of a total 300 points : 47
Accuracy: 84.33333333333334%


This model uses the VADER algorithm to give the review a negative, neutral, positive and compound scores, and then uses these 4 features to predict the sentiment. It was **83.99%** accurate on the train set, and **84.33%** accurate on the test set.