In [1]:
import pandas as pd
import numpy as np

In [2]:
# Both files are read with the same Windows-1252 encoding.
train, test = (
    pd.read_csv(csv_name, encoding='cp1252')
    for csv_name in ("train.csv", "test.csv")
)

In [3]:
# let's take a peek at our training and test datasets:
# the training reviews, their target labels, and the test reviews
train['Review'], train['Target'], test['Review']

(0    I’ve seen a lot of bad reviews for this phone ...
 1    This phone looks and performs great like it's ...
 2    Don't listen to bad reviews! My phone arrived ...
 3    Love this phone! I am so glad I bought a refur...
 4    First, seller did a great job and I think I go...
 5    Received prompt delivery of the phone. I inser...
 6    Overall, the phone isn't too bad for the price...
 7    The iPhone 7 I purchased was "certified refurb...
 8    Initially I was happy with the phone. It looke...
 9    Be cautious - if you have ANY issues at all, r...
 Name: Review, dtype: object,
 0    1
 1    1
 2    1
 3    1
 4    1
 5    1
 6    0
 7    0
 8    0
 9    0
 Name: Target, dtype: int64,
 0    The phone arrived in pretty decent condition. ...
 1    iPhone 7 Black came in excellent condition. Li...
 Name: Review, dtype: object)

In [4]:
# Everything looks fine. Let's proceed...

In [5]:
# Key words whose occurrences are counted in every review;
# the order here fixes the order of the feature columns.
key_words = [
    'great',
    'happy',
    'bad',
    'return',
]

In [6]:
# import string
import re

def build_vocabulary(key_words, text):
    """
    Count how many times each key word occurs in a text as a whole token.

    Matching is case-insensitive: both the text and the key words are
    lower-cased before they are compared. The text is split on spaces,
    commas, periods, newlines, colons, semicolons, pipes and slashes, and a
    key word is counted only when it equals a complete token (so
    ``returned`` is not counted as ``return``). Other punctuation such as
    ``!`` or ``?`` is not a delimiter, so the token ``great!`` does not
    match ``great``.

    INPUT:
        key_words:   list of key words
        text:        text (a str) where you search for key words
    OUTPUT:
        occurrences: list of occurrences of the key words, in the same
                     order as key_words
    """
    # we convert all letters into lower case
    text_lower = text.lower()

    # and split the text into words
    # (otherwise, a word with the same root may be counted instead of the key word,
    # e.g., ``returned'' would be counted as ``return'').
    # A raw string is used so the pattern contains no invalid escape sequences;
    # inside a character class ``|`` needs no backslash.
    text_as_words = re.split(r'[ ,.\n:;|/]', text_lower)

    # key words are lower-cased too, so that e.g. 'Great' matches 'great'
    return [text_as_words.count(word.lower()) for word in key_words]

In [7]:
# Let's look at what we have now

# print and save as X_train the lists of occurrences of the key words in the training data
# (we iterate over the reviews themselves instead of looking them up by the labels
#  0..n-1, which would only work while the DataFrame has its default integer index)
X_train = []
for review in train['Review']:
    current_occurrences = build_vocabulary(key_words, review)
    print(current_occurrences)
    X_train.append(current_occurrences)

[0, 2, 1, 0]
[1, 1, 0, 1]
[1, 1, 1, 0]
[1, 1, 0, 0]
[1, 1, 1, 0]
[0, 1, 0, 0]
[0, 0, 3, 1]
[1, 0, 1, 0]
[1, 1, 1, 1]
[0, 0, 0, 2]


In [8]:
# print and save as X_test the lists of occurrences of the key words in the test data
# (we iterate over the reviews themselves instead of looking them up by the labels
#  0..n-1, which would only work while the DataFrame has its default integer index)

X_test = []
for review in test['Review']:
    current_occurrences = build_vocabulary(key_words, review)
    print(current_occurrences)
    X_test.append(current_occurrences)

[1, 0, 1, 1]
[1, 1, 1, 1]


In [9]:
# keep the target labels of the training reviews as a plain Python list
y_train = train['Target'].tolist()
print(y_train)

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]


In [10]:
from sklearn.naive_bayes import MultinomialNB

# Model: Multinomial Naive Bayes.
#
# alpha is the parameter of the so-called Laplacian (additive) smoothing.
# This simple model is meant to run without smoothing, i.e. with alpha = 0;
# a negligibly small positive value (1.0e-10) is passed as a stand-in for zero.
# NOTE(review): the restriction on alpha = 0 presumably comes from scikit-learn
# rather than from Python itself - confirm against the installed version.
#
# fit_prior = True because we do want the class priors to be fitted from the
# training labels; otherwise the uniform distribution would be chosen.
our_classifier = MultinomialNB(alpha=1.0e-10, fit_prior=True).fit(X_train, y_train)

# per-class probabilities and the predicted class for every test review
probabilities = our_classifier.predict_proba(X_test)
y_test = our_classifier.predict(X_test)
print(y_test, "\n", probabilities)

[0 1] 
 [[0.81274382 0.18725618]
 [0.43663546 0.56336454]]


In [11]:
# Report, for every test review, both class probabilities and the predicted label.
for i in range(len(test)):
    prob_negative, prob_positive = probabilities[i]
    # class 0 is reported as negative, anything else as positive
    verdict = '(Negative)' if y_test[i] == 0 else '(Positive)'
    print('TEST #', i + 1)
    print('Probabilities:', prob_negative, ' vs ', prob_positive)
    print('Prediction:', y_test[i], verdict)

TEST # 1
Probabilities: 0.8127438231342611  vs  0.18725617686573853
Prediction: 0 (Negative)
TEST # 2
Probabilities: 0.43663546178460994  vs  0.5633645382153898
Prediction: 1 (Positive)


What if we calculate the priors ourselves, and then use the Bayes formula?
Let's see and compare the results.

In [4]:
import math
# TEST review #1 (key-word counts [1, 0, 1, 1] for great / happy / bad / return):

# Unnormalized score of the positive class:
# the prior 6/10 (6 of the 10 training reviews are positive) times, for each key word,
# its share of all key-word occurrences in the positive reviews (4, 7, 3 and 1 out of 15)
# raised to the number of times the word occurs in this review.
p1_pos = 6/10 * math.pow(4/15, 1) * math.pow(7/15, 0) * math.pow(3/15, 1) * math.pow(1/15, 1)

# Unnormalized score of the negative class:
# the prior 4/10 and the shares 2, 1, 5 and 4 out of 12;
# the factor 1 stands for (1/12) ** 0, since 'happy' does not occur in this review.
p1_neg = 4/10 * 2/12 * 1 * 5/12 * 4/12

# Normalize so that the two scores sum to 1:
# prints the probability of the negative class, then of the positive class.
print(p1_neg/(p1_pos + p1_neg), p1_pos/(p1_pos + p1_neg))

0.812743823146944 0.18725617685305593


In [13]:
# TEST review #2 (key-word counts [1, 1, 1, 1] for great / happy / bad / return):

# Unnormalized score of the positive class:
# the prior 6/10 times the share of each key word among the key-word occurrences
# in the positive reviews (4, 7, 3 and 1 out of 15); every key word occurs once.
p2_pos = 6/10 * 4/15 * 7/15 * 3/15 * 1/15

# Unnormalized score of the negative class:
# the prior 4/10 times the shares 2, 1, 5 and 4 out of 12.
p2_neg = 4/10 * 2/12 * 1/12 * 5/12 * 4/12

# Normalize so that the two scores sum to 1:
# prints the probability of the negative class, then of the positive class.
print(p2_neg/(p2_pos + p2_neg), p2_pos/(p2_pos + p2_neg))

0.4366354617856643 0.5633645382143356


In [2]:
import math
# NOTE(review): this appears to repeat the TEST review #1 calculation (exponents 1, 0, 1, 1)
# with add-one (Laplace, alpha = 1) smoothing: every count is increased by 1 and
# every total by 4, the number of key words - confirm the intent.
# It re-assigns p1_pos and p1_neg, overwriting the unsmoothed values.

# Unnormalized score of the positive class:
# prior 6/10, smoothed shares (4+1)/(15+4), (7+1)/(15+4), (3+1)/(15+4), (1+1)/(15+4).
p1_pos = 6/10 * math.pow(5/19, 1) * math.pow(8/19, 0) * math.pow(4/19, 1) * math.pow(2/19, 1)

# Unnormalized score of the negative class:
# prior 4/10, smoothed shares (2+1)/(12+4), (5+1)/(12+4), (4+1)/(12+4);
# the factor 1 stands for the 'happy' share raised to the power 0.
p1_neg = 4/10 * 3/16 * 1 * 6/16 * 5/16

# Normalize so that the two scores sum to 1:
# prints the probability of the negative class, then of the positive class.
print(p1_neg/(p1_pos + p1_neg), p1_pos/(p1_pos + p1_neg))

0.7152490527998889 0.2847509472001112
