In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [123]:
"""Bag of Words Meets Bags of Popcorn
https://www.kaggle.com/competitions/word2vec-nlp-tutorial
    
Data fields
    id - Unique ID of each review
    sentiment - Sentiment of the review; 1 for positive reviews and 0 for negative reviews
    review - Text of the review
"""

'Bag of Words Meets Bags of Popcorn\nhttps://www.kaggle.com/competitions/word2vec-nlp-tutorial\n    \nData fields\n    id - Unique ID of each review\n    sentiment - Sentiment of the review; 1 for positive reviews and 0 for negative reviews\n    review - Text of the review\n'

In [114]:
# Load data
train = pd.read_csv('data/labeledTrainData.tsv',delimiter='\t')
test = pd.read_csv('data/testData.tsv',delimiter='\t')

print('========= train set: ', train.shape)
print(train, '\n')
print('========= test set: ', test.shape)
print(test, '\n')

            id  sentiment                                             review
0       5814_8          1  With all this stuff going down at the moment w...
1       2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2       7759_3          0  The film starts with a manager (Nicholas Bell)...
3       3630_4          0  It must be assumed that those who praised this...
4       9495_8          1  Superbly trashy and wondrously unpretentious 8...
...        ...        ...                                                ...
24995   3453_3          0  It seems like more consideration has gone into...
24996   5064_1          0  I don't believe they made this film. Completel...
24997  10905_3          0  Guy is a loser. Can't get girls, needs to buil...
24998  10194_3          0  This 30 minute documentary Buñuel made in the ...
24999   8478_8          1  I saw this movie as a child and it broke my he...

[25000 rows x 3 columns] 

             id                                 

In [115]:
# data preprocessing
import re

def review_preprocessing(review):
    # Only alphabet
    review_filtered = re.sub("[^a-zA-Z]", " ", review)
    
    # Only lowercase
    review_filtered = review_filtered.lower()
    
    # Remove stop words
    # TBD
    
    return (review_filtered)

full_train_y = train['sentiment']
full_train_X = []
for review in train['review']:
    full_train_X.append(review_preprocessing(review))
full_train_X = np.array(full_train_X)

full_test_X = []
for review in test['review']:
    full_test_X.append(review_preprocessing(review))
full_test_X = np.array(full_test_X)

print(full_train_X.shape)
print(full_test_X.shape)

(25000,)
(25000,)


In [116]:
# data spliting
from sklearn.model_selection import train_test_split

train_X, validation_X, train_y, validation_y = train_test_split(
    full_train_X,
    full_train_y,
    test_size=0.2,
    random_state=0
)

print(train_X.shape)
print(validation_X.shape)

(20000,)
(5000,)


In [117]:
# build vocabulary dictionary on train data
def GetVocabulary(data):
    vocab_dict = {}
    wid = 0
    for document in data:
        words = document.split()
        for word in words:
            if word not in vocab_dict:
                vocab_dict[word] = wid
                wid += 1
    
    return vocab_dict

vocab_dict = GetVocabulary(train_X)
print('Vocabulary: ' + str(len(vocab_dict.keys())))

Vocabulary: 67009


In [118]:
# vectorize document
def Document2Vector(vocab_dict, data):
    word_vector = np.zeros(len(vocab_dict.keys()))
    words = data.split()
    out_of_voc = 0
    for word in words:
        if word in vocab_dict:
            word_vector[vocab_dict[word]] += 1
        else:
            out_of_voc += 1
            
        return word_vector, out_of_voc

train_matrix = []
for document in train_X:
    word_vector, _ = Document2Vector(vocab_dict, document)
    train_matrix.append(word_vector)
    
print('Train matrix: ' + str(len(train_matrix)) + ' x ' + str(len(train_matrix[0])))

Train matrix: 20000 x 67009


In [119]:
# naive bayes training on two labels
def NaiveBayes_train(train_matrix, train_y, label_1, label_2):
    num_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    
    # counter initialized with smoothing 1
    label_1_word_counter = np.ones(num_words)
    label_2_word_counter = np.ones(num_words)
    
    label_1_total_word_count = 0
    label_2_total_word_count = 0
    
    label_1_count = 0
    label_2_count = 0
    
    for i in range(num_docs):
        if i % (num_docs / 10) == 0:
            print('Train on doc id:' + str(i))
            
        if train_y.values[i] == label_1:
            label_1_word_counter += train_matrix[i]
            label_1_total_word_count += sum(train_matrix[i])
            label_1_count += 1
        elif train_y.values[i] == label_2:
            label_2_word_counter += train_matrix[i]
            label_2_total_word_count += sum(train_matrix[i])
            label_2_count += 1
            
        p_label_1_vector = np.log(label_1_word_counter / (label_1_total_word_count + num_words)) # with smoothing
        p_label_2_vector = np.log(label_2_word_counter / (label_2_total_word_count + num_words)) # with smoothing
    
    return p_label_1_vector, np.log(label_1_count/num_docs), p_label_2_vector, np.log(label_2_count/num_docs), label_1_total_word_count, label_2_total_word_count

p_neg_vector, p_neg, p_pos_vector, p_pos, neg_total_count, pos_total_count = NaiveBayes_train(train_matrix, train_y, 0, 1)


Train on doc id:0
Train on doc id:2000
Train on doc id:4000
Train on doc id:6000
Train on doc id:8000
Train on doc id:10000
Train on doc id:12000
Train on doc id:14000
Train on doc id:16000
Train on doc id:18000


In [121]:
# prediction for validation
def Predict(test_word_vector, p_neg_vector, p_neg, p_pos_vector, p_pos, neg_smoothing, pos_smoothing):
    neg = sum(test_word_vector * p_neg_vector) + p_neg + neg_smoothing
    pos = sum(test_word_vector * p_pos_vector) + p_pos + pos_smoothing
    if (neg > pos):
        return 0
    else:
        return 1
    
num_words = len(vocab_dict.keys())
predictions = []
i = 0
for document in validation_X:
    if (i % (validation_X.shape[0] / 10) == 0):
        print('Test on the doc id: ' + str(i))
    i += 1
    
    test_word_vector, out_of_voc = Document2Vector(vocab_dict, document)
    
    # smoothing
    if (out_of_voc != 0):
        neg_smoothing = np.log(out_of_voc / (neg_total_count + num_words))
        pos_smoothing = np.log(out_of_voc / (pos_total_count + num_words))
    else:
        neg_smoothing = 0
        pos_smoothing = 0
        
    ans = Predict(test_word_vector, p_neg_vector, p_neg, p_pos_vector, p_pos, neg_smoothing, pos_smoothing)
    predictions.append(ans)
    
print('Prediction on verification: ' + str(len(predictions)))

Test on the doc id: 0
Test on the doc id: 500
Test on the doc id: 1000
Test on the doc id: 1500
Test on the doc id: 2000
Test on the doc id: 2500
Test on the doc id: 3000
Test on the doc id: 3500
Test on the doc id: 4000
Test on the doc id: 4500
Prediction on verification: 5000


In [111]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

print(accuracy_score(validation_y.values, predictions))
print(classification_report(validation_y.values, predictions))
print(confusion_matrix(validation_y, predictions))


# print(validation_y.values[:100])
# print(predictions)

0.5512
              precision    recall  f1-score   support

           0       0.53      0.84      0.65      2459
           1       0.64      0.27      0.38      2541

    accuracy                           0.55      5000
   macro avg       0.58      0.56      0.51      5000
weighted avg       0.58      0.55      0.51      5000

[[2069  390]
 [1854  687]]
