In [66]:
import pandas as pd
import numpy as np
import nltk

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
reviews = pd.read_csv('movie_reviews.csv')

In [4]:
reviews.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [7]:
reviews.sample(3000).sentiment.value_counts()

sentiment
positive    1501
negative    1499
Name: count, dtype: int64

In [8]:
reviews_sample = reviews.sample(3000)

In [10]:
reviews_sample.sentiment.value_counts()

sentiment
negative    1502
positive    1498
Name: count, dtype: int64

In [14]:
reviews_sample = reviews_sample.reset_index().drop(columns = 'index')

In [15]:
reviews_sample

Unnamed: 0,review,sentiment
0,"After reading the book, Heart of Darkness, the...",negative
1,Title: Zombie 3 (1988) <br /><br />Directors: ...,negative
2,I got all excited when I saw the ads for this ...,negative
3,This film is basically two hours of Dafoe's ch...,negative
4,Bergman´s tale about how the hell of the war c...,positive
...,...,...
2995,"I've never been impressed by JD anyway, and Fi...",negative
2996,I just finished watching this movie and am dis...,negative
2997,I just read the comments of TomReynolds2004 an...,positive
2998,I watched this film in a very strange way -- I...,positive


In [21]:
import nltk
import re
from bs4 import BeautifulSoup

stop_words = nltk.corpus.stopwords.words('english')


def strip_html(doc):
    soup = BeautifulSoup(doc,"html.parser")
    text = soup.get_text()
    return text
    

def normalize_document(doc):
    doc = strip_html(doc)
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [24]:
reviews_sample['review_sample'] = reviews_sample['review'].apply(normalize_document)

In [25]:
reviews_sample

Unnamed: 0,review,sentiment,review_sample
0,"After reading the book, Heart of Darkness, the...",negative,reading book heart darkness movie justice movi...
1,Title: Zombie 3 (1988) <br /><br />Directors: ...,negative,title zombie directors mostly lucio fulci also...
2,I got all excited when I saw the ads for this ...,negative,got excited saw ads movie recently read book r...
3,This film is basically two hours of Dafoe's ch...,negative,film basically two hours dafoes character drin...
4,Bergman´s tale about how the hell of the war c...,positive,bergmans tale hell war drive sensible couple m...
...,...,...,...
2995,"I've never been impressed by JD anyway, and Fi...",negative,ive never impressed jd anyway final justice ha...
2996,I just finished watching this movie and am dis...,negative,finished watching movie disappointed say didnt...
2997,I just read the comments of TomReynolds2004 an...,positive,read comments tomreynolds feel jump understand...
2998,I watched this film in a very strange way -- I...,positive,watched film strange way put netflix list coul...


In [30]:
X = reviews_sample['review_sample']
y = reviews_sample['sentiment']

In [31]:
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.33, random_state=23)

In [32]:
train_X.shape, test_X.shape, train_y.shape, test_y.shape

((2010,), (990,), (2010,), (990,))

# Count Vectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
cv = CountVectorizer(binary=True)

In [34]:
cv_transfored_train_X = cv.fit_transform(train_X)
cv_transfored_test_X = cv.transform(test_X)

In [37]:
model = LogisticRegression()

In [38]:
model.fit(cv_transfored_train_X, train_y)

In [40]:
train_pred = model.predict(cv_transfored_train_X)
test_pred = model.predict(cv_transfored_test_X)

In [41]:
confusion_matrix(train_y, train_pred)

array([[1009,    0],
       [   0, 1001]])

In [42]:
confusion_matrix(test_y, test_pred)

array([[401,  92],
       [ 82, 415]])

In [44]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  1.0
Test Accuracy :  0.8242424242424242


# Bag of Words 

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
cv = CountVectorizer(binary=False)

In [47]:
cv_transfored_train_X = cv.fit_transform(train_X)
cv_transfored_test_X = cv.transform(test_X)

In [48]:
model = LogisticRegression()

In [49]:
model.fit(cv_transfored_train_X, train_y)

In [50]:
train_pred = model.predict(cv_transfored_train_X)
test_pred = model.predict(cv_transfored_test_X)

In [51]:
confusion_matrix(train_y, train_pred)

array([[1009,    0],
       [   0, 1001]])

In [52]:
confusion_matrix(test_y, test_pred)

array([[400,  93],
       [ 82, 415]])

In [53]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  1.0
Test Accuracy :  0.8232323232323232


# TF - IDF

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
tf = TfidfVectorizer()

In [57]:
tf_transfored_train_X = tf.fit_transform(train_X)
tf_transfored_test_X = tf.transform(test_X)

In [58]:
model = LogisticRegression()

In [59]:
model.fit(tf_transfored_train_X, train_y)

In [60]:
train_pred = model.predict(tf_transfored_train_X)
test_pred = model.predict(tf_transfored_test_X)

In [61]:
confusion_matrix(train_y, train_pred)

array([[980,  29],
       [ 24, 977]])

In [62]:
confusion_matrix(test_y, test_pred)

array([[404,  89],
       [ 74, 423]])

In [63]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.9736318407960199
Test Accuracy :  0.8353535353535354


# Word Embeddings

### Glove

In [64]:
# Load GloVe embeddings into a dictionary
def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_embeddings_path = '../Scriptures/glove.6B.300d.txt'  # Adjust the path to your downloaded GloVe file
wv = load_embeddings(glove_embeddings_path)

In [65]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector


def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.keys())
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)


In [72]:
normalize_corpus = np.vectorize(normalize_document)
train_norm_corpus = normalize_corpus(train_X)
train_tokenized_corpus = [nltk.word_tokenize(doc) for doc in train_norm_corpus]

test_norm_corpus = normalize_corpus(test_X)
test_tokenized_corpus = [nltk.word_tokenize(doc) for doc in test_norm_corpus]

In [73]:
# get document level embeddings
feature_size = 300
train_features_X = averaged_word_vectorizer(corpus=train_tokenized_corpus, model=wv,
                                             num_features=feature_size)

test_features_X = averaged_word_vectorizer(corpus=test_tokenized_corpus, model=wv,
                                             num_features=feature_size)

In [78]:
pd.DataFrame(train_features_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.095210,0.074208,-0.075189,-0.030519,-0.039498,0.030736,-0.063679,0.019690,-0.030630,-0.982443,...,-0.033514,-0.130089,-0.034337,0.004901,-0.032685,-0.012092,0.049251,-0.013650,-0.049144,0.040632
1,-0.013725,0.034086,-0.034454,-0.092877,0.016656,-0.032830,0.007448,0.008423,0.088812,-1.173755,...,0.010705,-0.071519,-0.028189,-0.021560,0.020954,-0.086438,-0.016503,-0.022217,-0.050941,0.033541
2,-0.055449,0.011165,-0.027353,-0.026629,0.029692,0.039979,0.003559,0.022882,-0.012319,-1.174624,...,-0.045856,-0.084983,-0.020713,0.038621,0.006540,0.090790,-0.031528,-0.063819,-0.022391,0.042568
3,-0.072958,0.083566,-0.049090,-0.077937,-0.047330,0.075804,-0.045134,-0.047141,0.036042,-0.992552,...,0.046525,-0.071441,0.017279,0.046466,-0.054474,-0.075757,0.024841,-0.043944,0.007172,0.068454
4,-0.042490,0.055707,-0.027435,-0.052708,0.088608,-0.030552,0.029926,0.010903,0.030163,-0.944765,...,-0.028950,-0.099857,0.046651,-0.017479,0.027504,-0.106758,-0.026807,-0.033834,-0.049099,0.047046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005,-0.092523,0.081752,-0.063385,-0.033966,0.056871,0.034229,-0.009610,0.025635,0.092939,-1.273092,...,-0.016819,-0.139673,-0.081753,-0.032949,0.077111,-0.101313,-0.003604,-0.073269,-0.119029,0.070678
2006,-0.034708,0.105868,-0.031159,-0.018991,-0.004858,-0.095991,-0.065348,-0.039109,-0.028009,-0.958242,...,-0.044165,-0.089371,-0.016946,-0.028961,-0.026986,-0.113882,0.053097,-0.047088,-0.024108,0.029061
2007,-0.052781,0.045932,-0.024633,-0.101543,-0.003141,0.011376,-0.027632,0.015630,0.041839,-1.078481,...,-0.033619,-0.082491,-0.015605,-0.047796,-0.018306,0.026556,-0.104593,-0.058046,-0.049240,0.011466
2008,-0.062242,0.077842,0.003939,-0.034427,0.034741,0.093695,0.013632,0.045812,0.124574,-0.915449,...,-0.057947,-0.107110,-0.059751,0.042056,0.046700,-0.087650,0.009637,-0.017887,-0.061365,0.116531


In [79]:
pd.DataFrame(test_features_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.085171,0.087697,-0.034940,-0.064675,0.031153,0.007324,-0.096535,0.016857,0.015636,-1.194943,...,0.002860,-0.071951,-0.111008,0.018815,-0.020504,-0.046612,-0.031983,-0.001246,-0.002914,-0.042785
1,-0.071061,0.031449,-0.086494,-0.105086,0.026560,0.076512,-0.035002,0.067925,-0.024546,-0.882894,...,0.008911,-0.040386,-0.118541,-0.027533,0.020789,-0.186462,0.032365,0.010867,0.048979,0.102556
2,-0.055935,0.031655,0.041022,-0.011393,0.079114,0.136252,-0.014718,-0.029704,-0.008770,-0.865452,...,0.003628,-0.031124,0.046586,-0.017082,0.002051,-0.117929,-0.025070,0.002550,-0.049759,0.040466
3,-0.117528,0.048562,0.018459,-0.099897,0.036499,0.097177,-0.015837,0.013810,0.006456,-0.792350,...,-0.018390,-0.078470,-0.017232,-0.014702,-0.044294,-0.148532,-0.028399,-0.041426,-0.027574,0.052501
4,-0.056573,0.014105,-0.042295,0.026413,-0.037524,0.035594,-0.005434,0.069709,0.011840,-0.904231,...,-0.058551,-0.110081,0.005421,-0.024142,-0.031935,-0.264091,-0.023653,0.087820,0.000125,0.069631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,-0.051394,0.035845,-0.034806,-0.065552,0.032538,0.143753,-0.056050,-0.000659,0.059859,-0.692159,...,-0.035126,-0.020249,-0.036673,0.016049,-0.021832,-0.073260,-0.008150,-0.003551,0.012078,-0.043570
986,-0.056376,0.074387,0.004264,-0.162391,-0.075184,0.091519,-0.046987,-0.005608,0.001129,-0.900835,...,-0.099467,-0.093213,-0.068575,0.094120,0.041656,-0.026783,0.066266,0.006180,0.028854,-0.009676
987,-0.066837,0.010423,-0.056334,-0.006306,-0.001397,0.062324,-0.127318,-0.037340,-0.076692,-0.680659,...,-0.075357,0.024648,0.014440,-0.015972,-0.074291,-0.091538,-0.064304,0.054602,0.023180,0.097109
988,-0.075133,-0.013684,0.072210,-0.046680,-0.016196,0.043928,-0.042347,-0.035546,-0.074855,-0.701632,...,0.059805,-0.026380,0.032301,-0.064150,-0.108212,-0.140116,-0.037404,-0.028904,-0.002801,0.085904


In [80]:
model = LogisticRegression()

In [81]:
model.fit(train_features_X, train_y)

In [82]:
train_pred = model.predict(train_features_X)
test_pred = model.predict(test_features_X)

In [83]:
confusion_matrix(train_y, train_pred)

array([[860, 149],
       [144, 857]])

In [84]:
confusion_matrix(test_y, test_pred)

array([[405,  88],
       [ 98, 399]])

In [85]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.8542288557213931
Test Accuracy :  0.8121212121212121


# Glove with Deep Learning Architecture

In [161]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout, BatchNormalization
from keras.utils import to_categorical


In [100]:
y_cat_train = to_categorical(np.asarray(train_y.factorize()[0]))
y_cat_test = to_categorical(np.asarray(test_y.factorize()[0]))

In [173]:
model = Sequential()
model.add(Dense(64, kernel_initializer = 'he_normal', input_shape = (300,),activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(32, kernel_initializer = 'he_normal', activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(16 ,kernel_initializer = 'he_normal',activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(8 ,kernel_initializer = 'he_normal', activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(4 ,kernel_initializer = 'he_normal', activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(2, kernel_initializer = 'he_normal', activation = 'softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [174]:
model.summary()

In [175]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'] )

In [176]:
history = model.fit(train_features_X,y_cat_train,epochs = 100, verbose = 1, batch_size=100)

Epoch 1/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5470 - loss: 0.7953
Epoch 2/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6795 - loss: 0.6728
Epoch 3/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6990 - loss: 0.6245
Epoch 4/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7122 - loss: 0.6145
Epoch 5/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7430 - loss: 0.5847
Epoch 6/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7632 - loss: 0.5675
Epoch 7/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7766 - loss: 0.5400
Epoch 8/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7965 - loss: 0.5273
Epoch 9/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━

In [177]:
train_pred = np.argmax(model.predict(train_features_X), axis = 1)
test_pred = np.argmax(model.predict(test_features_X), axis = 1)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374us/step


In [178]:
train_pred = np.where(train_pred ==0, 'positive', 'negative')
test_pred = np.where(test_pred ==0, 'positive', 'negative')

In [179]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.9805970149253731
Test Accuracy :  0.8010101010101011


In [None]:
1. Word2vec Embedding - Google New Vector
2. Train Word2vec Embedding
3. Train FastText Embedding

In [183]:
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g

In [184]:
#FastText Embedding Vector Download
#https://fasttext.cc/docs/en/english-vectors.html