In [2]:
import pandas as pd
import numpy as np
import nltk

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
reviews = pd.read_csv('movie_reviews.csv')

In [4]:
reviews.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
reviews.sample(3000).sentiment.value_counts()

sentiment
negative    1507
positive    1493
Name: count, dtype: int64

In [6]:
reviews_sample = reviews.sample(3000)

In [7]:
reviews_sample.sentiment.value_counts()

sentiment
positive    1573
negative    1427
Name: count, dtype: int64

In [8]:
reviews_sample = reviews_sample.reset_index().drop(columns = 'index')

In [9]:
reviews_sample

Unnamed: 0,review,sentiment
0,"I saw this recently and I must say, I was move...",negative
1,Have to disagree with people saying that this ...,negative
2,This is an amazing movie and all of the actors...,positive
3,I expected this movie was originally supposed ...,negative
4,"In the first transformation scene, what is the...",positive
...,...,...
2995,Rififi is a great film that is overlooked. It'...,positive
2996,"Like many, this dung heap caught my eye while ...",negative
2997,A not bad but also not so great heist film. Ki...,negative
2998,"Classic, highly influential low budget thrille...",positive


In [10]:
import nltk
import re
from bs4 import BeautifulSoup

stop_words = nltk.corpus.stopwords.words('english')


def strip_html(doc):
    soup = BeautifulSoup(doc,"html.parser")
    text = soup.get_text()
    return text
    

def normalize_document(doc):
    doc = strip_html(doc)
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [11]:
reviews_sample['review_sample'] = reviews_sample['review'].apply(normalize_document)

  soup = BeautifulSoup(doc,"html.parser")


In [12]:
reviews_sample

Unnamed: 0,review,sentiment,review_sample
0,"I saw this recently and I must say, I was move...",negative,saw recently must say moved factual basis stor...
1,Have to disagree with people saying that this ...,negative,disagree people saying lousy horror film good ...
2,This is an amazing movie and all of the actors...,positive,amazing movie actors actresses good even thoug...
3,I expected this movie was originally supposed ...,negative,expected movie originally supposed show electi...
4,"In the first transformation scene, what is the...",positive,first transformation scene music ive heard gre...
...,...,...,...
2995,Rififi is a great film that is overlooked. It'...,positive,rififi great film overlooked crime drama man g...
2996,"Like many, this dung heap caught my eye while ...",negative,like many dung heap caught eye channel surfing...
2997,A not bad but also not so great heist film. Ki...,negative,bad also great heist film kirk douglas recentl...
2998,"Classic, highly influential low budget thrille...",positive,classic highly influential low budget thriller...


In [13]:
X = reviews_sample['review_sample']
y = reviews_sample['sentiment']

In [14]:
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.33, random_state=23)

In [15]:
train_X.shape, test_X.shape, train_y.shape, test_y.shape

((2010,), (990,), (2010,), (990,))

# Count Vectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
cv = CountVectorizer(binary=True)

In [18]:
cv_transfored_train_X = cv.fit_transform(train_X)
cv_transfored_test_X = cv.transform(test_X)

In [19]:
model = LogisticRegression()

In [20]:
model.fit(cv_transfored_train_X, train_y)

In [21]:
train_pred = model.predict(cv_transfored_train_X)
test_pred = model.predict(cv_transfored_test_X)

In [22]:
confusion_matrix(train_y, train_pred)

array([[ 957,    0],
       [   0, 1053]])

In [23]:
confusion_matrix(test_y, test_pred)

array([[363, 107],
       [ 55, 465]])

In [24]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  1.0
Test Accuracy :  0.8363636363636363


# Bag of Words 

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
cv = CountVectorizer(binary=False)

In [31]:
cv_transfored_train_X = cv.fit_transform(train_X)
cv_transfored_test_X = cv.transform(test_X)

In [32]:
model = LogisticRegression()

In [33]:
model.fit(cv_transfored_train_X, train_y)

In [34]:
train_pred = model.predict(cv_transfored_train_X)
test_pred = model.predict(cv_transfored_test_X)

In [35]:
confusion_matrix(train_y, train_pred)

array([[1032,    0],
       [   0,  978]])

In [36]:
confusion_matrix(test_y, test_pred)

array([[399,  95],
       [ 99, 397]])

In [37]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  1.0
Test Accuracy :  0.804040404040404


# TF - IDF

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
tf = TfidfVectorizer()

In [40]:
tf_transfored_train_X = tf.fit_transform(train_X)
tf_transfored_test_X = tf.transform(test_X)

In [41]:
model = LogisticRegression()

In [42]:
model.fit(tf_transfored_train_X, train_y)

In [43]:
train_pred = model.predict(tf_transfored_train_X)
test_pred = model.predict(tf_transfored_test_X)

In [44]:
confusion_matrix(train_y, train_pred)

array([[1014,   18],
       [  26,  952]])

In [45]:
confusion_matrix(test_y, test_pred)

array([[404,  90],
       [100, 396]])

In [46]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.9781094527363184
Test Accuracy :  0.8080808080808081


# Word Embeddings

### Glove

In [25]:
# Load GloVe embeddings into a dictionary
def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_embeddings_path = 'glove.6B.300d.txt'  # Adjust the path to your downloaded GloVe file
wv = load_embeddings(glove_embeddings_path)

In [26]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector


def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.keys())
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)


In [27]:
normalize_corpus = np.vectorize(normalize_document)
train_norm_corpus = normalize_corpus(train_X)
train_tokenized_corpus = [nltk.word_tokenize(doc) for doc in train_norm_corpus]

test_norm_corpus = normalize_corpus(test_X)
test_tokenized_corpus = [nltk.word_tokenize(doc) for doc in test_norm_corpus]

In [28]:
# get document level embeddings
feature_size = 300
train_features_X = averaged_word_vectorizer(corpus=train_tokenized_corpus, model=wv,
                                             num_features=feature_size)

test_features_X = averaged_word_vectorizer(corpus=test_tokenized_corpus, model=wv,
                                             num_features=feature_size)

In [29]:
pd.DataFrame(train_features_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.049165,0.052444,0.031960,-0.071950,0.032793,0.048069,-0.053564,0.016657,0.024148,-1.108993,...,-0.045978,-0.073211,-0.046919,0.012517,0.001496,0.004134,-0.050841,0.019352,-0.050495,-0.000791
1,-0.105681,0.045909,-0.004646,-0.030854,0.056146,0.005300,-0.034945,-0.006234,0.074045,-0.909599,...,0.002156,-0.043157,-0.000104,0.010153,0.041413,-0.059955,-0.016339,-0.022908,-0.091051,0.020144
2,-0.046176,-0.007872,-0.054750,-0.051546,0.019541,-0.028307,-0.063718,-0.053443,-0.029368,-0.912395,...,0.036766,-0.028397,0.028678,0.149325,-0.007303,-0.064194,-0.033430,-0.045291,-0.100903,0.101187
3,-0.084055,0.144388,-0.048921,-0.061271,0.001274,0.099269,-0.019970,0.023440,-0.033673,-0.991250,...,-0.048714,-0.037017,-0.060602,0.010045,0.022147,-0.122591,0.001777,-0.073922,-0.026341,-0.010161
4,-0.055458,0.043987,-0.013972,-0.056797,-0.022531,0.055491,-0.064837,0.089841,0.040812,-0.844451,...,0.020417,-0.099706,-0.055017,0.071576,-0.011612,-0.147060,0.054429,-0.018500,-0.078385,0.151170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005,-0.100532,-0.029887,0.067890,-0.044762,0.107496,0.106629,-0.144847,0.031142,0.083907,-1.031707,...,0.041463,0.017034,-0.073145,0.070133,-0.088130,-0.013833,-0.103302,-0.045891,-0.061425,0.054517
2006,-0.091245,0.056812,-0.036939,-0.012791,0.003856,0.051798,-0.093165,-0.099857,-0.033069,-0.845264,...,0.002284,-0.117497,-0.077910,0.057344,0.021356,-0.041718,0.002215,-0.032273,-0.013484,0.031794
2007,-0.085236,0.001140,-0.006074,-0.082129,0.025673,0.047489,-0.065533,-0.062481,0.024915,-1.008796,...,-0.017099,-0.110478,0.072349,0.034391,0.028183,-0.107158,0.054900,-0.003695,-0.079584,0.042854
2008,-0.043361,0.118525,-0.025280,-0.054349,0.082131,0.001435,-0.151787,-0.011836,-0.015750,-0.884487,...,0.030698,-0.063872,-0.075988,-0.017931,-0.083995,0.036870,-0.005913,-0.076154,-0.063887,0.105094


In [30]:
pd.DataFrame(test_features_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.052537,0.091124,-0.065973,-0.021576,0.022972,0.134362,-0.070737,0.091711,0.045290,-1.134408,...,0.019397,-0.237070,-0.108205,-0.078260,-0.190459,-0.159598,0.047273,0.077141,0.004034,-0.028294
1,-0.072845,0.047125,0.021333,-0.093270,0.088039,0.081752,-0.102703,0.015140,0.065554,-1.089631,...,0.013210,-0.163945,-0.123100,0.015224,-0.052538,0.138016,-0.032549,-0.034749,0.010404,0.057869
2,-0.029183,-0.019746,-0.011152,-0.040143,0.004725,-0.008937,-0.090370,0.030888,0.054048,-0.974858,...,-0.030922,-0.078219,0.000690,0.067660,0.044929,-0.163237,-0.106094,0.078722,-0.050162,0.009220
3,-0.069958,0.084086,-0.055243,-0.122744,-0.030658,0.106131,-0.042479,0.070517,0.061982,-0.914609,...,0.035870,-0.032976,-0.075464,0.004362,-0.037075,-0.009298,0.029799,0.023376,0.004521,0.049957
4,-0.079415,0.054271,-0.027632,0.016405,0.026526,0.055384,-0.057809,0.006547,0.039659,-0.755967,...,-0.054058,-0.086566,-0.048011,-0.012868,0.047532,-0.168991,0.020066,0.018346,-0.051851,0.111961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,-0.022409,-0.023695,-0.024228,-0.038190,0.028569,0.019978,-0.017572,0.096081,0.035262,-1.069151,...,0.046890,-0.076336,0.008560,0.098530,-0.066955,-0.130243,0.028522,-0.094712,-0.023147,0.048259
986,-0.119466,0.004307,-0.053651,-0.074866,-0.046343,0.044201,-0.098234,0.004778,0.014118,-1.069709,...,0.058214,-0.097406,-0.091344,-0.036342,-0.073859,-0.153723,-0.028258,0.034064,0.035100,0.092933
987,-0.011719,0.055906,0.007184,-0.069013,0.010372,0.095319,-0.135913,-0.019853,-0.005765,-1.078017,...,-0.023421,-0.109827,-0.051343,0.017536,0.048360,-0.025041,0.029640,-0.046908,-0.091542,0.060366
988,-0.060785,0.023363,-0.039515,-0.061046,0.042034,0.045911,-0.117750,-0.054647,0.080711,-0.929332,...,-0.013616,-0.018049,-0.047607,0.060981,-0.000740,-0.024292,0.023542,-0.009262,-0.021186,0.060067


In [56]:
model = LogisticRegression()

In [57]:
model.fit(train_features_X, train_y)

In [58]:
train_pred = model.predict(train_features_X)
test_pred = model.predict(test_features_X)

In [59]:
confusion_matrix(train_y, train_pred)

array([[878, 154],
       [179, 799]])

In [60]:
confusion_matrix(test_y, test_pred)

array([[408,  86],
       [111, 385]])

In [61]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.8343283582089552
Test Accuracy :  0.8010101010101011


# Glove with Deep Learning Architecture

In [62]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout, BatchNormalization
from keras.utils import to_categorical


In [63]:
y_cat_train = to_categorical(np.asarray(train_y.factorize()[0]))
y_cat_test = to_categorical(np.asarray(test_y.factorize()[0]))

In [64]:
model = Sequential()
model.add(Dense(64, kernel_initializer = 'he_normal', input_shape = (300,),activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(32, kernel_initializer = 'he_normal', activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(16 ,kernel_initializer = 'he_normal',activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(8 ,kernel_initializer = 'he_normal', activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(4 ,kernel_initializer = 'he_normal', activation = 'elu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(2, kernel_initializer = 'he_normal', activation = 'softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [65]:
model.summary()

In [66]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'] )

In [67]:
history = model.fit(train_features_X,y_cat_train,epochs = 100, verbose = 1, batch_size=100)

Epoch 1/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5063 - loss: 0.9787
Epoch 2/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6025 - loss: 0.8067
Epoch 3/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6311 - loss: 0.7481
Epoch 4/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6673 - loss: 0.6984
Epoch 5/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6952 - loss: 0.6568
Epoch 6/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 897us/step - accuracy: 0.7005 - loss: 0.6240
Epoch 7/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 821us/step - accuracy: 0.7352 - loss: 0.6065
Epoch 8/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 837us/step - accuracy: 0.7595 - loss: 0.5742
Epoch 9/100
[1m21/21[0m [32m━━━━━━━━━━━

In [68]:
train_pred = np.argmax(model.predict(train_features_X), axis = 1)
test_pred = np.argmax(model.predict(test_features_X), axis = 1)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 326us/step


In [69]:
train_pred = np.where(train_pred ==0, 'positive', 'negative')
test_pred = np.where(test_pred ==0, 'positive', 'negative')

In [70]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.9786069651741294
Test Accuracy :  0.7666666666666667


In [183]:
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g

In [184]:
#FastText Embedding Vector Download
#https://fasttext.cc/docs/en/english-vectors.html

# Word2vec Pretraining model

In [78]:
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import word_tokenize

In [72]:
path_to_model = 'word2vec-google-news-300.bin'

#Load W2V model. This will take some time. 
%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
print('done loading Word2Vec')

CPU times: user 6.51 s, sys: 742 ms, total: 7.25 s
Wall time: 7.77 s
done loading Word2Vec


In [73]:
#Inspect the model
print('Shape :',w2v_model.vectors.shape)
word2vec_vocab = w2v_model.index_to_key 
word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]
print('Total vocabulary counts :',len(word2vec_vocab))

Shape : (3000000, 300)
Total vocabulary counts : 3000000


In [86]:
for word in train_tokenized_corpus[0]:
    try:
        word = word.lower()
        embeddings = w2v_model.key_to_index[word]
        print(word,'-->',embeddings)
    except:
        print(word)

story --> 550
documenting --> 20656
rise --> 1027
chinas
first --> 56
emperor --> 30068
efforts --> 763
unify --> 25688
empire --> 10746
expensive --> 2414
movie --> 1121
production --> 620
chinese --> 100038
historyits
worth --> 1069
every --> 274
penny --> 8725
visually --> 20622
dazzling --> 16782
cinematography --> 37101
sweeping --> 6378
score --> 1012
outstanding --> 1890
characters --> 3304
make --> 109
one --> 45
finest --> 8214
epics --> 71594
ever --> 518
put --> 238
film --> 692
foreign --> 754
otherwise --> 2357
please --> 2456
miss --> 2116
opportunity --> 703
see --> 158
big --> 276
screen --> 1973


In [87]:
# Creating a feature vector by averaging all embeddings for all sentences
def features_embeddings(list_of_lists):
    
    # feature vector column length 
    DIMENSION = w2v_model.vector_size
    
    # Zero vector if words not in our w2v_model vocubulary
    zero_vector = np.zeros(DIMENSION)
    
    features = []
    
    # feature vector for each sentence in list_of_lists one by one
    for tokens in list_of_lists:
        
        # feature vector for a single sentence
        feature =  np.zeros(DIMENSION) # shape = 1X DIMENSION
        
        # count for each token/word in a tokens or sentence
        feature_counts = 0 + 1e-5   # to avoid divide-by-zero 
        for token in tokens:
            if token in w2v_model:
                feature += w2v_model[token] # Here both feature and w2v_model[token] are of same shape : 1X DIMENSION
                feature_counts +=1    # Count increases for each token in a tokens
         
        # Average the feature values for each tokens
        if(feature_counts !=0 ):        # for non empty tokens
            features.append(feature/feature_counts) 
        else:                           # for empty tokens
            features.append(zero_vector)
            
    return features

In [90]:
train_vectors = features_embeddings(train_tokenized_corpus)
print(len(train_vectors))
train_vectors[0].shape

2010


(300,)

In [91]:
test_vectors = features_embeddings(test_tokenized_corpus)
print(len(test_vectors))
test_vectors[0].shape

990


(300,)

In [94]:
pd.DataFrame(train_vectors)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.025413,0.017893,0.009669,0.100787,0.006266,0.008118,0.087102,-0.022144,0.094054,0.093803,...,-0.134564,0.016229,-0.099295,0.033675,-0.013184,-0.042983,-0.013791,-0.068369,0.027123,-0.027254
1,0.041864,0.029539,-0.003314,0.060085,-0.037326,0.024126,0.069229,-0.044021,0.081423,0.041232,...,-0.137162,0.055503,-0.123243,-0.004244,-0.045488,-0.014705,0.026503,-0.036409,0.060550,-0.020487
2,0.058087,0.059167,-0.002032,0.059178,-0.067313,-0.008359,0.055403,-0.065988,0.090547,0.091691,...,-0.106429,0.020701,-0.087849,0.028960,-0.039412,-0.018355,0.006856,-0.051078,0.029464,-0.021360
3,0.055705,0.011109,-0.012106,0.110000,-0.038259,-0.005965,0.032449,-0.006873,0.073128,0.077147,...,-0.070112,0.076626,-0.136963,0.027219,-0.047677,-0.033622,0.011968,-0.064334,-0.002256,0.021236
4,0.055795,0.065975,0.007370,0.097802,-0.042695,-0.013492,0.041037,-0.053537,0.053832,0.062231,...,-0.048031,0.038145,-0.087831,0.019510,-0.049447,-0.020297,0.035594,-0.054443,0.001766,-0.020204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005,0.077894,0.048710,-0.000583,0.103553,-0.087790,0.059894,0.082951,-0.036448,0.093953,0.095390,...,-0.098446,0.056713,-0.084245,-0.011151,-0.068074,-0.021399,0.044470,-0.062348,0.042190,0.017638
2006,0.044526,0.053642,0.012405,0.116098,-0.079652,0.042850,0.064464,-0.086374,0.069077,0.077816,...,-0.031583,0.085157,-0.107991,0.040527,-0.072522,-0.061935,0.003120,-0.048855,-0.002187,-0.013914
2007,0.051548,0.046371,0.011113,0.081187,-0.054934,0.000628,0.046997,-0.069626,0.075349,0.078323,...,-0.056852,0.055989,-0.086515,0.009794,-0.051231,0.006207,0.023844,-0.069063,0.019625,0.010695
2008,0.046670,0.050964,-0.023507,0.095043,-0.012921,-0.019055,0.035108,-0.045781,0.077376,0.050663,...,-0.039909,0.045488,-0.077940,-0.015282,-0.033841,-0.007587,0.033654,-0.057930,0.020167,0.016718


In [95]:
model = LogisticRegression()

In [96]:
model.fit(train_vectors, train_y)

In [97]:
train_pred = model.predict(train_vectors)
test_pred = model.predict(test_vectors)

In [98]:
confusion_matrix(train_y, train_pred)

array([[885, 147],
       [175, 803]])

In [99]:
confusion_matrix(test_y, test_pred)

array([[427,  67],
       [118, 378]])

In [100]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.8398009950248756
Test Accuracy :  0.8131313131313131


# Word2vec Training

In [128]:
# Creating a feature vector by averaging all embeddings for all sentences
def features_embeddings_trained(list_of_lists):
    
    # feature vector column length 
    DIMENSION = w2v_model.vector_size
    
    # Zero vector if words not in our w2v_model vocubulary
    zero_vector = np.zeros(DIMENSION)
    
    features = []
    
    # feature vector for each sentence in list_of_lists one by one
    for tokens in list_of_lists:
        
        # feature vector for a single sentence
        feature =  np.zeros(DIMENSION) # shape = 1X DIMENSION
        
        # count for each token/word in a tokens or sentence
        feature_counts = 0 + 1e-5   # to avoid divide-by-zero 
        for token in tokens:
            if token in w2v_model.wv.index_to_key:
                feature += w2v_model.wv[token] # Here both feature and w2v_model[token] are of same shape : 1X DIMENSION
                feature_counts +=1    # Count increases for each token in a tokens
         
        # Average the feature values for each tokens
        if(feature_counts !=0 ):        # for non empty tokens
            features.append(feature/feature_counts) 
        else:                           # for empty tokens
            features.append(zero_vector)
            
    return features

In [104]:
tokenized_corpus = train_tokenized_corpus + test_tokenized_corpus

In [31]:
train_tokenized_corpus

[['intense',
  'fascinating',
  'drama',
  'similarities',
  'tone',
  'subject',
  'films',
  'like',
  'bully',
  'elephant',
  'kids',
  'although',
  'point',
  'view',
  'superior',
  'film',
  'three',
  'onesbefore',
  'watching',
  'movie',
  'expectations',
  'neutral',
  'ended',
  'taking',
  'good',
  'impression',
  'thanks',
  'intelligent',
  'screenplay',
  'murali',
  'k',
  'thalluris',
  'perfect',
  'direction',
  'excellent',
  'performances',
  'group',
  'young',
  'actorsthe',
  'sporadic',
  'instances',
  'school',
  'violence',
  'around',
  'world',
  'inspired',
  'many',
  'movies',
  'tv',
  'programmes',
  'books',
  'pretend',
  'find',
  'predict',
  'external',
  'internal',
  'reasons',
  'unbridled',
  'expressions',
  'rebelliousness',
  'discontentand',
  'specially',
  'madnesssome',
  'people',
  'may',
  'say',
  'arrives',
  'late',
  'artistic',
  'movement',
  'however',
  'think',
  'like',
  'delay',
  'permitted',
  'movie',
  'make',
  '

In [114]:
from gensim.models import Word2Vec


w2v_model = Word2Vec(tokenized_corpus, vector_size=100, window=5, min_count=5, workers=4, epochs=1000)

In [117]:
w2v_model.wv.get_vector('good')

array([-1.2162328e+00,  2.8214784e+00, -1.6183339e+00, -6.6693825e-01,
        3.1950185e+00, -2.5314000e-01, -1.1005534e+00,  2.3824410e+00,
        2.1774043e-01, -9.0693676e-01, -1.3202015e+00, -3.0586648e+00,
       -1.6025095e+00, -4.2471405e-02,  1.1779610e+00, -2.2992020e+00,
        1.6685644e+00, -1.5037817e+00,  3.1916767e-01, -4.8470378e+00,
       -1.1382118e+00,  1.4438250e+00,  1.3072069e+00,  1.6388686e+00,
        5.8727002e-01,  7.9172087e-01,  8.9846438e-01,  1.4334803e+00,
       -3.8484967e+00,  6.4467245e-01,  5.3743535e-01,  3.1226642e+00,
       -1.0429981e+00, -2.4385281e+00,  8.3032900e-01,  7.8326362e-01,
        7.4336153e-01, -8.4292746e-01, -4.3271947e-01, -1.9767152e+00,
       -1.6679970e+00, -1.3975409e-01, -1.6176885e-01,  3.8531470e-01,
        2.1063437e+00, -2.9386628e+00, -1.9809042e+00,  5.2611160e-01,
        6.8800747e-01,  4.0107608e+00,  7.9448837e-05, -2.0757146e+00,
        1.2549385e+00, -9.0661758e-01, -1.4107471e+00, -2.2575657e-01,
      

In [129]:
train_vectors = features_embeddings_trained(train_tokenized_corpus)
print(len(train_vectors))
train_vectors[0].shape

2010


(100,)

In [130]:
test_vectors = features_embeddings_trained(test_tokenized_corpus)
print(len(test_vectors))
test_vectors[0].shape

990


(100,)

In [131]:
pd.DataFrame(train_vectors)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.542037,0.207305,1.294649,0.187143,0.488779,0.268629,-0.910947,0.615916,0.199186,1.611569,...,-0.178649,-0.064968,0.608629,0.062858,0.090038,1.146683,0.547711,0.386943,-0.177861,0.488217
1,-0.547686,0.355771,0.166179,-0.678580,0.351975,-0.926832,-0.048489,0.668812,-0.215949,0.300916,...,0.782023,-0.358429,0.202502,-0.425116,0.637140,0.518589,0.706520,-0.911478,0.039270,0.930268
2,0.116587,-0.036734,0.051043,0.324732,0.158343,-0.727407,0.036239,0.835853,0.013212,-0.186487,...,0.489580,0.111286,-0.297007,0.147138,0.424103,0.587565,0.367007,-0.434915,0.163814,0.683255
3,-0.228311,0.485657,0.002040,-1.189860,0.235160,0.173281,0.261406,1.065048,-0.415442,0.322661,...,0.660357,0.203004,1.263204,0.071106,1.013812,0.558073,0.958466,-1.139623,1.354658,0.349583
4,-0.905322,0.684515,0.226590,-0.571770,1.019778,-0.274504,0.277069,0.826453,-0.102960,-0.080917,...,0.302292,-0.411636,0.831800,0.003049,1.002412,0.173870,0.225041,0.049867,0.011661,-0.309264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005,0.416515,1.469960,-0.865980,-0.318793,0.090127,-0.181142,-0.393172,1.767932,-0.033844,0.087816,...,0.204167,-0.771697,-0.384556,0.934168,-0.252340,0.406547,0.643272,-0.235321,-0.125435,-0.524991
2006,-0.801499,0.963594,1.378747,0.715873,-0.079275,-0.527411,-0.051364,1.312657,-0.351139,-0.641406,...,0.463297,-0.044615,0.106691,-0.112504,0.669756,0.821823,1.140819,0.016741,0.731739,0.154779
2007,-0.084227,0.398447,0.062124,-0.537270,0.218859,0.075756,-0.187547,-0.189401,-0.381424,0.118413,...,0.019469,0.049975,0.154613,0.189762,-0.318999,0.190289,-0.010788,0.303954,-0.456986,0.004394
2008,-0.035457,0.467487,0.638060,-0.615420,-0.076184,-0.966929,-0.580523,0.349052,0.073621,0.165564,...,0.978257,-1.152258,0.020195,0.117045,0.458258,-0.055478,-0.830837,0.583896,0.226798,0.588833


In [132]:
model = LogisticRegression()

In [133]:
model.fit(train_vectors, train_y)

In [134]:
train_pred = model.predict(train_vectors)
test_pred = model.predict(test_vectors)

In [135]:
confusion_matrix(train_y, train_pred)

array([[877, 155],
       [161, 817]])

In [136]:
confusion_matrix(test_y, test_pred)

array([[397,  97],
       [106, 390]])

In [137]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.8427860696517413
Test Accuracy :  0.794949494949495


# Fasttext Taining Model

In [138]:
from gensim.models.fasttext import FastText

In [139]:
# Creating a feature vector by averaging all embeddings for all sentences
def features_embeddings_trained(list_of_lists):
    
    # feature vector column length 
    DIMENSION = w2v_model.vector_size
    
    # Zero vector if words not in our w2v_model vocubulary
    zero_vector = np.zeros(DIMENSION)
    
    features = []
    
    # feature vector for each sentence in list_of_lists one by one
    for tokens in list_of_lists:
        
        # feature vector for a single sentence
        feature =  np.zeros(DIMENSION) # shape = 1X DIMENSION
        
        # count for each token/word in a tokens or sentence
        feature_counts = 0 + 1e-5   # to avoid divide-by-zero 
        for token in tokens:
            if token in w2v_model.wv.index_to_key:
                feature += w2v_model.wv[token] # Here both feature and w2v_model[token] are of same shape : 1X DIMENSION
                feature_counts +=1    # Count increases for each token in a tokens
         
        # Average the feature values for each tokens
        if(feature_counts !=0 ):        # for non empty tokens
            features.append(feature/feature_counts) 
        else:                           # for empty tokens
            features.append(zero_vector)
            
    return features

In [140]:
tokenized_corpus = train_tokenized_corpus + test_tokenized_corpus

In [163]:
w2v_model = FastText(tokenized_corpus, vector_size=300, window=5, min_count=1, workers=4, epochs=100)

In [164]:
w2v_model.wv.get_vector('good')

array([-0.39554578,  0.30556968, -0.3278093 ,  0.6070886 , -1.1435615 ,
        0.6960136 ,  1.8225309 , -0.08764059,  0.02879203,  1.8186275 ,
       -2.6683233 , -1.1544572 , -3.6363158 ,  1.4927639 ,  0.25606278,
        1.6747531 , -0.02730703, -0.19937731,  0.5704363 ,  0.37684003,
       -1.1360894 ,  0.10065033,  0.06069138, -1.2784314 ,  0.1724576 ,
       -1.0831739 , -1.2671391 ,  0.96431136,  2.237326  ,  0.36201993,
        0.33683383, -0.8974773 , -2.0102575 , -1.2174679 ,  0.2255754 ,
        0.07015803,  3.3393335 , -1.8132381 ,  1.234856  ,  2.253016  ,
       -0.3265842 ,  1.464262  , -0.5921297 , -3.4554558 ,  2.0791218 ,
        0.12724856, -0.79863393, -2.32285   , -0.89043736,  0.01599858,
       -0.14147553,  0.27829286, -0.19791688,  0.8342646 ,  0.5525431 ,
        0.09347764,  0.06730296,  0.73783684,  0.7105636 ,  1.0975491 ,
        1.3238546 ,  0.13173565,  2.3809848 ,  1.056398  ,  2.9809813 ,
       -3.7532227 ,  5.270201  ,  1.7451004 , -1.5945438 , -0.32

In [165]:
train_vectors = features_embeddings_trained(train_tokenized_corpus)
print(len(train_vectors))
train_vectors[0].shape

2010


(300,)

In [166]:
test_vectors = features_embeddings_trained(test_tokenized_corpus)
print(len(test_vectors))
test_vectors[0].shape

990


(300,)

In [167]:
pd.DataFrame(train_vectors)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.071765,-0.501332,0.266866,-0.539072,0.543804,0.260111,0.175274,-0.025657,0.591547,-0.472240,...,0.123862,-0.184058,-0.063175,1.502401,0.105758,0.561694,0.195198,-0.154106,-0.705223,0.553513
1,0.201421,-0.835796,0.167064,-0.570961,0.463955,0.147549,0.522221,-0.085878,0.511895,-0.483975,...,0.113176,-0.324834,-0.197113,1.404224,0.185555,0.433111,0.254629,0.201344,-0.938394,0.258309
2,0.100804,-0.761593,0.050553,-0.359987,0.032850,0.450145,-0.044176,-0.275377,0.272777,-0.240959,...,0.334790,-0.360708,0.444338,0.911963,-0.183864,0.532553,0.163177,0.660180,-0.567780,0.196322
3,-0.059055,-0.613561,-0.352019,-0.197341,0.321426,-0.136975,0.484984,-0.339944,0.397234,-0.211405,...,0.324440,-0.049765,-0.042620,0.549133,0.031745,0.500848,0.243917,0.154506,-0.393167,0.501330
4,-0.247734,-0.122627,-0.234251,0.113897,0.412909,0.203975,0.196843,0.114238,-0.100210,0.260022,...,0.590531,-0.380305,-0.021766,0.510044,-0.107208,0.132068,-0.025983,0.021583,0.103656,0.356382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005,0.145286,-0.660611,0.096615,0.059080,0.418366,-0.048677,0.231591,-0.360000,0.168896,0.012134,...,0.323186,0.065004,-0.259029,1.012612,-0.184087,0.426128,0.422127,0.025579,-0.266824,-0.152251
2006,-0.132375,0.007622,-0.060736,-0.169088,-0.065853,-0.240801,-0.004792,-0.474806,0.204424,-0.280626,...,0.336286,-0.373046,-0.101730,0.558735,0.085740,0.723193,0.238508,0.306485,-0.265801,0.111616
2007,0.061875,-0.504767,0.128449,-0.309454,0.239758,0.448429,0.053110,-0.022847,0.268978,-0.157851,...,0.119790,-0.248116,0.137337,0.471121,-0.248154,0.209134,0.027269,0.123677,-0.271872,0.236434
2008,0.481774,-0.456385,0.371910,-0.351619,0.140414,0.081677,0.249063,-0.453577,0.048269,-0.103956,...,-0.006385,-0.106864,-0.074057,0.126658,-0.418518,0.317682,0.147618,0.253540,-0.325074,-0.163294


In [168]:
model = LogisticRegression()

In [169]:
model.fit(train_vectors, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [170]:
train_pred = model.predict(train_vectors)
test_pred = model.predict(test_vectors)

In [171]:
confusion_matrix(train_y, train_pred)

array([[917, 115],
       [110, 868]])

In [172]:
confusion_matrix(test_y, test_pred)

array([[403,  91],
       [113, 383]])

In [173]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.8880597014925373
Test Accuracy :  0.793939393939394


# Fasttext Pre trained

In [None]:
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')

In [194]:
ft.save_model('cc.en.100.bin')


In [178]:
ft.get_dimension()

300

In [179]:
ft.get_word_vector('good')

array([-0.09213716, -0.0634383 ,  0.00173813,  0.13524324, -0.06561062,
        0.00619071,  0.12609869, -0.01646539,  0.0174491 , -0.00126792,
       -0.09709831,  0.02329333,  0.00996784,  0.00463419,  0.01587938,
        0.00689824,  0.08575399, -0.01988525, -0.0601579 , -0.02327966,
        0.01183712,  0.08217917,  0.01488847,  0.00902181,  0.00696296,
       -0.06426616,  0.03345198, -0.02101481,  0.06767873,  0.03022419,
        0.07203474, -0.05689922, -0.04370377,  0.00642597,  0.0439174 ,
        0.0604848 , -0.00611545, -0.12256738, -0.03530414, -0.02696739,
       -0.02058216,  0.00752347, -0.00686451,  0.0362783 , -0.03308735,
        0.05801626,  0.00832448, -0.06336953, -0.05775082,  0.01089846,
       -0.0925179 ,  0.01559984, -0.04079024,  0.0066871 , -0.06374165,
        0.05881973,  0.07209535, -0.05387195, -0.14658651, -0.04046486,
       -0.02507038, -0.04954465, -0.05224417, -0.06846938,  0.0467079 ,
        0.00459271, -0.07522177,  0.03627685, -0.0698283 ,  0.01

In [182]:
# Creating a feature vector by averaging all embeddings for all sentences
def features_embeddings_pretrained_fasttext(w2v_model, list_of_lists):
    
    # feature vector column length 
    DIMENSION = w2v_model.get_dimension()
    
    # Zero vector if words not in our w2v_model vocubulary
    zero_vector = np.zeros(DIMENSION)
    
    features = []
    
    # feature vector for each sentence in list_of_lists one by one
    for tokens in list_of_lists:
        
        # feature vector for a single sentence
        feature =  np.zeros(DIMENSION) # shape = 1X DIMENSION
        
        # count for each token/word in a tokens or sentence
        feature_counts = 0 + 1e-5   # to avoid divide-by-zero 
        for token in tokens:
            feature += w2v_model.get_word_vector(token) # Here both feature and w2v_model[token] are of same shape : 1X DIMENSION
            feature_counts +=1    # Count increases for each token in a tokens
         
        # Average the feature values for each tokens
        if(feature_counts !=0 ):        # for non empty tokens
            features.append(feature/feature_counts) 
        else:                           # for empty tokens
            features.append(zero_vector)     
    return features

In [183]:
train_vectors = features_embeddings_pretrained_fasttext(ft, train_tokenized_corpus)
print(len(train_vectors))
train_vectors[0].shape

2010


(300,)

In [184]:
test_vectors = features_embeddings_pretrained_fasttext(ft, test_tokenized_corpus)
print(len(test_vectors))
test_vectors[0].shape

990


(300,)

In [185]:
model = LogisticRegression()

In [186]:
model.fit(train_vectors, train_y)

In [187]:
train_pred = model.predict(train_vectors)
test_pred = model.predict(test_vectors)

In [188]:
confusion_matrix(train_y, train_pred)

array([[834, 198],
       [260, 718]])

In [189]:
confusion_matrix(test_y, test_pred)

array([[402,  92],
       [143, 353]])

In [190]:
print("Train Accuracy : ",accuracy_score(train_y, train_pred))
print("Test Accuracy : ",accuracy_score(test_y, test_pred))

Train Accuracy :  0.7721393034825871
Test Accuracy :  0.7626262626262627
