# IMDB review classifier using word embeddings
### Dataset- https://drive.google.com/file/d/1zXOovxa6RSqmIOaFu9RNlluVlmyBfJ0M/view?usp=sharing


In [1]:
import pandas as pd


### Reading table 

In [2]:
df = pd.read_table('imdb_labelled.txt', header = None)
df.columns=['Review','Sentiments']
df

Unnamed: 0,Review,Sentiments
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
5,"The rest of the movie lacks art, charm, meanin...",0
6,Wasted two hours.,0
7,Saw the movie today and thought it was a good ...,1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the scien...,1


In [3]:
with open("imdb_labelled.txt","r") as text_file:
    lines = text_file.read().split('\n')

In [4]:
lines[1:10]

['Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  \t0',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  \t0',
 'Very little music or anything to speak of.  \t0',
 'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  \t1',
 "The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  \t0",
 'Wasted two hours.  \t0',
 'Saw the movie today and thought it was a good effort, good messages for kids.  \t1',
 'A bit predictable.  \t0',
 'Loved the casting of Jimmy Buffet as the science teacher.  \t1']

### Splitting the dataset by tab and remove corrupted data if any or lines which are not tab seperated

In [5]:
lines = [line.split("\t") for line in lines if len(line.split("\t"))==2 and line.split("\t")[1]!='']
lines[1:7]

[['Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
  '0'],
 ['Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
  '0'],
 ['Very little music or anything to speak of.  ', '0'],
 ['The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  ',
  '1'],
 ["The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  ",
  '0'],
 ['Wasted two hours.  ', '0']]

### Seperate the Reviews from Sentiments

In [6]:
train_doc = [line[0] for line in lines ]
train_doc[1:7]

['Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
 'Very little music or anything to speak of.  ',
 'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  ',
 "The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  ",
 'Wasted two hours.  ']

# Seperate the sentiment from Reviews

In [7]:
train_labels = [int(line[1]) for line in lines]
train_labels[1:20]

[0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]

# "Using GloVe word embeddings(TfidVectorized) for the text for unweighted and normalized sentence vectors"


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer


## Instatiate the Tfidfvectorizer

In [9]:
Tfidf_vectorizer = TfidfVectorizer(binary='true')

### Train the documents
We use fit_transform() on the train data so that we learn the parameters of scaling on the train data and in the same time we scale the train data.

In [10]:
train_documents = Tfidf_vectorizer.fit_transform(train_doc)
train_documents

<1000x3051 sparse matrix of type '<class 'numpy.float64'>'
	with 12667 stored elements in Compressed Sparse Row format>

In [11]:
print(train_documents[0])

  (0, 2875)	0.20820982640162744
  (0, 2408)	0.3240544027225203
  (0, 1752)	0.34234019020013984
  (0, 92)	0.3921365649590639
  (0, 1750)	0.14974731557628881
  (0, 37)	0.22173623283795857
  (0, 749)	0.3921365649590639
  (0, 787)	0.3921365649590639
  (0, 3041)	0.34234019020013984
  (0, 1641)	0.29041234203157174


### Joining the similar word and Counting the repeated word 

In [12]:
Tfidf_vectorizer.vocabulary_

{'very': 2875,
 'slow': 2408,
 'moving': 1752,
 'aimless': 92,
 'movie': 1750,
 'about': 37,
 'distressed': 749,
 'drifting': 787,
 'young': 3041,
 'man': 1641,
 'not': 1815,
 'sure': 2571,
 'who': 2969,
 'was': 2921,
 'more': 1735,
 'lost': 1607,
 'the': 2642,
 'flat': 1036,
 'characters': 432,
 'or': 1864,
 'audience': 196,
 'nearly': 1779,
 'half': 1207,
 'of': 1839,
 'whom': 2973,
 'walked': 2909,
 'out': 1877,
 'attempting': 192,
 'artiness': 169,
 'with': 2989,
 'black': 290,
 'white': 2967,
 'and': 125,
 'clever': 479,
 'camera': 377,
 'angles': 131,
 'disappointed': 731,
 'became': 249,
 'even': 893,
 'ridiculous': 2202,
 'as': 174,
 'acting': 56,
 'poor': 2004,
 'plot': 1988,
 'lines': 1577,
 'almost': 104,
 'non': 1807,
 'existent': 922,
 'little': 1583,
 'music': 1762,
 'anything': 148,
 'to': 2698,
 'speak': 2454,
 'best': 276,
 'scene': 2270,
 'in': 1359,
 'when': 2959,
 'gerardo': 1135,
 'is': 1424,
 'trying': 2763,
 'find': 1019,
 'song': 2435,
 'that': 2641,
 'keeps': 1

## Splitting dataset

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x = train_documents
y = train_labels
X_train,X_test, Y_train,Y_test = train_test_split(x,y,test_size = 0.30,random_state = 7)


# Use different standard classifiers for classification of the texts.

## Naive Bayes

#### Naive Bayes classifier is a straightforward and powerful algorithm for the classification task.

## Types of Naive Bayes Algorithm

### (1) Multinomial Naive Bayes

In [15]:
from sklearn.naive_bayes import MultinomialNB

#### Fit the data

In [16]:
multi = MultinomialNB().fit(X_train,Y_train)


#### predict the data

In [17]:
multi_pred = multi.predict(X_test)

In [18]:
print(multi_pred)

[1 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0
 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 1 0 1 0 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 1
 0 0 0 0 1 0 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 1 1 1 0 0 0 1 1 0 0 1 0 1 0 1
 1 1 0 1 1 0 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 1
 0 1 0 1 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 1 0
 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0
 1 0 1 1 1 1 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1
 0 0 1 0]


#### Accuracy

In [19]:
from sklearn import metrics

multi_acc = metrics.accuracy_score(Y_test,multi_pred)
print("Multinomial accuracy score",multi_acc)

Multinomial accuracy score 0.7766666666666666


#### F1 Score

In [20]:
import numpy as np
multi_f1 = metrics.f1_score(Y_test, multi_pred, average='weighted', labels=np.unique(multi_pred))
multi_f1

0.7759565962731995

#### Confusion Matrix

In [21]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(Y_test,multi_pred)
print(matrix)


[[126  26]
 [ 41 107]]


#### Classification Report

In [22]:
from sklearn.metrics import classification_report 

print(classification_report(Y_test,multi_pred))

             precision    recall  f1-score   support

          0       0.75      0.83      0.79       152
          1       0.80      0.72      0.76       148

avg / total       0.78      0.78      0.78       300



### (2)Bernoulli Naive Bayes

In [23]:
from sklearn.naive_bayes import BernoulliNB

##### Fit the data

In [24]:
bern = BernoulliNB().fit(X_train,Y_train)

#### Predict

In [25]:
bern_pred = bern.predict(X_test)


#### Accuracy

In [26]:

bern_acc = metrics.accuracy_score(Y_test,bern_pred)
bern_acc

0.7666666666666667

#### Confusion matrix

In [27]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(Y_test,bern_pred)
print(matrix)


[[137  15]
 [ 55  93]]


#### F1 score

In [28]:
bern_f1 = metrics.f1_score(Y_test, bern_pred, average='weighted', labels=np.unique(bern_pred))
bern_f1

0.7620033914728682

#### Classification report

In [29]:
from sklearn.metrics import classification_report 

print(classification_report(Y_test,bern_pred))

             precision    recall  f1-score   support

          0       0.71      0.90      0.80       152
          1       0.86      0.63      0.73       148

avg / total       0.79      0.77      0.76       300



### (3)Gaussain Naive Bayes

In [30]:
from sklearn.naive_bayes import GaussianNB

##### Fit the data

In [31]:
gauss = GaussianNB()
gauss.fit(X_train.todense(),Y_train)

GaussianNB(priors=None)

#### Predict

In [32]:
gauss_pred =  gauss.predict(X_test.todense())

#### Accuracy

In [33]:
gauss_acc = metrics.accuracy_score(gauss_pred,Y_test)
gauss_acc

0.66

#### Confusion matrix

In [34]:
matrix = confusion_matrix(Y_test,gauss_pred)
print(matrix)


[[112  40]
 [ 62  86]]


#### F1 score

In [35]:
gauss_f1 = metrics.f1_score(Y_test, gauss_pred, average='weighted', labels=np.unique(gauss_pred))
gauss_f1

0.6578227576015405

#### Classification report

In [36]:
print(classification_report(gauss_pred,Y_test))

             precision    recall  f1-score   support

          0       0.74      0.64      0.69       174
          1       0.58      0.68      0.63       126

avg / total       0.67      0.66      0.66       300



### KNeighborsClassifier

In [37]:
from sklearn.neighbors import KNeighborsClassifier


##### Fit the data

In [38]:
KNN = KNeighborsClassifier()
KNN.fit(X_train,Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

#### Predict

In [39]:
KNN_pred = KNN.predict(X_test)

#### Accuracy

In [40]:
KNN_acc = metrics.accuracy_score(Y_test,KNN_pred)
KNN_acc

0.7166666666666667

#### Confusion matrix

In [41]:
KNN_mat = confusion_matrix(Y_test, KNN_pred)
KNN_mat

array([[116,  36],
       [ 49,  99]], dtype=int64)

#### F1 score

In [42]:
KNN_f1 = metrics.f1_score(Y_test, KNN_pred, average='weighted', labels=np.unique(KNN_pred))
KNN_f1

0.7159686846280463

#### Classification report

In [43]:
print(classification_report(Y_test, KNN_pred))

             precision    recall  f1-score   support

          0       0.70      0.76      0.73       152
          1       0.73      0.67      0.70       148

avg / total       0.72      0.72      0.72       300



## Support vector machine

In [44]:
from sklearn.svm import SVC

### (1)Linear SVC

In [45]:
lsvc =SVC(kernel = "linear",degree=8)

##### Fit the data

In [46]:
lsvc.fit(X_train,Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=8, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Predict

In [47]:
lsvc_pred = lsvc.predict(X_test)

#### Accuracy

In [48]:
lsvc_acc = metrics.accuracy_score(Y_test,lsvc_pred)
lsvc_acc

0.7566666666666667

#### Confusion matrix

In [49]:
matrix = confusion_matrix(Y_test,lsvc_pred)
print(matrix)


[[114  38]
 [ 35 113]]


#### F1 score

In [50]:
lsvc_f1 = metrics.f1_score(Y_test, lsvc_pred, average='weighted', labels=np.unique(lsvc_pred))
lsvc_f1

0.7566747778679023

#### Classification report

In [51]:
print(classification_report(Y_test,lsvc_pred))

             precision    recall  f1-score   support

          0       0.77      0.75      0.76       152
          1       0.75      0.76      0.76       148

avg / total       0.76      0.76      0.76       300



### (2)Poly SVC

In [52]:
psvc =SVC(kernel = "poly",degree= 8)

##### Fit the data

In [53]:
psvc.fit(X_train,Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=8, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Predict

In [54]:
psvc_pred = psvc.predict(X_test)

#### Accuracy

In [55]:
psvc_acc = metrics.accuracy_score(Y_test,psvc_pred)
psvc_acc

0.49333333333333335

#### Confusion matrix

In [56]:
matrix = confusion_matrix(Y_test,psvc_pred)
print(matrix)


[[  0 152]
 [  0 148]]


#### F1 score

In [57]:
psvc_f1 = metrics.f1_score(Y_test, psvc_pred, average='weighted', labels=np.unique(psvc_pred))
psvc_f1

0.6607142857142857

#### Classification report

In [58]:
print(classification_report(Y_test,psvc_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       152
          1       0.49      1.00      0.66       148

avg / total       0.24      0.49      0.33       300



  'precision', 'predicted', average, warn_for)


### (3)Gaussian SVC

In [59]:
gsvc = SVC(kernel = 'rbf')

##### Fit the data

In [60]:
gsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Predict

In [61]:
gsvc_pred = gsvc.predict(X_test)

#### Accuracy

In [62]:
gsvc_acc = metrics.accuracy_score(gsvc_pred, Y_test)
gsvc_acc

0.49333333333333335

#### Confusion matrix

In [63]:
gsvc_mat = confusion_matrix(Y_test, gsvc_pred)
gsvc_mat

array([[  0, 152],
       [  0, 148]], dtype=int64)

#### F1 score

In [64]:
gsvc_f1 =  metrics.f1_score(Y_test, gsvc_pred, average='weighted', labels=np.unique(gsvc_pred))
gsvc_f1

0.6607142857142857

#### Classification report

In [65]:
print(classification_report(Y_test, gsvc_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       152
          1       0.49      1.00      0.66       148

avg / total       0.24      0.49      0.33       300



  'precision', 'predicted', average, warn_for)


## Decision Tree Classifier

In [66]:
from sklearn.tree import DecisionTreeClassifier

##### Fit the data

In [67]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

#### Predict

In [68]:
dtree_pred = dtree.predict(X_test)

#### Accuracy

In [69]:
dtree_acc = metrics.accuracy_score(Y_test, dtree_pred)
dtree_acc

0.5966666666666667

#### Confusion matrix

In [70]:
dtree_mat = confusion_matrix(Y_test, dtree_pred)
dtree_mat

array([[83, 69],
       [52, 96]], dtype=int64)

#### F1 score

In [71]:
dtree_f1 = metrics.f1_score(Y_test, dtree_pred)
dtree_f1

0.6134185303514377

#### Classification report

In [72]:
print(classification_report(Y_test, dtree_pred))

             precision    recall  f1-score   support

          0       0.61      0.55      0.58       152
          1       0.58      0.65      0.61       148

avg / total       0.60      0.60      0.60       300



## Random Forest Classifier

In [73]:
from sklearn.ensemble import RandomForestClassifier
rand = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=7)

##### Fit the data

In [74]:
rand.fit(X_train , Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

#### Predict

In [75]:
rand_pred = rand.predict(X_test)

#### Accuracy

In [76]:
rand_acc = metrics.accuracy_score(Y_test, rand_pred)
rand_acc

0.6833333333333333

#### Confusion matrix

In [77]:
rand_mat = confusion_matrix(Y_test, rand_pred)
rand_mat

array([[105,  47],
       [ 48, 100]], dtype=int64)

#### F1 score

In [78]:
rand_f1 = metrics.f1_score(Y_test, rand_pred)
rand_f1

0.6779661016949152

#### Classification report

In [79]:
print(classification_report(Y_test, rand_pred))

             precision    recall  f1-score   support

          0       0.69      0.69      0.69       152
          1       0.68      0.68      0.68       148

avg / total       0.68      0.68      0.68       300



## Voting Classifier

In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [81]:
estimators =[]
model = LogisticRegression()
estimators.append(('logistic',model))
model1 = SVC(kernel = 'rbf')
estimators.append(('SVC',model1))
model2 = KMeans()
estimators.append(('KMeans',model2))
model3 = KNeighborsClassifier()
estimators.append(('KNN',model3))
voting = VotingClassifier(estimators)

##### Fit the data

In [82]:
voting.fit(X_train, Y_train)

VotingClassifier(estimators=[('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('SVC', ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

#### Predict

In [83]:
voting_pred = voting.predict(X_test)

  if diff:


#### Accuracy

In [84]:
voting_acc = metrics.accuracy_score(Y_test,voting_pred)
voting_acc

0.7266666666666667

#### Confusion matrix

In [85]:
voting_mat = confusion_matrix(Y_test, voting_pred)
voting_mat

array([[ 97,  55],
       [ 27, 121]], dtype=int64)

#### F1 score

In [86]:
voting_f1 = metrics.f1_score(Y_test, voting_pred)
voting_f1


0.7469135802469135

#### Classification report

In [87]:
print(classification_report(Y_test, voting_pred))

             precision    recall  f1-score   support

          0       0.78      0.64      0.70       152
          1       0.69      0.82      0.75       148

avg / total       0.74      0.73      0.72       300



# Comparing the accuracy of the classifiers using different standard classification metrics.


In [88]:
Algo_name = ["Multinolmial Naive Bayes","Bernoulli Naive Bayes","Gaussain Naive Bayes","KNN","Linear SVC","Poly SVC",
             "Gaussian SVC","Decision Tree Classifier","Random Forest Classifier","Voting Classifier"]
accuracy = [multi_acc,bern_acc,gauss_acc,KNN_acc,lsvc_acc,psvc_acc,gsvc_acc,dtree_acc,rand_acc,voting_acc]

In [89]:
res2 = pd.DataFrame({"Algorithms":Algo_name, "Accuracy":accuracy}, index = [1,2,3,4,5,6,7,8,9,10])
res2

Unnamed: 0,Algorithms,Accuracy
1,Multinolmial Naive Bayes,0.776667
2,Bernoulli Naive Bayes,0.766667
3,Gaussain Naive Bayes,0.66
4,KNN,0.716667
5,Linear SVC,0.756667
6,Poly SVC,0.493333
7,Gaussian SVC,0.493333
8,Decision Tree Classifier,0.596667
9,Random Forest Classifier,0.683333
10,Voting Classifier,0.726667
