In [1]:
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [2]:
!pip install tensorflow
!pip install keras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
#importing required libraries
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Embedding
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(stop_words='english')


In [4]:
df=pd.read_csv('/drive/MyDrive/IMDB Dataset.csv')

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [8]:
pos_df=df.loc[df['sentiment']=='positive']
pos_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive


In [9]:
neg_df=df.loc[df['sentiment']=='negative']
neg_df.head()

Unnamed: 0,review,sentiment
3,Basically there's a family where a little boy ...,negative
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
10,Phil the Alien is one of those quirky films wh...,negative
11,I saw this movie when I was about 12 when it c...,negative


In [10]:
##taking only 20000 records to reduce training time 10000 from positive and 10000 from negative
pos_df=pos_df.iloc[0:10000]
neg_df=neg_df.iloc[0:10000]
final_df=pd.concat([pos_df,neg_df])
final_df.shape

(20000, 2)

In [11]:
final_df['sentiment'].value_counts()

positive    10000
negative    10000
Name: sentiment, dtype: int64

In [12]:
## seperating features and target(labels)
x=final_df['review']
y=final_df['sentiment']

In [13]:
## splitting data into train and test sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=0)

In [14]:
## converting review column to numeric vectors using tf-idf

x_train=tfidf.fit_transform(x_train)
x_test=tfidf.transform(x_test)

In [21]:
pos_df_sam=pos_df[0:500]
neg_df_sam=neg_df[0:500]
sample_df=pd.concat([pos_df_sam,neg_df_sam])
sample_df.shape


(1000, 2)

In [22]:
## taking only sample of 1000 records to test for all models using cross val score and we can later train the best model on complete data
x_sam=sample_df['review']
y_sam=sample_df['sentiment']

In [23]:
x_sam=tfidf.transform(x_sam)

In [24]:
# using different models and using cross validation to find the best model

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [25]:
svc_rbf=SVC(kernel='rbf')
svc_lin=SVC(kernel='linear')
lr=LogisticRegression()
knc=KNeighborsClassifier(n_neighbors=5)
dtc=DecisionTreeClassifier()
rfc=RandomForestClassifier(n_estimators=100)
models=[lr,knc,dtc,rfc,svc_lin,svc_rbf]

In [26]:
from sklearn.model_selection import cross_val_score
for model in models:
    accuracy=cross_val_score(model,x_sam,y_sam,scoring='accuracy',cv=10)
    
    print(model)
    print('accuracy_score :',accuracy.mean())

LogisticRegression()
accuracy_score : 0.807
KNeighborsClassifier()
accuracy_score : 0.647
DecisionTreeClassifier()
accuracy_score : 0.6839999999999999
RandomForestClassifier()
accuracy_score : 0.786
SVC(kernel='linear')
accuracy_score : 0.819
SVC()
accuracy_score : 0.805


We can see from the above about that SupportVectorClassifier is giving best accuracy of around 82% with linear kernel, so proceeding with svc and training it on complete data

In [27]:
svc_model=SVC(kernel='linear')
svc_model.fit(x_train,y_train)

SVC(kernel='linear')

In [28]:
y_pred=svc_model.predict(x_test)
y_pred

array(['negative', 'positive', 'negative', ..., 'negative', 'positive',
       'negative'], dtype=object)

In [32]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
print('accuracy score:',accuracy_score(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))
print(classification_report(y_pred,y_test))

accuracy score: 0.8845
[[2626  309]
 [ 384 2681]]
              precision    recall  f1-score   support

    negative       0.87      0.89      0.88      2935
    positive       0.90      0.87      0.89      3065

    accuracy                           0.88      6000
   macro avg       0.88      0.88      0.88      6000
weighted avg       0.88      0.88      0.88      6000



In [36]:
import pickle

with open('/drive/MyDrive/SupportVectorClassifier.pkl','wb') as clf:
  pickle.dump(svc_model,clf)