# Preparing The Data

## Reading the dataset

In [None]:
# Download the IMDB reviews CSV from Google Drive (addressed by file id) into the working directory.
!gdown "1WMfGVZ6W3EWTI8HI2DzcJjIpcl1bp9xf"

Downloading...
From: https://drive.google.com/uc?id=1WMfGVZ6W3EWTI8HI2DzcJjIpcl1bp9xf
To: /content/IMDB Dataset.csv
100% 66.2M/66.2M [00:00<00:00, 90.1MB/s]


In [None]:
import pandas as pd

# Load the downloaded reviews file; later cells rely on its 'review' and 'sentiment' columns.
df_review = pd.read_csv('IMDB Dataset.csv')

In [None]:
# Preview the loaded reviews (pandas truncates the display of large frames).
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
# Keep the first 1000 reviews of each sentiment so both classes are equally represented.
df_positive = df_review.loc[df_review['sentiment'] == 'positive'].head(1000)
df_negative = df_review.loc[df_review['sentiment'] == 'negative'].head(1000)

# Stack the two class samples into one balanced frame (original row labels are kept).
df_review_bal = pd.concat([df_positive, df_negative])

## Splitting data into train and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the balanced sample for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    df_review_bal,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Inputs are the raw review texts; targets are the sentiment labels.
train_x = train['review']
train_y = train['sentiment']
test_x = test['review']
test_y = test['sentiment']

# Text Representation (Bag of Words)

## Turning our text data into numerical vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training reviews only (English stop words removed)
# and encode the training set; the result is a sparse matrix, shown as the cell output.
tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(train_x)
train_x_vector

<1340x20202 sparse matrix of type '<class 'numpy.float64'>'
	with 116136 stored elements in Compressed Sparse Row format>

In [None]:
# Show the TF-IDF matrix as a sparse DataFrame: one row per training review, one column per term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported way to obtain the column (term) names.
pd.DataFrame.sparse.from_spmatrix(train_x_vector, index=train_x.index, columns=tfidf.get_feature_names_out())



Unnamed: 0,00,000,007,02,06,08,10,100,1000,100th,...,zp,zu,zuber,zucker,zulu,zwick,zzzzzzzzzzzzzzzzzz,æon,élan,être
191,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1819,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
36,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
786,0.0,0.0,0.0,0.0,0.0,0.0,0.044532,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.111610,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
553,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1685,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
921,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.183876,0.0,0.0,0.0


In [None]:
# Encode the test reviews with the vectorizer already fitted on the training set (transform only, no refit).
test_x_vector = tfidf.transform(test_x)

# Model Selection

## SVM

In [None]:
from sklearn.svm import SVC
# Fit a linear-kernel support vector classifier on the TF-IDF training vectors.
svc = SVC(kernel='linear')
svc.fit(train_x_vector, train_y)

SVC(kernel='linear')

In [None]:
# Sanity-check the fitted SVM on a few hand-written reviews, printing one prediction per review.
for sample_review in (
    'A good movie',
    'An excellent movie',
    'I did not like this movie at all',
):
    print(svc.predict(tfidf.transform([sample_review])))

['positive']
['positive']
['negative']


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree on the TF-IDF training vectors.
# random_state is fixed because the split search permutes features randomly, so an
# unseeded tree can produce a different model (and test score) on every run.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(train_x_vector, train_y)

DecisionTreeClassifier()

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes; the sparse TF-IDF matrix is converted to a dense array before fitting.
# NOTE(review): MultinomialNB / ComplementNB are the usual Naive Bayes variants for
# term-weight features and accept sparse input — worth comparing against this baseline.
gnb = GaussianNB()
gnb.fit(train_x_vector.toarray(), train_y)

GaussianNB()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier (library default settings) on the TF-IDF training vectors.
log_reg = LogisticRegression()
log_reg.fit(train_x_vector, train_y)

LogisticRegression()

# Model Evaluation

## Mean Accuracy

In [None]:
# Usage: estimator.score(test_samples, true_labels) -> mean accuracy on the test set.
# Naive Bayes was fitted on a dense array, so it is scored on the dense test matrix as well.
for model_label, fitted_model, test_features in (
    ('SVM: ', svc, test_x_vector),
    ('Decision tree: ', dec_tree, test_x_vector),
    ('Naive Bayes: ', gnb, test_x_vector.toarray()),
    ('Logistic Regression: ', log_reg, test_x_vector),
):
    print(model_label, fitted_model.score(test_features, test_y))

SVM:  0.8378787878787879
Decision tree:  0.6818181818181818
Naive Bayes:  0.6287878787878788
Logistic Regression:  0.8121212121212121


## F1 Score

In [None]:
# F1 score = 2 * (recall * precision) / (recall + precision)
# The F1 score ranges from 0 (worst) to 1 (best).

from sklearn.metrics import f1_score

# average=None yields one F1 value per class, in the order given by `labels`.
f1_score(test_y, svc.predict(test_x_vector), labels=['positive', 'negative'], average=None)

array([0.8410104 , 0.83462133])

## Classification Report

In [None]:
# Letícia's favorite
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the SVM predictions on the test set.
print(classification_report(test_y, svc.predict(test_x_vector), labels=['positive', 'negative']))

              precision    recall  f1-score   support

    positive       0.81      0.87      0.84       325
    negative       0.87      0.81      0.83       335

    accuracy                           0.84       660
   macro avg       0.84      0.84      0.84       660
weighted avg       0.84      0.84      0.84       660



## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true labels and columns are predicted labels, both ordered as in `labels`.
conf_mat = confusion_matrix(test_y, svc.predict(test_x_vector), labels=['positive', 'negative'])
conf_mat

array([[283,  42],
       [ 65, 270]])

# Tuning the Model

## GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: regularization strength C and kernel type.
parameters = {'C': [1, 4, 8, 16, 32], 'kernel': ['linear', 'rbf']}
# Hand a fresh SVC() straight to the search rather than rebinding `svc`: that name holds
# the fitted model from the SVM section, and replacing it with an unfitted estimator
# would break the evaluation cells if they are re-run after this one.
svc_grid = GridSearchCV(SVC(), parameters, cv=5)

# 5-fold cross-validated search over every (C, kernel) combination on the training vectors.
svc_grid.fit(train_x_vector, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 4, 8, 16, 32], 'kernel': ['linear', 'rbf']})

In [None]:
# Best hyperparameter combination found by the search, and the corresponding estimator.
print(svc_grid.best_params_)
print(svc_grid.best_estimator_)

{'C': 1, 'kernel': 'linear'}
SVC(C=1, kernel='linear')
