# Scikit Learn

In [56]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# from bs4 import BeautifulSoup
import re
import nltk

In [2]:
df = pd.read_csv("data/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# checking if data is biased

df['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

## Cleaning data

#### Removing HTML strips


In [4]:

# manual way
# df['review'].apply(lambda x: x.replace('<br /><br />', ''))

def html_strips_remove(text):
    text = re.sub('<\w*\s*/>', '', text)
    return text
df['review'] = df['review'].apply(html_strips_remove)

#### Removing special Characters

In [5]:
def sp_char_remove(text):
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    return text
df['review'] = df['review'].apply(sp_char_remove)

#### Test stemming

In [6]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
df['review'] = df['review'].apply(simple_stemmer)

#### Removing stopwords

In [7]:
#stop = set( nltk.corpus.stopwords.words('English'))
stop = [word.lower() for word in nltk.corpus.stopwords.words('English')]


# removing stopwords
def remove_stopwords(text):
    filtered_list=[]
    split_text = text.split(' ')
    for word in split_text:
        if word.lower() in stop:
            pass
        else:
            filtered_list.append(word)
    filtered_text = ' '.join(filtered_list) 
    return filtered_text

df['review'] = df['review'].apply(remove_stopwords)    

text1 = df['review'][0]

#### Labeling sentiment

In [14]:
lb = sklearn.preprocessing.LabelBinarizer()
df['lb_sentiment'] = lb.fit_transform(df['sentiment'])

In [15]:
df.head()

Unnamed: 0,review,sentiment,lb_sentiment
0,one review ha mention watch 1 Oz episod youll ...,positive,1
1,wonder littl product film techniqu veri unassu...,positive,1
2,thought thi wa wonder way spend time hot summe...,positive,1
3,basic famili littl boy jake think zombi hi clo...,negative,0
4,petter mattei love time money visual stun film...,positive,1


## Splitting data

In [16]:
X = df['review']
y = df['lb_sentiment']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

## Vectorization

### Bag of words

In [34]:
# Count Vectorizer
cv_vect = CountVectorizer()
cv_x_train_vector = cv_vect.fit_transform(x_train)
cv_x_test_vector = cv_vect.transform(x_test)

# print(x_train[0])
# print(x_train_vector[0])

### TF-IDF features

In [33]:
# TF-IDF Vectorizer
tv_vectorizer = TfidfVectorizer()

tv_x_train_vector = tv_vectorizer.fit_transform(x_train)
tv_x_test_vector = tv_vectorizer.transform(x_test)

# print(x_train[0])
# print(x_train_vector[0])

In [35]:
tv_x_train_vector.shape

(40000, 155950)

In [36]:
cv_x_train_vector.shape

(40000, 155950)

# Classification

### SVM

In [40]:
# fitting the model
cv_svm = sklearn.svm.SVC(verbose=1, max_iter=500)
cv_svm.fit(cv_x_train_vector, y_train)

# Prediction
y_pred = cv_svm.predict(cv_x_test_vector)

#Accuracy
cv_svm_acc = accuracy_score(y_test, y_pred)
print('Count Vectorizer SVM Accuracy: ', cv_svm_acc)

[LibSVM]



Count Vectorizer Accuracy:  0.5458


In [41]:
# fitting the model
tv_svm = sklearn.svm.SVC(verbose=1, max_iter=500)
tv_svm.fit(tv_x_train_vector, y_train)

# Prediction
y_pred = tv_svm.predict(tv_x_test_vector)

#Accuracy
tv_svm_acc = accuracy_score(y_test, y_pred)
print('TFVID Vectorizer SVM Accuracy: ', tv_svm_acc)

[LibSVM]



TFVID Vectorizer SVM Accuracy:  0.7359


In [46]:
# Classification_report_tfidf
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))  # alphabatical order

              precision    recall  f1-score   support

    Negative       0.73      0.75      0.74      4988
    Positive       0.74      0.73      0.73      5012

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000



### Logistic Regression

In [37]:
# fitting the model
cv_lr = sklearn.linear_model.LogisticRegression(verbose=1, max_iter=500)
cv_lr.fit(cv_x_train_vector, y_train)

# Prediction
y_pred = cv_lr.predict(cv_x_test_vector)

#Accuracy
cv_lr_acc = accuracy_score(y_test, y_pred)
print('Count Vectorizer Logistic Regression Accuracy: ', cv_lr_acc)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Count Vectorizer Accuracy:  0.8818


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.6s finished


In [69]:
# fitting the model
tv_lr = sklearn.linear_model.LogisticRegression(verbose=1, max_iter=500)
tv_lr.fit(tv_x_train_vector, y_train)

# Prediction
y_pred = tv_lr.predict(tv_x_test_vector)

#Accuracy
tv_lr_acc = accuracy_score(y_test, y_pred)
print('TFVID Logistic Regression Accuracy: ', tv_lr_acc)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


TFVID Logistic Regression Accuracy:  0.889


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.5s finished


In [65]:
# Classification_report_tfidf
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))  # alphabatical order

              precision    recall  f1-score   support

    Negative       0.85      0.88      0.86      4988
    Positive       0.87      0.84      0.86      5012

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



### Stochastic gradient descent or Linear support vector machines

In [49]:
# fitting the model
cv_sgd = sklearn.linear_model.SGDClassifier(verbose=0, max_iter=500)
cv_sgd.fit(cv_x_train_vector, y_train)

# Prediction
y_pred = cv_sgd.predict(cv_x_test_vector)

#Accuracy
cv_sgd_acc = accuracy_score(y_test, y_pred)
print('Count Vectorizer SGD Accuracy: ', cv_sgd_acc)

Count Vectorizer SGD Accuracy:  0.8764


In [52]:
# fitting the model
tv_sgd = sklearn.linear_model.SGDClassifier(verbose=0, max_iter=500)
tv_sgd.fit(tv_x_train_vector, y_train)

# Prediction
y_pred = tv_sgd.predict(tv_x_test_vector)

#Accuracy
tv_sgd_acc = accuracy_score(y_test, y_pred)
print('Count Vectorizer SGD Accuracy: ', tv_sgd_acc)

-- Epoch 1
Norm: 47.86, NNZs: 97485, Bias: 0.050102, T: 40000, Avg. loss: 0.342546
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 45.89, NNZs: 107587, Bias: 0.049538, T: 80000, Avg. loss: 0.278299
Total training time: 0.05 seconds.
-- Epoch 3
Norm: 45.53, NNZs: 109702, Bias: 0.028649, T: 120000, Avg. loss: 0.266842
Total training time: 0.07 seconds.
-- Epoch 4
Norm: 45.33, NNZs: 110494, Bias: 0.013332, T: 160000, Avg. loss: 0.261676
Total training time: 0.08 seconds.
-- Epoch 5
Norm: 45.24, NNZs: 110824, Bias: 0.023834, T: 200000, Avg. loss: 0.258857
Total training time: 0.12 seconds.
-- Epoch 6
Norm: 45.23, NNZs: 111062, Bias: 0.033027, T: 240000, Avg. loss: 0.256949
Total training time: 0.13 seconds.
-- Epoch 7
Norm: 45.22, NNZs: 111200, Bias: 0.025426, T: 280000, Avg. loss: 0.255282
Total training time: 0.16 seconds.
-- Epoch 8
Norm: 45.20, NNZs: 111281, Bias: 0.026107, T: 320000, Avg. loss: 0.254294
Total training time: 0.18 seconds.
-- Epoch 9
Norm: 45.18, NNZs: 111316, Bias:

### Multinominal Naive Bayes

In [61]:
# fitting the model
cv_mnb = sklearn.naive_bayes.MultinomialNB()
cv_mnb.fit(cv_x_train_vector, y_train)

# prediction
y_pred = cv_mnb.predict(cv_x_test_vector)

# accuracy
cv_mnb_acc = accuracy_score(y_test, y_pred)
print('Count Vectorizer MNB Accuracy: ', cv_mnb_acc)

Count Vectorizer MNB Accuracy:  0.85


In [63]:
# fitting the model
tv_mnb = sklearn.naive_bayes.MultinomialNB()
tv_mnb.fit(tv_x_train_vector, y_train)

# prediction
y_pred = tv_mnb.predict(tv_x_test_vector)

# accuracy
tv_mnb_acc = accuracy_score(y_test, y_pred)
print('Count Vectorizer MNB Accuracy: ', tv_mnb_acc)

Count Vectorizer MNB Accuracy:  0.8589


In [64]:
# Classification_report_tfidf
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))  # alphabatical order

              precision    recall  f1-score   support

    Negative       0.85      0.88      0.86      4988
    Positive       0.87      0.84      0.86      5012

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



### Conclusion

Both logistic regression and multinomial naive bayes model performing well compared to linear support vector  machines.