In [None]:
import pandas as pd
import requests
from io import StringIO
import numpy as np
import string
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,recall_score,precision_score
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
import gdown
import warnings
warnings.filterwarnings('ignore')
import re
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
file_id = "1bVYJIkYz5utt5n1HJcCNMg2x83ETf7CP"

# Generate the direct download link
url = f"https://drive.google.com/uc?id={file_id}"

# Download the CSV file
output = "data.csv"
gdown.download(url, output, quiet=False)
df = pd.read_csv(output)
print(df.head())


Downloading...
From: https://drive.google.com/uc?id=1bVYJIkYz5utt5n1HJcCNMg2x83ETf7CP
To: /content/data.csv
100%|██████████| 66.2M/66.2M [00:00<00:00, 194MB/s]


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
def cleaning(text):
    # converting to lowercase, removing URL links, special characters, punctuations...
    text = text.lower() # converting to lowercase
    text = re.sub('https?://\S+|www\.\S+', '', text) # removing URL links
    text = re.sub(r"\b\d+\b", "", text) # removing number
    text = re.sub('<.*?>+', '', text) # removing special characters,
    #text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # punctuations
    text = re.sub('\n', '', text)
    text = re.sub('[’“”…]', '', text)

    #removing emoji:
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

   # removing short form:

    text=re.sub("isn't",'is not',text)
    text=re.sub("he's",'he is',text)
    text=re.sub("wasn't",'was not',text)
    text=re.sub("there's",'there is',text)
    text=re.sub("couldn't",'could not',text)
    text=re.sub("won't",'will not',text)
    text=re.sub("they're",'they are',text)
    text=re.sub("she's",'she is',text)
    text=re.sub("There's",'there is',text)
    text=re.sub("wouldn't",'would not',text)
    text=re.sub("haven't",'have not',text)
    text=re.sub("That's",'That is',text)
    text=re.sub("you've",'you have',text)
    text=re.sub("He's",'He is',text)
    text=re.sub("what's",'what is',text)
    text=re.sub("weren't",'were not',text)
    text=re.sub("we're",'we are',text)
    text=re.sub("hasn't",'has not',text)
    text=re.sub("you'd",'you would',text)
    text=re.sub("shouldn't",'should not',text)
    text=re.sub("let's",'let us',text)
    text=re.sub("they've",'they have',text)
    text=re.sub("You'll",'You will',text)
    text=re.sub("i'm",'i am',text)
    text=re.sub("we've",'we have',text)
    text=re.sub("it's",'it is',text)
    text=re.sub("don't",'do not',text)
    text=re.sub("that´s",'that is',text)
    text=re.sub("I´m",'I am',text)
    text=re.sub("it’s",'it is',text)
    text=re.sub("she´s",'she is',text)
    text=re.sub("he’s'",'he is',text)
    text=re.sub('I’m','I am',text)
    text=re.sub('I’d','I did',text)
    text=re.sub("he’s'",'he is',text)
    text=re.sub('there’s','there is',text)

    return text

df['review'] = df['review'].apply(cleaning)


In [None]:
def process(text):
    # lowercase it
    text = text.lower()
    # remove punctuation
    text = ''.join([t for t in text if t not in string.punctuation])
    # remove stopwords
    text = [t for t in text.split() if t not in stopwords.words('english')]
    # stemming
    st = Stemmer()
    text = [st.stem(t) for t in text]
    # return token list
    return text

In [None]:
X = df['review'].head(1000)

y = df['sentiment'].head(1000)
len(X)

1000

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
count_vectorizer = CountVectorizer(analyzer=process,tokenizer = token.tokenize)
X_train_countv = count_vectorizer.fit_transform(X_train)
X_test_countv = count_vectorizer.transform(X_test)

In [None]:
gnb = GaussianNB()
gnb.fit(X_train_countv.toarray(), y_train)
y_pred_gnb = gnb.predict(X_test_countv.toarray())#
print('Gaussian Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_gnb)))
y_pred_train = gnb.predict(X_train_countv.toarray())
print('Gaussian Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_gnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_gnb))

Gaussian Naïve Bayes Model accuracy score: 0.5900
Gaussian Naïve Bayes Training-set accuracy score: 0.9971
Confusion matrix

 [[92 69]
 [54 85]]
              precision    recall  f1-score   support

    negative       0.63      0.57      0.60       161
    positive       0.55      0.61      0.58       139

    accuracy                           0.59       300
   macro avg       0.59      0.59      0.59       300
weighted avg       0.59      0.59      0.59       300



In [None]:
param_grid_gnb = {'var_smoothing': np.logspace(-9, 0, 10)}
grid_gnb = GridSearchCV(GaussianNB(), param_grid_gnb, cv=3, scoring='accuracy')
grid_gnb.fit(X_train_countv.toarray(), y_train)

In [None]:
print("Best GaussianNB params:", grid_gnb.best_params_)
y_pred_gnb = grid_gnb.best_estimator_.predict(X_test_countv.toarray())
print("GaussianNB Accuracy:", accuracy_score(y_test, y_pred_gnb))

Best GaussianNB params: {'var_smoothing': np.float64(0.001)}
GaussianNB Accuracy: 0.6366666666666667


In [None]:
mnb = MultinomialNB()
mnb.fit(X_train_countv.toarray(), y_train)
y_pred_mnb = mnb.predict(X_test_countv.toarray())
print('Multinomial Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_mnb)))
y_pred_train = mnb.predict(X_train_countv.toarray())
print('Multinomial Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_mnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_mnb))

Multinomial Naïve Bayes Model accuracy score: 0.7867
Multinomial Naïve Bayes Training-set accuracy score: 0.9857
Confusion matrix

 [[136  25]
 [ 39 100]]
              precision    recall  f1-score   support

    negative       0.78      0.84      0.81       161
    positive       0.80      0.72      0.76       139

    accuracy                           0.79       300
   macro avg       0.79      0.78      0.78       300
weighted avg       0.79      0.79      0.79       300



In [None]:
param_grid_mnb = {'alpha': np.linspace(-2, 1, 10), 'fit_prior': [True, False]}
grid_mnb = GridSearchCV(MultinomialNB(), param_grid_mnb, cv=3, scoring='accuracy')
grid_mnb.fit(X_train_countv.toarray(), y_train)

In [None]:
print("Best MultinomialNB params:", grid_mnb.best_params_)
y_pred_mnb = grid_mnb.best_estimator_.predict(X_test_countv.toarray())
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_mnb))

Best MultinomialNB params: {'alpha': np.float64(1.0), 'fit_prior': False}
MultinomialNB Accuracy: 0.7866666666666666


In [None]:
bnb = BernoulliNB()
bnb.fit(X_train_countv.toarray(), y_train)
y_pred_bnb = bnb.predict(X_test_countv.toarray())
print('Bernoulli Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_bnb)))
y_pred_train = bnb.predict(X_train_countv.toarray())
print('Bernoulli Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_bnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naïve Bayes Model accuracy score: 0.7900
Bernoulli Naïve Bayes Training-set accuracy score: 0.9914
Confusion matrix

 [[133  28]
 [ 35 104]]
              precision    recall  f1-score   support

    negative       0.79      0.83      0.81       161
    positive       0.79      0.75      0.77       139

    accuracy                           0.79       300
   macro avg       0.79      0.79      0.79       300
weighted avg       0.79      0.79      0.79       300



In [None]:
param_grid_bnb = {'alpha': np.linspace(0.1, 10, 10), 'binarize': np.linspace(0.0, 1.0, 10)}
grid_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, cv=3, scoring='accuracy')
grid_bnb.fit(X_train_countv.toarray(), y_train)

In [None]:
print("Best BernoulliNB params:", grid_bnb.best_params_)
y_pred_bnb = grid_bnb.best_estimator_.predict(X_test_countv.toarray())
print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bnb))

Best BernoulliNB params: {'alpha': np.float64(1.2000000000000002), 'binarize': np.float64(0.0)}
BernoulliNB Accuracy: 0.7933333333333333


#tfidf

In [None]:
vectorizer = TfidfVectorizer(analyzer=process)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
gnb = GaussianNB()
gnb.fit(X_train_tfidf.toarray(), y_train)
y_pred_gnb = gnb.predict(X_test_tfidf.toarray())#
print('Gaussian Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_gnb)))
y_pred_train = gnb.predict(X_train_tfidf.toarray())
print('Gaussian Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_gnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_gnb))

Gaussian Naïve Bayes Model accuracy score: 0.5667
Gaussian Naïve Bayes Training-set accuracy score: 0.9971
Confusion matrix

 [[84 77]
 [53 86]]
              precision    recall  f1-score   support

    negative       0.61      0.52      0.56       161
    positive       0.53      0.62      0.57       139

    accuracy                           0.57       300
   macro avg       0.57      0.57      0.57       300
weighted avg       0.57      0.57      0.57       300



In [None]:
param_grid_gnb = {'var_smoothing': np.logspace(-9, 0, 10)}
grid_gnb = GridSearchCV(GaussianNB(), param_grid_gnb, cv=3, scoring='accuracy')
grid_gnb.fit(X_train_tfidf.toarray(), y_train)

In [None]:
print("Best GaussianNB params:", grid_gnb.best_params_)
y_pred_gnb = grid_gnb.best_estimator_.predict(X_test_tfidf.toarray())
print("GaussianNB Accuracy:", accuracy_score(y_test, y_pred_gnb))

Best GaussianNB params: {'var_smoothing': np.float64(1.0)}
GaussianNB Accuracy: 0.75


In [None]:
mnb = MultinomialNB()
mnb.fit(X_train_tfidf.toarray(), y_train)
y_pred_mnb = mnb.predict(X_test_tfidf.toarray())
print('Multinomial Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_mnb)))
y_pred_train = mnb.predict(X_train_tfidf.toarray())
print('Multinomial Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_mnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_mnb))

Multinomial Naïve Bayes Model accuracy score: 0.8033
Multinomial Naïve Bayes Training-set accuracy score: 0.9943
Confusion matrix

 [[130  31]
 [ 28 111]]
              precision    recall  f1-score   support

    negative       0.82      0.81      0.82       161
    positive       0.78      0.80      0.79       139

    accuracy                           0.80       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.80      0.80      0.80       300



In [None]:
param_grid_mnb = {'alpha': np.linspace(-2, 1, 10), 'fit_prior': [True, False]}
grid_mnb = GridSearchCV(MultinomialNB(), param_grid_mnb, cv=3, scoring='accuracy')
grid_mnb.fit(X_train_tfidf.toarray(), y_train)

In [None]:
print("Best MultinomialNB params:", grid_mnb.best_params_)
y_pred_mnb = grid_mnb.best_estimator_.predict(X_test_tfidf.toarray())
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_mnb))

Best MultinomialNB params: {'alpha': np.float64(1.0), 'fit_prior': True}
MultinomialNB Accuracy: 0.8033333333333333


In [None]:
bnb = BernoulliNB()
bnb.fit(X_train_tfidf.toarray(), y_train)
y_pred_bnb = bnb.predict(X_test_tfidf.toarray())
print('Bernoulli Naïve Bayes Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_bnb)))
y_pred_train = bnb.predict(X_train_tfidf.toarray())
print('Bernoulli Naïve Bayes Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_bnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naïve Bayes Model accuracy score: 0.7900
Bernoulli Naïve Bayes Training-set accuracy score: 0.9914
Confusion matrix

 [[133  28]
 [ 35 104]]
              precision    recall  f1-score   support

    negative       0.79      0.83      0.81       161
    positive       0.79      0.75      0.77       139

    accuracy                           0.79       300
   macro avg       0.79      0.79      0.79       300
weighted avg       0.79      0.79      0.79       300



In [None]:
param_grid_bnb = {'alpha': np.linspace(0.1, 10, 10), 'binarize': np.linspace(0.0, 1.0, 10)}
grid_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, cv=3, scoring='accuracy')
grid_bnb.fit(X_train_tfidf.toarray(), y_train)

In [None]:
print("Best BernoulliNB params:", grid_bnb.best_params_)
y_pred_bnb = grid_bnb.best_estimator_.predict(X_test_tfidf.toarray())
print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bnb))

Best BernoulliNB params: {'alpha': np.float64(1.2000000000000002), 'binarize': np.float64(0.0)}
BernoulliNB Accuracy: 0.7933333333333333
