In [1]:
import numpy as numpy
import pandas as pd

In [2]:
data = pd.read_csv("dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
data.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [5]:
#check if there are multiple same reviews in the dataset
data.duplicated().value_counts()
#removing those values
data.drop_duplicates(inplace=True)

In [6]:
#libraries to clean the text
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup

In [7]:
#cleaner
def cleanText(text, stemmer=PorterStemmer(), stopWords=set(stopwords.words("english"))):
    soup=BeautifulSoup(text,"html.parser")
    html_text=soup.get_text().lower()
    clean_text=[]
    for i in text.split():
        if i not in stopWords and i.isalpha():
            clean_text.append(stemmer.stem(i))
    return " ".join(clean_text)

In [8]:
data.review=data.review.apply(cleanText)

  soup=BeautifulSoup(text,"html.parser")


In [9]:
#vectorizing the text
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500) #play with max_features=[number] argument

In [10]:
X = cv.fit_transform(data.review).toarray()

In [11]:
X.shape

(49582, 500)

In [12]:
#encoding the label
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data.sentiment = le.fit_transform(data.sentiment)

In [13]:
y = data.iloc[:,-1].values

In [14]:
y.shape

(49582,)

#Builiding the model

In [15]:
#split the data
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42, stratify=data.sentiment)

In [16]:
#import the navieBayes models
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

clf0, clf1, clf2 = GaussianNB(), MultinomialNB(), BernoulliNB()

clf0.fit(train_X, train_y)
clf1.fit(train_X, train_y)
clf2.fit(train_X, train_y)

In [17]:
pred0 = clf0.predict(test_X)
pred1 = clf1.predict(test_X)
pred2 = clf2.predict(test_X)

In [18]:
#calculating the accuracy of each algo
from sklearn.metrics import accuracy_score

print(f"Gaussian Naive Bayes: {accuracy_score(pred0, test_y)}")
print(f"Multinomial Naive Bayes: {accuracy_score(pred1, test_y)}")
print(f"Bernoulli Navie Bayes: {accuracy_score(pred2, test_y)}")

Gaussian Naive Bayes: 0.7798729454472119
Multinomial Naive Bayes: 0.8040738126449531
Bernoulli Navie Bayes: 0.7992336392054049


In [19]:
#to track a word and its corresponding number
features_dict={}
for i in range(len(cv.get_feature_names_out())):
    features_dict[cv.get_feature_names_out()[i]]=i

In [20]:
#saving the model to pickle file
import pickle
pickle.dump(data, open("dataframe.pkl", "wb"))
pickle.dump(features_dict, open("features_dict.pkl", "wb"))
