# Program to predict review whether good or bad by using Naive Bayes Algorithm

"""
@author: nSyafi
"""

In [1]:
# import tensorflow as tf
# import tensorflow.keras as keras
import pandas as pd
import numpy as np
import nltk

# For stop words
from nltk.corpus import stopwords
# For tokenizing
from nltk.tokenize import word_tokenize
# For stemming
from nltk.stem.snowball import SnowballStemmer
'''Note: PorterStemmer is an option. Other than Stemming, Lemmatization is another method'''


# Library for splitting the data
from sklearn.model_selection import train_test_split
# Library to vectorize text
from sklearn.feature_extraction.text import CountVectorizer
# Library for naive bayes algorithm
from sklearn.naive_bayes import MultinomialNB

### Prerequisites if needed

In [2]:
# Loading data frame
df = pd.read_csv('dataset_elec_4000.csv')
df.head()

Unnamed: 0,review,rating
0,This case is just beautiful. I can't think of ...,1.0
1,My husband purchased these because he likes mo...,1.0
2,Very disappointed. This item worked a time or...,0.0
3,"...first of all, this Lightning cable does exa...",1.0
4,"Very bad, slow, flakey software. Very slow. I ...",0.0


# Preprocess(1): Removing Stopwords

In [3]:
# Selecting the stopwords for English dictionary
stop = stopwords.words('english')

# Column labels for the dataframe
df.columns = ["review", "rating"]

df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

df.head()

Unnamed: 0,review,rating
0,This case beautiful. I can't think anything I ...,1.0
1,My husband purchased likes movies I reader. I ...,1.0
2,"Very disappointed. This item worked time two, ...",0.0
3,"...first all, Lightning cable exactly supposed...",1.0
4,"Very bad, slow, flakey software. Very slow. I ...",0.0


# Preprocess(2): Removing Punctuations

In [4]:
# import string

# Removing punctuation
# df['review'].str.replace('[{}]'.format(string.punctuation), '')
df["review"] = df["review"].str.replace('[^\w\s]','')

df.head()

  df["review"] = df["review"].str.replace('[^\w\s]','')


Unnamed: 0,review,rating
0,This case beautiful I cant think anything I li...,1.0
1,My husband purchased likes movies I reader I h...,1.0
2,Very disappointed This item worked time two ne...,0.0
3,first all Lightning cable exactly supposed do ...,1.0
4,Very bad slow flakey software Very slow I repl...,0.0


# Preprocess(3): Tokenization

In [5]:
# Creating a new row for tokenized words
# df['tokenized'] = df['review'].apply(word_tokenize)
df['review'] = df['review'].apply(word_tokenize)

df.head()

Unnamed: 0,review,rating
0,"[This, case, beautiful, I, cant, think, anythi...",1.0
1,"[My, husband, purchased, likes, movies, I, rea...",1.0
2,"[Very, disappointed, This, item, worked, time,...",0.0
3,"[first, all, Lightning, cable, exactly, suppos...",1.0
4,"[Very, bad, slow, flakey, software, Very, slow...",0.0


# Preprocess(4): Stemming (note: this process also does lowercase casting)

In [6]:
# Stemmer of the English language
stemmer = SnowballStemmer("english")

# Apply stemming to the 'review'
df['review'] = df['review'].apply(lambda x: [stemmer.stem(y) for y in x])

df.head()

Unnamed: 0,review,rating
0,"[this, case, beauti, i, cant, think, anyth, i,...",1.0
1,"[my, husband, purchas, like, movi, i, reader, ...",1.0
2,"[veri, disappoint, this, item, work, time, two...",0.0
3,"[first, all, lightn, cabl, exact, suppos, do, ...",1.0
4,"[veri, bad, slow, flakey, softwar, veri, slow,...",0.0


# Preprocessing(5): Rejoining tokenized words

In [7]:
# Joining the token back together again
# Space is given in between words
df['review'] = df['review'].apply(lambda s: ' '.join(s))

df.head()

Unnamed: 0,review,rating
0,this case beauti i cant think anyth i like it ...,1.0
1,my husband purchas like movi i reader i hear r...,1.0
2,veri disappoint this item work time two never ...,0.0
3,first all lightn cabl exact suppos do move dat...,1.0
4,veri bad slow flakey softwar veri slow i repla...,0.0


# Phase2 Machine Learning Model

# Splitting Data for Training and Test

In [8]:
# Splitlitting the data for training and test
x = df['review']
y = df['rating']

x, x_test, y, y_test = train_test_split(x,y, stratify=y, test_size=0.25, random_state=42)

# Vectorize the texts

In [9]:
# Vectorize reviews to numbers
vec = CountVectorizer(stop_words='english')
x = vec.fit_transform(x).toarray()
x_test = vec.transform(x_test).toarray()

# Model generation

In [10]:
# Selecting the model and fitting
model = MultinomialNB()
model.fit(x, y)

# model.predict(x_test)

MultinomialNB()

### For testing the accuracy and prediction

In [None]:
# Testing the accuracy of the model
model.score(x_test, y_test)

0.846

In [12]:
model.predict(vec.transform(['This thing is Terribly Good!']))

array([1.])

In [13]:
model.predict(vec.transform(['An amazingly horrendous purchase']))

print(model.predict(vec.transform(['An amazingly horrendous purchase'])))

[0.]


# Main function

In [14]:
def main():
    while True:
        print("Program will predict whether the review is good or bad")
        inp = input("Enter your review ")
        prediction = int(model.predict(vec.transform([inp])))
        print("\n")
        print("Your review: ", inp)
        if prediction == 1.0:
            print("The model predicts the review is GOOD!")
        elif prediction == 0.0:
            print("The model predicts the review is BAD!")
        else:
            print("Unable to be defined")
        
        print("\n")
        inp2 = input("Type 1 or anything:Continue and 2:Stop ")
        if inp2 == '1':
            print("\n")
            pass
        elif inp2 == '2':
            print("\nThanks for using the program")
            break
    
main()

Program will predict whether the review is good or bad
Enter your review You suck ass mum


Your review:  You suck ass mum
The model predicts the review is BAD!


Type 1 or anything:Continue and 2:Stop 2

Thanks for using the program
