In [3]:
# Load Dataset
import pandas as pd
data = pd.read_csv('/content/all_kindle_review (1).csv', on_bad_lines='skip')
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [4]:
data.shape

(12000, 11)

In [5]:
df=data[['reviewText', 'rating']]
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [6]:
df.shape

(12000, 2)

In [7]:
## Missing Values
df.isnull().sum()

Unnamed: 0,0
reviewText,0
rating,0


In [8]:
df['rating'].unique()

array([3, 5, 4, 2, 1])

In [9]:
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,3000
4,3000
3,2000
2,2000
1,2000


In [10]:
## Preprocessing And Cleaning

In [11]:
## Positive review is 1 and negative review is 0
df['rating'] = df['rating'].apply(lambda x:0 if x<3 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rating'] = df['rating'].apply(lambda x:0 if x<3 else 1)


In [12]:
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
1,8000
0,4000


In [13]:
## 1. Lower All the cases
df['reviewText'] = df['reviewText'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reviewText'] = df['reviewText'].str.lower()


In [14]:
df.head()

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",1
1,great short read. i didn't want to put it dow...,1
2,i'll start by saying this is the first of four...,1
3,aggie is angela lansbury who carries pocketboo...,1
4,i did not expect this type of book to be in li...,1


In [16]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
from bs4 import BeautifulSoup

In [18]:
# Removing special characters
df['reviewText']=df['reviewText'].apply(lambda x:re.sub('[^a-z A-z 0-9-]+', '',x))

# Remove the stopswords
df['reviewText']=df['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))

# Remove url
df['reviewText']=df['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))

# Remove html tags
df['reviewText']=df['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

# Remove any additional spaces
df['reviewText']=df['reviewText'].apply(lambda x: " ".join(x.split()))

In [19]:
df.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four books wasnt expect...,1
3,aggie angela lansbury carries pocketbooks inst...,1
4,expect type book library pleased find price right,1


In [20]:
## Lemmatizer
from nltk.stem import WordNetLemmatizer

In [33]:
lemmatizer = WordNetLemmatizer()

In [34]:
def lemmatize_words(text):
  return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [35]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
df['reviewText'] = df['reviewText'].apply(lambda x:lemmatize_words(x))

In [37]:
df.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short he nothing mess man haul...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four book wasnt expecti...,1
3,aggie angela lansbury carry pocketbook instead...,1
4,expect type book library pleased find price right,1


In [38]:
df['reviewText'][0]

'jace rankin may short he nothing mess man hauled saloon undertaker know he famous bounty hunter oregon 1890s shot man saloon finished year long quest avenge sister murder trying figure next snotty-nosed farm boy rescued gang bully offer money kill man forced ranch reluctantly agrees bring man justice kill outright first need tell sister widower newskyla kyle springer bailey riding trail sleeping ground past month trying find jace want revenge man killed husband took ranch amongst crime shes keen detour jace want take realizes shes option hide behind boy persona best try keep pace confrontation along way get shot jace discovers kyles kyla come clean whole reason need scoundrel dead hope hell still help herthe book share touching moment slow-blooming romance kyla find good reason fear men hide behind boy persona watching jace slowly pull shell help conquer fear endearing pain real deeply-rooted didnt disappear face sexiness neither understandable aversion marriage magically disappear ro

In [39]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], df['rating'], test_size=0.2)

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()

X_train_bow=bow.fit_transform(X_train).toarray()

X_test_bow=bow.transform(X_test).toarray()

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train).toarray()

X_test_tfidf = tfidf.transform(X_test).toarray()


In [42]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [44]:
from sklearn.naive_bayes import GaussianNB

nb_model_bow = GaussianNB().fit(X_train_bow,y_train)
nb_model_tfidf = GaussianNB().fit(X_train_tfidf,y_train)

In [45]:
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [46]:
y_pred_bow = nb_model_bow.predict(X_test_bow)
y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)

In [47]:
confusion_matrix(y_test, y_pred_bow)

array([[526, 293],
       [706, 875]])

In [48]:
print("BOW accuracy: ",accuracy_score(y_test,y_pred_bow))

BOW accuracy:  0.58375


In [50]:
confusion_matrix(y_test, y_pred_tfidf)

array([[514, 305],
       [690, 891]])

In [51]:
print("TFIDF accuracy: ",accuracy_score(y_test,y_pred_tfidf))

TFIDF accuracy:  0.5854166666666667


In [49]:
print("confusion matrix BOW: ",classification_report(y_test,y_pred_bow))

confusion matrix BOW:                precision    recall  f1-score   support

           0       0.43      0.64      0.51       819
           1       0.75      0.55      0.64      1581

    accuracy                           0.58      2400
   macro avg       0.59      0.60      0.57      2400
weighted avg       0.64      0.58      0.59      2400



In [52]:
print("confusion matrix TFIDF: ",classification_report(y_test,y_pred_tfidf))

confusion matrix TFIDF:                precision    recall  f1-score   support

           0       0.43      0.63      0.51       819
           1       0.74      0.56      0.64      1581

    accuracy                           0.59      2400
   macro avg       0.59      0.60      0.57      2400
weighted avg       0.64      0.59      0.60      2400



In [53]:
new_review = "The story was boring and the characters were poorly developed."

In [54]:
# Preprocess if needed (e.g , remove HTML, lowercase, etc.)

#Transform the review using the fitted BOw vectorizer
new_bow = bow.transform([new_review]).toarray()

# Predict using the trained model
pred_bow = nb_model_bow.predict(new_bow)

print("BoW Prediction: ",pred_bow[0])

BoW Prediction:  0


In [55]:
new_tfidf = tfidf.transform([new_review]).toarray()

# Predict using the trained model
pred_tfidf = nb_model_tfidf.predict(new_tfidf)

print("TFIDF Prediction: ",pred_tfidf[0])

TFIDF Prediction:  0


In [56]:
def predict_review(text, model, vectorizer):
    vectorized = vectorizer.transform([text]).toarray()
    return model.predict(vectorized)[0]

# Example usage
print("BoW Prediction:", predict_review("Great Kindle read!", nb_model_bow, bow))
print("TF-IDF Prediction:", predict_review("Great Kindle read!", nb_model_tfidf, tfidf))


BoW Prediction: 0
TF-IDF Prediction: 0
