#Text Classifier using Naive Bayes Theorem

####Name: Mayukha Thumiki
####UTA ID: 1002055616

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

from google.colab import drive
from sklearn.model_selection import train_test_split

import numpy as np
import nltk
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import gutenberg
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from typing import List, Dict, Tuple
import itertools

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import classification_report as cr
from sklearn.metrics import confusion_matrix as cm

In [2]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


###Understanding the dataset

In [3]:
movie_data = pd.read_csv('/content/gdrive/MyDrive/NBC/rt_reviews.csv', encoding='latin1')

In [4]:
movie_data.head()

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...


In [5]:
print(movie_data.shape)

(480000, 2)


In [6]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480000 entries, 0 to 479999
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Freshness  480000 non-null  object
 1   Review     480000 non-null  object
dtypes: object(2)
memory usage: 7.3+ MB


In [7]:
movie_data.isnull().any()

Freshness    False
Review       False
dtype: bool

In [8]:
movie_data.reset_index(inplace=True)
movie_data.head(10)

Unnamed: 0,index,Freshness,Review
0,0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,1,fresh,Wilfully offensive and powered by a chest-thu...
2,2,rotten,It would be difficult to imagine material mor...
3,3,rotten,Despite the gusto its star brings to the role...
4,4,rotten,If there was a good idea at the core of this ...
5,5,rotten,"Gleeson goes the Hallmark Channel route, dama..."
6,6,fresh,It was the height of satire in 1976: dark as ...
7,7,rotten,"Everyone in ""The Comedian"" deserves a better ..."
8,8,rotten,Actor encourages grumpy Christians to embrace...
9,9,fresh,"Slight, contained, but ineffably soulful."


In [9]:
classes = movie_data['Freshness'].unique()
print(classes)

['fresh' 'rotten']


In [10]:
movie_data = movie_data.replace("[^a-zA-Z]+", " ", regex=True)

###Splitting and processing the dataset

In [11]:
train_data,test_data =  train_test_split(movie_data, test_size=0.2, random_state=1)
test_data, valid_data = train_test_split(test_data, test_size=0.5, random_state=1)
print("Size of Train Data: {}".format(len(train_data.index)))
print("Size of Test Data: {}".format(len(test_data.index)))
print("Size of Validation Data: {}".format(len(valid_data.index)))

Size of Train Data: 384000
Size of Test Data: 48000
Size of Validation Data: 48000


In [12]:
train_data['Freshness'].value_counts()

fresh     192124
rotten    191876
Name: Freshness, dtype: int64

In [13]:
#For Lemmatization
import nltk
nltk.download('punkt')
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [14]:
def get_frequency(df, Type="total", voclist=None) -> Tuple[Dict[str, int], FreqDist]:
    if Type != "total":
        df = df[df["Freshness"] == Type]

    reviews = df["Review"].tolist()
    words = list(itertools.chain.from_iterable([review.split() for review in reviews]))

    # Lemmatize the words using WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha())

    # Create a frequency distribution of the lemmas
    freq_dist = FreqDist(lemmas)

    return dict(freq_dist), freq_dist

In [15]:
train_vocdic, train_freq_dist=get_frequency(train_data)

In [16]:
print("Number of unique words in all the sentences after lemmatization:",len(train_freq_dist))

Number of unique words in all the sentences after lemmatization: 71716


In [17]:
all_train_freq_dist = train_freq_dist.items() #For probability calculation after smoothing i.e. before the removal of stopwords
all_train_freq_list = list(train_freq_dist.items())
all_train_vocdic = train_vocdic

In [18]:
train_freq_dist_list = list(train_freq_dist.items()) #For likelihood probability
print(train_freq_dist_list)



In [39]:
#Removal of stopwords
# Download stopwords
nltk.download('stopwords')

def stop_words_removal(freq_dist_list, vocdic):
  stop_words = stopwords.words('english')
  freq_dist_list_copy = freq_dist_list.copy() # create a copy of freq_dist_list so that you can modify it while iterating over it
  for sublist in freq_dist_list_copy:
    if sublist[0] in stop_words:
      freq_dist_list.remove(sublist)

  for sw in stop_words:
    if sw in vocdic:
      vocdic.pop(sw)
  return freq_dist_list, vocdic

train_freq_dist_list, train_vocdic = stop_words_removal(train_freq_dist_list, train_vocdic)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
train_freq_dist_list[:10]

[('featured', 89),
 ('cast', 6677),
 ('woodley', 111),
 ('really', 7865),
 ('achieves', 452),
 ('lasting', 294),
 ('impact', 1009),
 ('episode', 1059),
 ('although', 2545),
 ('ham', 297)]

In [45]:
print('a' in train_vocdic.keys())
print('marshall' in train_vocdic.keys())

False
True


In [22]:
train_vocabulary=[]
for lemma, count in train_freq_dist_list:
    train_vocabulary.append(lemma)
print(train_vocabulary)



In [23]:
#marginal probability:
train_wfreqdf_total=pd.DataFrame(train_freq_dist_list,columns=["Vocabulary", "Total_Frequency"])
train_wfreqdf_total.head(10)

Unnamed: 0,Vocabulary,Total_Frequency
0,featured,89
1,cast,6677
2,woodley,111
3,really,7865
4,achieves,452
5,lasting,294
6,impact,1009
7,episode,1059
8,although,2545
9,ham,297


In [24]:
sum_of_values = sum(train_vocdic.values())
print(sum_of_values)

4821291


In [25]:
st=True
marginal_prob = (
    train_wfreqdf_total.set_index("Vocabulary")
    .fillna(0) if st else train_wfreqdf_total.set_index("Vocabulary")
)

marginal_prob = marginal_prob.apply(lambda x: x/sum_of_values, axis=1)
marginal_prob

Unnamed: 0_level_0,Total_Frequency
Vocabulary,Unnamed: 1_level_1
featured,1.845979e-05
cast,1.384899e-03
woodley,2.302288e-05
really,1.631306e-03
achieves,9.375082e-05
...,...
kingmaking,2.074133e-07
remolding,2.074133e-07
bagging,2.074133e-07
vaxxers,2.074133e-07


In [26]:
#Likelihod probability calculation
train_wfreqdf=pd.DataFrame(train_vocabulary,columns=["Vocabulary"])
train_wfreqdf.head(10)

Unnamed: 0,Vocabulary
0,featured
1,cast
2,woodley
3,really
4,achieves
5,lasting
6,impact
7,episode
8,although
9,ham


In [27]:
freshness_count = train_data['Freshness'].value_counts()
print("Fresh : {}".format(freshness_count.values[0]), "\tRotten : {}".format(freshness_count.values[1]))

for freshness in freshness_count.index:
    print("\nFreshness: {}".format(freshness))
    vocdic_type, freq_cnt_type=get_frequency(train_data, freshness, voclist = train_vocabulary)
    #vocdic_type, freq_cnt = get_frequency(train_data, freshness, voclist=train_vocabulary)
    freq_dist_list=list(freq_cnt_type.items())
    freq_cnt_type, vocdic_type = stop_words_removal(freq_dist_list, vocdic_type)
    temp = [(lemma, freshness) for lemma, count in freq_cnt_type]
    voctypedf=pd.DataFrame(vocdic_type.items(),columns=["Vocabulary",freshness])
    train_wfreqdf=train_wfreqdf.merge(voctypedf,how="left",on="Vocabulary")

print(len(train_wfreqdf))
train_wfreqdf.head(20)

Fresh : 192124 	Rotten : 191876

Freshness: fresh

Freshness: rotten
71567


Unnamed: 0,Vocabulary,fresh,rotten
0,featured,37.0,52.0
1,cast,3665.0,3012.0
2,woodley,62.0,49.0
3,really,3151.0,4714.0
4,achieves,293.0,159.0
5,lasting,142.0,152.0
6,impact,532.0,477.0
7,episode,317.0,742.0
8,although,1349.0,1196.0
9,ham,43.0,254.0


In [28]:
st=True
likelihoodprob = (
    train_wfreqdf.set_index("Vocabulary")
    .fillna(0) if st else train_wfreqdf.set_index("Vocabulary")
)
likelihoodprob

Unnamed: 0_level_0,fresh,rotten
Vocabulary,Unnamed: 1_level_1,Unnamed: 2_level_1
featured,37.0,52.0
cast,3665.0,3012.0
woodley,62.0,49.0
really,3151.0,4714.0
achieves,293.0,159.0
...,...,...
kingmaking,1.0,0.0
remolding,0.0,1.0
bagging,0.0,1.0
vaxxers,1.0,0.0


In [29]:
sum_fresh = likelihoodprob['fresh'].sum()
sum_rotten = likelihoodprob['rotten'].sum()
print('Sum of fresh : {}'.format(sum_fresh))
print('Sum of rotten : {}'.format(sum_rotten))

Sum of fresh : 2453902.0
Sum of rotten : 2367389.0


In [30]:
if 'fresh' in likelihoodprob.columns:
  likelihoodprob = likelihoodprob.apply(lambda x: x/sum_fresh, axis=1)
else:
  likelihoodprob = likelihoodprob.apply(lambda x: x/sum_rotten, axis=1)
likelihoodprob

Unnamed: 0_level_0,fresh,rotten
Vocabulary,Unnamed: 1_level_1,Unnamed: 2_level_1
featured,1.507803e-05,2.119074e-05
cast,1.493540e-03,1.227433e-03
woodley,2.526588e-05,1.996820e-05
really,1.284077e-03,1.921022e-03
achieves,1.194017e-04,6.479476e-05
...,...,...
kingmaking,4.075142e-07,0.000000e+00
remolding,0.000000e+00,4.075142e-07
bagging,0.000000e+00,4.075142e-07
vaxxers,4.075142e-07,0.000000e+00


###Testing the algorithm on sample user given input data

In [31]:
review = "good one"
words=review.split()
print("List of words in review : {}".format(list(words)))

prob_fresh=1
prob_rotten=1
prior_type = train_data['Freshness'].value_counts(normalize=True).to_frame().T
p_of_fresh=prior_type.iloc[0, 0]
p_of_rotten=prior_type.iloc[0, 1]

for wd in words:
  element_fresh = likelihoodprob.loc[wd, "fresh"]
  prob_fresh*=element_fresh
  element_rotten = likelihoodprob.loc[wd, "rotten"]
  prob_rotten*=element_rotten

final_fresh_prob=prob_fresh*p_of_fresh
final_rotten_prob=prob_rotten*p_of_rotten
print("Prob(words in review|fresh) = {}".format(final_fresh_prob), "\tProb(words in review|rotten) = {}".format(final_rotten_prob))

sol=[]
if final_fresh_prob > final_rotten_prob:
  sol.append('fresh')
else:
  sol.append('rotten')
print("Prediction : {}".format(sol))

List of words in review : ['good', 'one']
Prob(words in review|fresh) = 1.2742452795650694e-05 	Prob(words in review|rotten) = 9.99335060352259e-06
Prediction : ['fresh']


###Testing using validation data

In [None]:
valid_prediction = []

for s in valid_data['Review']:
    words = s.split()
    fresh_prob = 1.0
    rotten_prob = 1.0

    for w in words:
        if w in train_vocabulary:
            fresh_prob *= likelihoodprob.loc[w, 'fresh']
            rotten_prob *= likelihoodprob.loc[w, 'rotten']
    fresh_prob *= prior_type.loc['Freshness', 'fresh']
    rotten_prob *= prior_type.loc['Freshness', 'rotten']
    if fresh_prob > rotten_prob:
        valid_prediction.append('fresh')
    else:
        valid_prediction.append('rotten')

print('Generated Accuracy for Valid Data: {}'.format(accuracy_score(valid_data["Freshness"], valid_prediction)))

###Testing using est data

In [None]:
test_prediction = []

for s in test_data['Review']:
    words = s.split()
    fresh_prob = 1.0
    rotten_prob = 1.0

    for w in words:
        if w in train_vocabulary:
            fresh_prob *= likelihoodprob.loc[w, 'fresh']
            rotten_prob *= likelihoodprob.loc[w, 'rotten']
    fresh_prob *= prior_type.loc['Freshness', 'fresh']
    rotten_prob *= prior_type.loc['Freshness', 'rotten']
    if fresh_prob > rotten_prob:
        test_prediction.append('fresh')
    else:
        test_prediction.append('rotten')

print('Generated Accuracy for Test Data: {}'.format(accuracy_score(test_data["Freshness"], test_prediction)))

###Top 10 words that predicted each class

In [32]:
# Top 10 words under "fresh"
top_fresh = likelihoodprob.nlargest(10, "fresh").index.tolist()

# Top 10 words under "rotten"
top_rotten = likelihoodprob.nlargest(10, "rotten").index.tolist()

print("The Top 10 words that predict fresh : {}".format(top_fresh))
print("The Top 10 words that predict rotten : {}".format(top_rotten))

The Top 10 words that predict fresh : ['film', 'movie', 'one', 'ha', 'story', 'like', 'make', 'performance', 'time', 'character']
The Top 10 words that predict rotten : ['film', 'movie', 'like', 'one', 'ha', 'much', 'story', 'make', 'character', 'feel']


###Repeating the above steps by performing Laplacian smoothing

In [46]:
'''all_train_freq_dist = train_freq_dist.items() #For probability calculation after smoothing i.e. before the removal of stopwords
all_train_freq_list = list(train_freq_dist.items())
all_train_vocdic = train_vocdic'''
sm_train_vocabulary=[]
for lemma, count in all_train_freq_list:
    sm_train_vocabulary.append(lemma)
print(sm_train_vocabulary)



In [52]:
sm_train_wfreqdf=pd.DataFrame(sm_train_vocabulary,columns=["Vocabulary"])
sm_train_wfreqdf.head(10)

Unnamed: 0,Vocabulary
0,of
1,the
2,featured
3,cast
4,only
5,woodley
6,really
7,achieves
8,any
9,lasting


In [54]:
freshness_count = train_data['Freshness'].value_counts()
print("Fresh : {}".format(freshness_count.values[0]), "\tRotten : {}".format(freshness_count.values[1]))

for freshness in freshness_count.index:
    print("\nFreshness: {}".format(freshness))
    vocdic_type, freq_cnt_type=get_frequency(train_data, freshness, voclist = sm_train_vocabulary)

    freq_dist_list=list(freq_cnt_type.items())
    #freq_cnt_type, vocdic_type = stop_words_removal(freq_dist_list, vocdic_type)
    temp = [(lemma, freshness) for lemma, count in freq_dist_list]
    voctypedf=pd.DataFrame(vocdic_type.items(),columns=["Vocabulary",freshness])
    sm_train_wfreqdf=sm_train_wfreqdf.merge(voctypedf,how="left",on="Vocabulary")

print(len(sm_train_wfreqdf))
sm_train_wfreqdf.head(20)

Fresh : 192124 	Rotten : 191876

Freshness: fresh

Freshness: rotten
71716


Unnamed: 0,Vocabulary,fresh,rotten
0,of,133281.0,116268.0
1,the,220965.0,218082.0
2,featured,37.0,52.0
3,cast,3665.0,3012.0
4,only,4414.0,8096.0
5,woodley,62.0,49.0
6,really,3151.0,4714.0
7,achieves,293.0,159.0
8,any,3739.0,6063.0
9,lasting,142.0,152.0


In [59]:
st=True
sm_likelihoodprob = (
    sm_train_wfreqdf.set_index("Vocabulary")
    .fillna(0) 
    if st else sm_train_wfreqdf.set_index("Vocabulary")
)
sm_likelihoodprob

KeyError: ignored