In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
df = pd.read_csv("IMDB_dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

In [5]:
df.groupby(['sentiment']).count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,25000
positive,25000


## Data Processing

In [6]:
text = df['review'][2]

In [7]:
def processing_raw_text(text):
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filter_words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    clean_sen = " ".join(filter_words)
    return clean_sen

In [8]:
filtered = processing_raw_text(text)
print(filtered)

thought wonderful way spend time hot summer weekend sitting air conditioned theater watching comedy plot simplistic dialogue witty characters likable even well bread suspected serial killer may disappointed realize match point risk addiction thought proof woody allen still fully control style many us grown br br laughed one woody comedies years dare say decade never impressed scarlet johanson managed tone sexy image jumped right average spirited young br br may crown jewel career wittier devil wears prada interesting superman great comedy go see friends


In [9]:
df['clean_review'] = df['review'].apply(processing_raw_text)

In [10]:
df

Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode ho...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movie right good job creative original...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,catholic taught parochial elementary schools n...
49998,I'm going to have to disagree with the previou...,negative,going disagree previous comment side maltin on...


In [11]:
X = df['clean_review']
y = df['sentiment']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [13]:
model = CountVectorizer()

In [14]:
X_train_bow = model.fit_transform(X_train)
X_test_bow = model.transform(X_test)

In [15]:
model_nb = MultinomialNB()

In [16]:
model_nb.fit(X_train_bow, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [17]:
clean_text = "This movie is my favourite movie"

In [18]:
count_vector = model.transform(pd.Series(clean_text))

In [19]:
model_nb.predict(count_vector)

array(['positive'], dtype='<U8')