In [4]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

### group members:
- thwisha nahender - tn130
- medha sreenivasan - ms1112

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import string

In [6]:
## read the dataset 
movie_data = pd.read_csv('IMDB_Dataset.csv')
movie_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
X_train, X_test, y_train, y_test = train_test_split(movie_data['review'], movie_data['sentiment'], 
                                                    test_size=0.33, random_state=42,stratify=movie_data['sentiment'])

In [8]:
## data preprocessing steps 
## 1. remove the stop words from the review 
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('br','')
    tokens = word_tokenize(text)
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

X_train_clean = X_train.apply(preprocess_text)
X_test_clean = X_test.apply(preprocess_text)

In [6]:
X_train_clean.iloc[0]

'sure making good film storyline good bit bland acting good enough though didnt understand olivia dabo pronounced australian accent character supposed raised u biggest problem however wardrobe know rule average american considered frumpy dresser selfrespecting european beyond anna colour combination positively ghastly potato sacklike sad excuse coat wore throughout film made eak hive suppose idea realistic possible many school teacher walk around prada simple doesnt mean absolute lack taste word wise'

In [9]:
from collections import Counter
cnt = Counter()
for text in X_train_clean:
    for word in text.split():
        cnt[word] += 1
cnt.most_common(10)
temp = pd.DataFrame(cnt.most_common(10))
temp.columns=['word', 'count']
temp

Unnamed: 0,word,count
0,movie,67603
1,film,61323
2,one,36064
3,like,26547
4,time,20214
5,good,19416
6,character,18831
7,story,16634
8,even,16436
9,get,16413


In [71]:
len(cnt)

104086

In [10]:
## label encode the sentiment
y_train = [1 if sentiment == 'positive' else 0 for sentiment in y_train]
y_test = [1 if sentiment == 'positive' else 0 for sentiment in y_test]

In [214]:
X_train_tokenized=X_train_clean.apply(lambda x: x.split())
X_test_tokenized=X_test_clean.apply(lambda x: x.split())

In [215]:
X_train_tokenized

44272    [sure, making, good, film, storyline, good, bi...
33427    [miniseries, iconic, australian, spirit, may, ...
36331    [see, cover, dvd, youre, convinced, class, b, ...
25718    [saw, documentary, disappointed, see, serbian,...
40140    [abysmal, indonesian, action, film, legendary,...
                               ...                        
39654    [disappointed, show, hearing, reading, hoopla,...
22917    [first, time, saw, shade, sneakpreview, hadnt,...
47481    [waste, time, danger, watch, tempted, tear, dv...
35597    [far, pathetic, movie, indian, cinema, cinema,...
27491    [movie, forever, left, impression, watched, fr...
Name: review, Length: 33500, dtype: object

In [11]:
##2. we have to create feature vectors before training
vectorizer = CountVectorizer()      
X_train_vectorized = vectorizer.fit_transform(X_train_clean)
X_test_counts = vectorizer.transform(X_test_clean)

In [12]:
X_test_counts = vectorizer.transform(X_test_clean)

In [13]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized,y_train)

In [14]:
predictions = nb_classifier.predict(X_test_counts)

In [15]:
acc_counts = sum([1 if pred == actual else 0 for pred,actual in zip(predictions,y_test)])

In [18]:
acc_counts/len(y_test)

0.8561818181818182

In [17]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

print('F1 SCORE: ',f1_score(y_test,predictions))
print('RECALL : ',recall_score(y_test,predictions))
print('PRECISION : ',precision_score(y_test,predictions))

F1 SCORE:  0.8534732942266132
RECALL :  0.8376969696969697
PRECISION :  0.8698552548772813


### qualitative analysis

In [19]:
def get_transformed_test_data(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('br','')
    tokens = word_tokenize(text)
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    clean_text = " ".join(filtered_tokens)
    vector_text = vectorizer.transform(list([clean_text]))
    return vector_text
    

In [20]:
review = 'I enjoyed the movie!'
label = nb_classifier.predict(get_transformed_test_data(review))
print(['positive' if label == 1 else 'negative'])

['positive']


In [21]:
review = 'I hated the acting and the plot!'
label = nb_classifier.predict(get_transformed_test_data(review))
print(['positive' if label == 1 else 'negative'])

['negative']


In [22]:
review = 'i did not love the vibrant colors in the movie'
label = nb_classifier.predict(get_transformed_test_data(review))
print(['positive' if label == 1 else 'negative'])

['positive']
