# SETUP
---

## import packages

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter as cntr

## read data file and create data frame

In [2]:
imdb_raw = pd.read_csv('train.txt', delimiter= '\t', header=None)
imdb_raw.columns = ['comment', 'sentiment']

## inspect data

In [3]:
imdb_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
comment      1000 non-null object
sentiment    1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


#### data frame info
 - 1000 rows
 - 2 columns (1 object, 1 integer)
 - 0 NULL records

In [4]:
print(imdb_raw.sentiment.nunique(), imdb_raw.sentiment.unique())
print(imdb_raw.comment.nunique())

2 [0 1]
997


#### data frame num unique
 - sentiment has 2 unique values [0, 1]
 - comment has 997 unique values (3 redundant)

In [5]:
imdb_raw.head()

Unnamed: 0,comment,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


#### data frame head
 - comment column appears to be String object, containing user reviews of movies in text
 - sentiment columns contains either 0 or 1, representing negative or positive sentiment, respectively

# DATA CLEANING
---

## clean comment variable
 - strip non alpha-numeric characters and space
 - strip leading and trailing spaces
 - replace double space with single space

In [6]:
#imdb_raw.comment = imdb_raw.comment.str.replace('"','')

# strip non alpha-numeric characters
imdb_raw.comment = imdb_raw.comment.str.replace('[^a-zA-Z\s0-9\'\/]', ' ')

# strip leading and trailing spaces
imdb_raw.comment = imdb_raw.comment.str.strip()

# replace double space with single space
imdb_raw.comment = imdb_raw.comment.str.replace('  ',' ')

# replace ' out of 10' with '/10' to standardize ratings
imdb_raw.comment = imdb_raw.comment.str.replace(' out of 10','/10')

print('--- Comment variable cleaning completed ---')

--- Comment variable cleaning completed ---


# FEATURE ENGINEERING
---

## create word lists, split by sentiment

In [7]:
# initialize empty lists
good_sentiment_words = []
bad_sentiment_words = []

# iterate through every row of the data frame
for row in range(len(imdb_raw.comment)):
    # iterate through every word in each row for the comment variable
    for word in imdb_raw.comment[row].split(' '):
        # filter by word length [3,12)
        if len(word) in range(3,12):
            # assign to proper sentiment list
            if imdb_raw.sentiment[row]:
                good_sentiment_words.append(word.upper())
            else:
                bad_sentiment_words.append(word.upper())

print('--- Word list creation completed ---')
print('')
print(list(set(good_sentiment_words)))
print('')
print(list(set(bad_sentiment_words)))

--- Word list creation completed ---

['TONS', 'PERFORMANCE', 'DISCOVERING', 'AAILIYAH', 'SISTERS', 'SOCIAL', 'SHOT', 'LEAVE', 'RUN', 'RIGHT', 'SEEING\x85', 'SIMPLY', 'THOUGH', "AMERICA'S", 'CLOSED', 'MARTIN', 'RECURRING', 'ELIAS', 'MOVIE', 'FLOWED', 'LEAVES', 'LAYERS', 'COLORFUL', 'ENTERTAINED', 'CENTURY', 'QUITE', 'WHERE', 'VALUES', 'HAVING', 'SMITH', 'WORTHY', 'SOUND', 'DEEPLY', 'RECOMMENDED', 'PRELUDE', 'WASTED', 'PUZZLE', 'BOTHERSOME', 'CAILLES', 'FIELDS', 'FRENCH', 'SIMMERING', 'BELIEVE', 'REALITY', "HOFFMAN'S", 'PARTS', 'HEELS', 'BIT', 'ORIENTED', 'SHARING', "BAILEY'S", 'BALANCE', 'TRY', 'HARD', 'UNTIL', 'TRUMBULL', 'PRESENTS', 'PART', 'TRIUMPHED', 'EVIL', 'STARLET', 'BEAR', "'TITTA", 'THEATRES', 'ENCHANTING', 'MANAGES', 'LIEUTENANT', 'TOTALLY', 'EVER', 'RACE', 'TELLY', 'EXTREMELY', 'CHARLES', 'GOVERNMENT', 'OUR', 'ACHILLE', 'THOROUGHLY', 'LIGHT', 'WHEN', 'SISTER', 'NOTEWORTHY', 'GERARDO', 'REPAIR', 'PROBLEMS', 'BLACK', 'EUROPEAN', 'FLORIDA', 'OLIVIA', 'ASPECT', 'MATRIX', 'JUNE'

## create neutral word list

In [8]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

def diff_list(first, second):
        second = set(second)
        return [item for item in first if item not in second]

neutral_sentiment_words = [pair[0] for pair in cntr(intersection(good_sentiment_words, bad_sentiment_words)).most_common(30)]
#print(neutral_sentiment_words)
neutral_sentiment_words = diff_list(neutral_sentiment_words,['GOOD','GREAT','VERY','ALL','ONE','BUT','NOT'])
print('--- Neutral word list creation completed ---')
print('')
print(neutral_sentiment_words)

--- Neutral word list creation completed ---

['THE', 'AND', 'THIS', 'FILM', 'MOVIE', 'WAS', 'THAT', 'WITH', 'FOR', 'ARE', "IT'S", 'YOU', 'HIS', 'FROM', 'WELL', 'ABOUT', 'HAS', 'CHARACTERS', 'JUST', 'OUT', 'WHO', 'MORE', 'HAVE']


## create word-pair lists, split by sentiment

In [9]:
from itertools import tee#, izip

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

# initialize empty lists
good_sentiment_word_pairs = []
bad_sentiment_word_pairs = []

# iterate through every row of the data frame
for row in range(len(imdb_raw.comment)):
    word_pair_list = list(pairwise(imdb_raw.comment[row].split()))
    #print(word_pair_list)
    #print('---')
    tmp = []
    for word_pair in word_pair_list:
        tmp.append((' '.join(word_pair)).upper())
    if imdb_raw.sentiment[row]:
        good_sentiment_word_pairs += tmp
    else:
        bad_sentiment_word_pairs += tmp

print('--- Word-pair list creation completed ---')
print('')
print(list(set(good_sentiment_word_pairs)))
print('')
print(list(set(bad_sentiment_word_pairs)))

--- Word-pair list creation completed ---

['IT PLAY', 'SHOT IN', 'PRODUCERS SCOT', 'CASTING THE', 'LIES A', 'A CAMERA', 'CHARACTER AGE', 'LEAVES ROOM', 'BOMBARDMENTS OF', 'EACH DO', 'THROUGH THE', 'FILM IT', 'WHERE SOMETHING', 'VERY STRONG', 'STUNNING TO', 'BERTOLUCCI BUT', 'BRILLIANT TWIST', 'SUBTLE REFERENCES', 'ARE TREMENDOUS', 'ACTING HELPS', 'UNDERSTAND WHERE', 'GRIMES AND', 'LOVE THAT', 'THAT SHOWS', 'THE SCREAM', 'A LESSER', 'HER ROOM', 'THAT IT', 'THEMSELVES NO', 'WAS SUCH', 'IS PLAYED', 'AND YOU', 'NOT BE', 'TIME WITH', 'CHOICES WITH', 'PRESENCE SHINED', 'SENTIMENT ACTUALLY', 'MICKEY MOUSE', 'WHITE AND', 'THAT HER', 'SONGS EVER', 'THOUGHT ERROL', 'ONE GALLEY', 'FI FILM', 'GETS NAKED', 'WHOLESOME APPEARANCE', 'WORD OF', 'MOVIE AS', 'AN ART', "WOMEN THERE'S", 'RUNNING THROUGH', 'EFFECTS AND', 'YET I', 'UNDERAPPRECIATED BRIAN', "I CAN'T", 'KNOWS WHAT', 'OUT SURPRISINGLY', 'THE PERFORMANCES', 'TO MONSTROUS', 'BEST SCORE', 'HAD FOR', 'ANGLE AND', 'WORK OF', 'ISSUES AT', 'ACTORS WH

## combine word and word-pair lists

In [10]:
keywords_s = list(set(good_sentiment_words)) + list(set(bad_sentiment_words))
keywords = keywords_s
keywords += list(set(good_sentiment_word_pairs)) + list(set(bad_sentiment_word_pairs))
print('--- Combine word lists completed ---')
print('')
print(keywords_s)

--- Combine word lists completed ---



## add keywords to data frame as feature columns

In [11]:
for key in keywords:   
    try:
        imdb_raw[str(key)] = imdb_raw.comment.str.contains(str(key), case=False)
    except:
        print(key)

print('--- Initial feature creation completed ---')

--- Initial feature creation completed ---


### additional feature BRILL
 - combines 'BRILLIANT' and 'BRILLIANCE' to strengthen positive connotation

In [12]:
contains_list = ['BRILLIANT','BRILLIANCE']
imdb_raw['BRILL'] = imdb_raw.comment.str.contains('|'.join(contains_list),case=False)

print('--- Additional feature BRILL added --')

--- Additional feature BRILL added --


### additional feature GOOD_RATING
 - combines ratings 8/10, 9/10, 10/10 to strengthen positive connotation

In [13]:
contains_list = ['10/10','9/10', '8/10']
imdb_raw['GOOD_RATING'] = imdb_raw.comment.str.contains('|'.join(contains_list), case=False)

print('--- Additional feature GOOD_RATING added --')

--- Additional feature GOOD_RATING added --


In [14]:
imdb_raw.head()

Unnamed: 0,comment,sentiment,TONS,PERFORMANCE,DISCOVERING,AAILIYAH,SISTERS,SOCIAL,SHOT,LEAVE,...,A TERRIBLE,GET THAT,BY PAUL,BAKERY ASSISTANT,THAT AS,IN SUCH,AND CERTAINLY,THING YOU,BRILL,GOOD_RATING
0,A very very very slow moving aimless movie abo...,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Not sure who was more lost the flat character...,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Attempting artiness with black white and clev...,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Very little music or anything to speak of,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The best scene in the movie was when Gerardo i...,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
#imdb_raw.corrwith(imdb_raw.sentiment).sort_values()

### find least correlating features

#### correlate each feature with sentiment

In [16]:
# correlate features with sentiment, convert result to data frame and reset index
sentiment_corr = imdb_raw.corrwith(imdb_raw.sentiment).sort_values().to_frame().reset_index()

# add column names
sentiment_corr.columns = ['feature', 'correlation']

# view new data frame
sentiment_corr

Unnamed: 0,feature,correlation
0,BAD,-0.223922
1,STUPID,-0.123404
2,WASTE,-0.122713
3,EVEN,-0.119648
4,AWFUL,-0.119159
5,WOULD,-0.117224
6,PLOT,-0.109109
7,NOT,-0.108625
8,WORST,-0.105463
9,DIDN'T,-0.104614


#### filter correlation data frame  to least correlating features

In [17]:
low_corr_features = sentiment_corr[(sentiment_corr['correlation'] < 0.001) & (sentiment_corr['correlation'] > -0.001)]
low_corr_features

Unnamed: 0,feature,correlation
5462,FILM AND,-1.032130e-16
5463,FEEL,-1.001645e-16
5464,AND THAT,-9.092890e-17
5465,GOT,-6.803623e-17
5466,LEVEL,-6.792712e-17
5467,LIKE A,-5.575191e-17
5468,MOMENT,-4.384714e-17
5469,I HAVE,-2.929338e-17
5470,SCENE,-2.817994e-17
5471,ARE A,-2.740348e-17


# EXECUTE NAIVE BAYES

## generate data and target inputs to the model

In [29]:
# save each iteration of work into a separate variable
data1 = imdb_raw[keywords_s]
data2 = imdb_raw[keywords]
data3 = imdb_raw[keywords_s + ['BRILL'] + ['GOOD_RATING']] #4.5% inaccuracy
data4 = imdb_raw[diff_list(keywords_s + ['BRILL'] + ['GOOD_RATING'],neutral_sentiment_words)] # 3.6% inaccuracy
data5 = imdb_raw[diff_list(keywords_s + ['BRILL'] + ['GOOD_RATING'],low_corr_features.feature.to_list() + neutral_sentiment_words)] # 3.6% inaccuracy

# collect iterations of work into a list to iterate through
data_list = [data1, data2, data3, data4, data5]

# define target variable
target = imdb_raw['sentiment']

## execute Bernoulli NB model

In [30]:
# import BernoulliNB
from sklearn.naive_bayes import BernoulliNB

# iterate through data iterations
for data in data_list:
    nb = BernoulliNB() # Instantiate our model and store it in a new variable.
    nb.fit(data, target) # Fit our model to the data.
    y_pred = nb.predict(data) # Classify, storing the result in a new variable.
    print(f'[BernoulliNB] mislabelled:\t {(target != y_pred).sum()}/{data.shape[0]}; {(target != y_pred).sum()/target.count()*100:.4f}%')

[BernoulliNB] mislabelled:	 53/1000; 5.3000%
[BernoulliNB] mislabelled:	 53/1000; 5.3000%
[BernoulliNB] mislabelled:	 45/1000; 4.5000%
[BernoulliNB] mislabelled:	 36/1000; 3.6000%
[BernoulliNB] mislabelled:	 36/1000; 3.6000%


## execute Gaussian NB model

In [31]:
# import GaussianNB
from sklearn.naive_bayes import GaussianNB

# iterate through data iterations
for data in data_list:
    nb = GaussianNB() # Instantiate our model and store it in a new variable.
    nb.fit(data, target) # Fit our model to the data.
    y_pred = nb.predict(data) # Classify, storing the result in a new variable.
    print(f'[GaussianNB] mislabelled:\t {(target != y_pred).sum()}/{data.shape[0]}; {(target != y_pred).sum()/target.count()*100:.4f}%')

[GaussianNB] mislabelled:	 1/1000; 0.1000%
[GaussianNB] mislabelled:	 1/1000; 0.1000%
[GaussianNB] mislabelled:	 1/1000; 0.1000%
[GaussianNB] mislabelled:	 1/1000; 0.1000%
[GaussianNB] mislabelled:	 1/1000; 0.1000%


## execute Multinomial NB model

In [32]:
# import MultinomialNB
from sklearn.naive_bayes import MultinomialNB

# iterate through data iterations
for data in data_list:
    nb = MultinomialNB() # Instantiate our model and store it in a new variable.
    nb.fit(data, target) # Fit our model to the data.
    y_pred = nb.predict(data) # Classify, storing the result in a new variable.
    print(f'[MultinomialNB] mislabelled:\t {(target != y_pred).sum()}/{data.shape[0]}; {(target != y_pred).sum()/target.count()*100:.4f}%')

[MultinomialNB] mislabelled:	 21/1000; 2.1000%
[MultinomialNB] mislabelled:	 21/1000; 2.1000%
[MultinomialNB] mislabelled:	 21/1000; 2.1000%
[MultinomialNB] mislabelled:	 19/1000; 1.9000%
[MultinomialNB] mislabelled:	 20/1000; 2.0000%


## execute Complement NB model

In [33]:
# import ComplementNB
from sklearn.naive_bayes import ComplementNB

# iterate through data iterations
for data in data_list:
    nb = ComplementNB() # Instantiate our model and store it in a new variable.
    nb.fit(data, target) # Fit our model to the data.
    y_pred = nb.predict(data) # Classify, storing the result in a new variable.
    print(f'[ComplementNB] mislabelled:\t {(target != y_pred).sum()}/{data.shape[0]}; {(target != y_pred).sum()/target.count()*100:.4f}%')

[ComplementNB] mislabelled:	 21/1000; 2.1000%
[ComplementNB] mislabelled:	 21/1000; 2.1000%
[ComplementNB] mislabelled:	 21/1000; 2.1000%
[ComplementNB] mislabelled:	 19/1000; 1.9000%
[ComplementNB] mislabelled:	 20/1000; 2.0000%


## view Mislabelled records

### Trained=1 (good sentiment), Predicted=0 (bad sentiment)

In [23]:
print('Trained Good, Predicted Bad')
print('count\tindex\ttrain\tpred\tcomment')
print('------------------------------------')
cnt = 0
for i in range(len(target)):
    if target[i] != y_pred[i]:
        if target[i]:
            cnt += 1
            print(cnt,'\t', i,'\t', target[i],'\t', y_pred[i],'\t', imdb_raw.comment[i])

Trained Good, Predicted Bad
count	index	train	pred	comment
------------------------------------
1 	 31 	 1 	 0 	 Waste your money on this game
2 	 32 	 1 	 0 	 This is the kind of money that is wasted properly
3 	 50 	 1 	 0 	 I'm glad the film didn't go for the most obvious choice as a lesser film certainly would have
4 	 57 	 1 	 0 	 Totally believable
5 	 100 	 1 	 0 	 I don't think you will be disappointed
6 	 279 	 1 	 0 	 There is a lot of beautiful places
7 	 296 	 1 	 0 	 Three of the most visually appealing movies i've ever seen
8 	 443 	 1 	 0 	 The last 15 minutes of movie are also not bad as well
9 	 492 	 1 	 0 	 But this movie really got to me
10 	 619 	 1 	 0 	 Don't miss it
11 	 640 	 1 	 0 	 But it is entertaining nonetheless
12 	 751 	 1 	 0 	 But I thought his acting was skilled
13 	 794 	 1 	 0 	 Predictable but not a bad watch
14 	 989 	 1 	 0 	 Anyway the plot flowed smoothly and the male bonding scenes were a hoot


### Trained=0 (bad sentiment), Predicted=1 (good sentiment)

In [24]:
print('Trained Bad, Predicted Good')
print('count\tindex\ttrain\tpred\comment')
print('------------------------------------')
cnt = 0
for i in range(len(target)):
    if target[i] != y_pred[i]:
        if not target[i]:
            cnt += 1
            print(cnt,'\t', i,'\t', target[i],'\t', y_pred[i],'\t', imdb_raw.comment[i])

Trained Bad, Predicted Good
count	index	train	pred\comment
------------------------------------
1 	 26 	 0 	 1 	 Graphics is far from the best part of the game
2 	 133 	 0 	 1 	 All in all a great disappointment
3 	 185 	 0 	 1 	 Highly unrecommended
4 	 199 	 0 	 1 	 The film is way too long
5 	 222 	 0 	 1 	 Mishima is extremely uninteresting
6 	 602 	 0 	 1 	 I saw this movie and I thought this is a stupid movie


## view all records

In [25]:
print('count\tindex\ttrain\tpred\tcomment')
print('------------------------------------')
cnt = 0
for i in range(len(target)):
    print(cnt,'\t', i,'\t', target[i],'\t', y_pred[i],'\t', imdb_raw.comment[i])

count	index	train	pred	comment
------------------------------------
0 	 0 	 0 	 0 	 A very very very slow moving aimless movie about a distressed drifting young man
0 	 1 	 0 	 0 	 Not sure who was more lost  the flat characters or the audience nearly half of whom walked out
0 	 2 	 0 	 0 	 Attempting artiness with black  white and clever camera angles the movie disappointed  became even more ridiculous  as the acting was poor and the plot and lines almost non existent
0 	 3 	 0 	 0 	 Very little music or anything to speak of
0 	 4 	 1 	 1 	 The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head
0 	 5 	 0 	 0 	 The rest of the movie lacks art charm meaning  If it's about emptiness it works I guess because it's empty
0 	 6 	 0 	 0 	 Wasted two hours
0 	 7 	 1 	 1 	 Saw the movie today and thought it was a good effort good messages for kids
0 	 8 	 0 	 0 	 A bit predictable
0 	 9 	 1 	 1 	 Loved the casting of Jimmy Buffet as the science

0 	 233 	 0 	 0 	 Woa talk about awful
0 	 234 	 0 	 0 	 Do not waste your time
0 	 235 	 0 	 0 	 It was just too horrible
0 	 236 	 0 	 0 	 The worst sappiest dialogue  I could go on and on
0 	 237 	 0 	 0 	 But what really made it unwatchable was the direction
0 	 238 	 0 	 0 	 The poor actors
0 	 239 	 0 	 0 	 You can't even tell if they have any talent because they not only have pathetic lines to speak but the director gave them no action
0 	 240 	 0 	 0 	 If you check the director's filmography on this site you will see why this film didn't have a chance
0 	 241 	 0 	 0 	 This would not even be good as a made for TV flick
0 	 242 	 1 	 1 	 If good intentions made a film great then this film might be one of the greatest films ever made
0 	 243 	 1 	 1 	 The film has great actors a master director a significant theme at least a would be significant theme undertone of fifties existential world weariness aerial scenes that ought to have thrilled both senses and imagination and charact

0 	 344 	 1 	 1 	 is pretty funny
0 	 345 	 1 	 1 	 Babie Bop is very cute
0 	 346 	 1 	 1 	 The kids are very cool too
0 	 347 	 0 	 0 	 But Storm Trooper is not even bad enough to make it to the list of wonderfully terrible movies
0 	 348 	 0 	 0 	 It's just lame
0 	 349 	 0 	 0 	 The guy who said he's had better dialogue with his potted plants has it right
0 	 350 	 0 	 0 	 Everything about this movie is stupid
0 	 351 	 0 	 0 	 Even if you love bad movies do not watch this movie
0 	 352 	 0 	 0 	 It is shameful
0 	 353 	 1 	 1 	 I would give this television series a 10 plus if i could
0 	 354 	 1 	 1 	 The writers were smack on and I think the best actors and actresses were a bonus to the show These characters were so real
0 	 355 	 1 	 1 	 I must say I have taped most of the episodes and i find myself watching them over and over again
0 	 356 	 1 	 1 	 Now you know why I gave it a 10
0 	 357 	 1 	 1 	 I don't know exactly what it is about this movie but we latched on to this endea

0 	 566 	 0 	 0 	 The visual effects were AWFUL
0 	 567 	 0 	 0 	 The CG opening sequence in space looked like it could have been created on Microsoft Slideshow for God's sake
0 	 568 	 0 	 0 	 The explosion of the Gas tanks at the end was just as awful
0 	 569 	 1 	 1 	 Okay I like to consider myself a fair critic so I'll give credit where credit's due the creature effects were actually pretty cool
0 	 570 	 1 	 1 	 Gotta love those close ups of slimy drooling teeth
0 	 571 	 0 	 0 	 To sum the film up Breeders is a terrible cheaply made horror movie that should be avoided like the Ebola virus
0 	 572 	 0 	 0 	 Not recommended
0 	 573 	 0 	 0 	 He is almost unbearable to watch on screen he has little to no charisma and terrible comedic timing
0 	 574 	 0 	 0 	 Aside from it's terrible lead this film has loads of other debits
0 	 575 	 0 	 0 	 I understand that it's supposed to be a cheap popcorn comedy but that doesn't mean that it has to completely insult our intelligence and have wr

0 	 677 	 1 	 1 	 GO AND SEE IT
0 	 678 	 1 	 1 	 This is an excellent film
0 	 679 	 1 	 1 	 The aerial scenes were well done
0 	 680 	 1 	 1 	 It was also the right balance of war and love
0 	 681 	 1 	 1 	 The film gives meaning to the phrase Never in the history of human conflict has so much been owed by so many to so few
0 	 682 	 1 	 1 	 Kris Kristoffersen is good in this movie and really makes a difference
0 	 683 	 1 	 1 	 A good film by a great director
0 	 684 	 1 	 1 	 This is definitely one of the better documentaries I have seen looking at family relationships and marriage
0 	 685 	 0 	 0 	 There are many continuity errors one other user commented on different cars in the garage Joe's glasses  the one that got to me the most was the fact Joe's facial hair configuration seemed to change from scene to scene
0 	 686 	 1 	 1 	 This is just a great movie
0 	 687 	 1 	 1 	 10/10 stars
0 	 688 	 0 	 0 	 I can't see how this movie can be an inspiration to anyone to come out or ove

0 	 955 	 0 	 0 	 The movie seemed a little slow at first
0 	 956 	 1 	 1 	 But it picked up speed and got right to the point
0 	 957 	 1 	 1 	 It showed exactly how the government and the scientist argued for humanity and the reasons of the gadget
0 	 958 	 1 	 1 	 I enjoyed it
0 	 959 	 1 	 1 	 I have recommended it to friends
0 	 960 	 1 	 1 	 I was particularly pleased with the acting ability of Dwight Schultz
0 	 961 	 1 	 1 	 Both actors truly understand and become their particular character delivering a convincing sincere performance
0 	 962 	 1 	 1 	 Their on screen chemistry critical to the entire film is genuine
0 	 963 	 1 	 1 	 The film's dialogue is natural real to life
0 	 964 	 1 	 1 	 The writer Gorman Bechard undoubtedly did his homework because all references are industry and character age appropriate
0 	 965 	 1 	 1 	 The incredible soundtrack truly captures the essence of the film
0 	 966 	 1 	 1 	 Each track commands sentiment actually contributing to the scenes an

# Summary
---

- Given the same features, the `Gaussian Naive Bayes` classifier appears to the be the _most accurate_ at 99.9% correct prediction, more so than Complement, MultiNomial, and Bernoulli classifiers.
- Furthermore, removing neutral sentiment keywords has no impact on the Gaussian NB classifier's accuracy, though it is quite hard to improve on 999/1000 correct prediction.
 - For Complement and MultiNomial classifiers, removing neutral keywrods improved accuracy by 0.2%.
 - For Bernoulli classifier, removing neutral keywords improved accuracy by 0.9%.
- The Bernoulli classifier accuracy was improved 0.8% first by grouping similar keywords (Brilliance + Brilliant, 8/10 + 9/10 + 10/10)