In [96]:
import re
from bs4 import BeautifulSoup 
import nltk
from nltk.corpus import stopwords
import pandas as pd  
import numpy as np
from sklearn.model_selection import train_test_split


In [97]:
def review_to_words( raw_review ):

    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words )) 

In [98]:
train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
X_train, X_test, y_train, y_test = train_test_split(train["review"], train["sentiment"], test_size=0.2, train_size=0.8)

In [99]:
refined_reviews=[review_to_words(review) for review in X_train]


In [100]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word", \
tokenizer = None, \
preprocessor = None, \
stop_words = None, \
max_features = 3000) 
refined_reviews1 = vectorizer.fit_transform(refined_reviews)
refined_reviews1 = refined_reviews1.toarray()
print (refined_reviews1)   

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [101]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.00001, class_prior=None, fit_prior=True)
clf.fit(refined_reviews1, np.array(y_train))


MultinomialNB(alpha=1e-05, class_prior=None, fit_prior=True)

In [102]:
test_refined_reviews=[review_to_words(review) for review in X_test]
tX1 = vectorizer.transform(test_refined_reviews).toarray()
print(clf.predict(tX1))


[1 0 1 ... 1 0 1]


In [103]:
accuracy01=clf.score(tX1,y_test) *100
print (accuracy01)

83.46000000000001


In [104]:
clf = MultinomialNB(alpha=5, class_prior=None, fit_prior=True)
clf.fit(refined_reviews1, np.array(y_train))


MultinomialNB(alpha=5, class_prior=None, fit_prior=True)

In [105]:
test_refined_reviews=[review_to_words(review) for review in X_test]
tX2 = vectorizer.transform(test_refined_reviews).toarray()
print(clf.predict(tX2))

[1 0 1 ... 1 0 1]


In [106]:
accuracy02=clf.score(tX2,y_test) *100
print (accuracy02)

83.48


In [115]:
vectorizer2 = CountVectorizer(analyzer = "word", \
tokenizer = None, \
preprocessor = None, \
stop_words = None, \
max_features = 5000) 
refined_reviews2 = vectorizer2.fit_transform(refined_reviews)
refined_reviews2= refined_reviews2.toarray()
print (refined_reviews2) 

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [116]:
clf = MultinomialNB(alpha=0.00001, class_prior=None, fit_prior=True)
clf.fit(refined_reviews2, np.array(y_train))

MultinomialNB(alpha=1e-05, class_prior=None, fit_prior=True)

In [117]:
test_refined_reviews=[review_to_words(review) for review in X_test]
tX3 = vectorizer2.transform(test_refined_reviews).toarray()
print(clf.predict(tX3))


[1 0 1 ... 1 0 1]


In [118]:
accuracy03=clf.score(tX3,y_test) *100
print (accuracy03)

84.14


In [119]:
clf2 = MultinomialNB(alpha=5, class_prior=None, fit_prior=True)
clf2.fit(refined_reviews2, np.array(y_train))


MultinomialNB(alpha=5, class_prior=None, fit_prior=True)

In [120]:
test_refined_reviews=[review_to_words(review) for review in X_test]
tX4 = vectorizer2.transform(test_refined_reviews).toarray()
print(clf2.predict(tX4))

[1 0 1 ... 1 0 1]


In [122]:
accuracy04=clf2.score(tX4,y_test) *100
print (accuracy04)

84.2


In [128]:
import pandas as pd
data = [[3000, 0.00001,83.46000000000001], [3000, 5,83.48],[5000, 0.00001,84.14], [5000, 5,84.2]]
pd.DataFrame(data, columns=["Vocabulary Size", "Alpha","Accuracy on 20% validation set"])

Unnamed: 0,Vocabulary Size,Alpha,Accuracy on 20% validation set
0,3000,1e-05,83.46
1,3000,5.0,83.48
2,5000,1e-05,84.14
3,5000,5.0,84.2
