# Tutorial - build MNB with sklearn

This tutorial demonstrates how to use the Sci-kit Learn (sklearn) package to build Multinomial Naive Bayes model, rank features, and use the model for prediction. 

The data from the Kaggle Sentiment Analysis on Movie Review Competition are used in this tutorial. Check out the details of the data and the competition on Kaggle.
https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

The tutorial also includes sample code to prepare your prediction result for submission to Kaggle. Although the competition is over, you can still submit your prediction to get an evaluation score.

# Step 1: Read in data

In [1]:


import pandas as p 
train=p.read_csv("/Users/kenmckee/Desktop/GS/S18/tm/HW6/ddcfx.csv", delimiter=',') 
y=train['sentiment'].values 
X=train['review'].values 
z=train['lie'].values

In [2]:
# check the sklearn documentation for train_test_split
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# "test_size" : float, int, None, optional
# If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. 
# If int, represents the absolute number of test samples. 
# If None, the value is set to the complement of the train size. 
# By default, the value is set to 0.25. The default will change in version 0.21. It will remain 0.25 only if train_size is unspecified, otherwise it will complement the specified train_size.    

from sklearn.model_selection import train_test_split
Xy_train, Xy_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
Xz_train, Xz_test, z_train, z_test = train_test_split(X, z, test_size=0.4, random_state=0)

X_train = Xy_train
X_test = Xy_test

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(y_train)
print(z_train)
print(X_train[0:5])
print(X_test[0:5])
print(X_test[0:5])
print(y_test)
print(z_test)



(55,) (55,) (37,) (37,)
['p' 'p' 'p' 'n' 'n' 'n' 'n' 'n' 'n' 'p' 'p' 'n' 'n' 'n' 'p' 'n' 'n' 'n'
 'p' 'n' 'n' 'p' 'p' 'p' 'n' 'n' 'p' 'n' 'p' 'n' 'n' 'p' 'p' 'p' 'p' 'n'
 'p' 'p' 'n' 'n' 'p' 'n' 'p' 'p' 'n' 'p' 'n' 'n' 'p' 'n' 'p' 'p' 'p' 'p'
 'n']
['t' 'f' 'f' 'f' 'f' 'f' 't' 't' 'f' 't' 'f' 'f' 't' 't' 'f' 'f' 't' 't'
 'f' 'f' 't' 'f' 'f' 't' 'f' 't' 't' 'f' 'f' 'f' 't' 'f' 't' 't' 't' 'f'
 't' 't' 't' 't' 'f' 't' 'f' 'f' 'f' 't' 't' 'f' 't' 'f' 't' 'f' 'f' 'f'
 't']
["'My sister and I ate at this restaurant called Matador. The overall look and ambiance of the restaurant was very appealing. We first ordered strawberry margaritas--which were really good.Then my sister ordered a spinach lasagna with Alfredo sauce and I ordered Pasta ravioli with marinara sauce. My sister and I unanimously agreed they were the best pastas we had ever had. It was a beautiful blend of flavors which complimented each other. I would totally recommend Matador and it was an overall amazing experience.'"
 "'Th

# Step 2.1 Data Checking

In [3]:
# Check how many training examples in each category
# this is important to see whether the data set is balanced or skewed

#print(list(y_train))

training_labelsy = set(y_train)
print(training_labelsy)
from scipy.stats import itemfreq
training_category_disty = itemfreq(y_train)
print(training_category_disty)

training_labelsz = set(z_train)
print(training_labelsz)
from scipy.stats import itemfreq
training_category_distz = itemfreq(z_train)
print(training_category_distz)

{'p', 'n'}
[['n' 28]
 ['p' 27]]
{'t', 'f'}
[['f' 29]
 ['t' 26]]


`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  if __name__ == '__main__':
`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  from ipykernel import kernelapp as app


In [4]:
# Print out the category distribution in the test data set. 
#Is the test data set's category distribution similar to the training data set's?

# Your code starts here
#train=p.read_csv("/Users/kenmckee/Desktop/GS/S18/tm/ks/test.tsv", delimiter='\t')

list(y_test)

test_labelsy = set(y_test)
print(test_labelsy)
from scipy.stats import itemfreq
test_category_disty = itemfreq(y_test)
print(test_category_disty)

list(z_test)

test_labelsz = set(z_test)
print(test_labelsz)
from scipy.stats import itemfreq
test_category_distz = itemfreq(z_test)
print(test_category_distz)
# Your code ends here

{'p', 'n'}
[['n' 18]
 ['p' 19]]
{'t', 'f'}
[['f' 17]
 ['t' 20]]


`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  if sys.path[0] == '':
`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`


# Step 3: Vectorization

In [5]:
# sklearn contains two vectorizers

# CountVectorizer can give you Boolean or TF vectors
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

# TfidfVectorizer can give you TF or TFIDF vectors
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# Read the sklearn documentation to understand all vectorization options

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold

# several commonly used vectorizer setting

#  unigram boolean vectorizer, set minimum document frequency to 5
unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=1, stop_words='english')

#  unigram term frequency vectorizer, set minimum document frequency to 5
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1, stop_words='english')

#  unigram and bigram term frequency vectorizer, set minimum document frequency to 5
gram12_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(2,3), min_df=2, stop_words='english')

#  unigram tfidf vectorizer, set minimum document frequency to 5
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=1, stop_words='english')

VT = VarianceThreshold(threshold=.5)



## Step 3.1: Vectorize the training data

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([english_stemmer.stem(w) for w in analyzer(doc)])

stem_vectorizer = StemmedCountVectorizer(min_df=3, analyzer="word", stop_words='english')
X_train_stem = stem_vectorizer.fit_transform(X_train)



In [7]:
# check the content of a document vector
print(X_train_stem.shape)
print(X_train_stem[0].toarray())

# check the size of the constructed vocabulary
print(len(stem_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(stem_vectorizer.vocabulary_.items())[:10])

# check word index in vocabulary
print(stem_vectorizer.vocabulary_.get('restaur'))

(55, 40)
[[1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 3 0 0 1 2 0 0 0 0 0 0
  0 0 0 0]]
40
[('restaur', 29), ('call', 4), ('look', 22), ('order', 25), ('realli', 28), ('good', 15), ('best', 3), ('amaz', 0), ('experi', 9), ('servic', 31)]
29


In [8]:
# The vectorizer can do "fit" and "transform"
# fit is a process to collect unique tokens into the vocabulary
# transform is a process to convert each document to vector based on the vocabulary
# These two processes can be done together using fit_transform(), or used individually: fit() or transform()

# fit vocabulary in training documents and transform the training documents into vectors
X_train_tfidf = unigram_tfidf_vectorizer.fit_transform(X_train)
X_train_bool = unigram_bool_vectorizer.fit_transform(X_train)
X_train_count = unigram_count_vectorizer.fit_transform(X_train)
X_train_gram12 = gram12_count_vectorizer.fit_transform(X_train)

X_train_VT = VT.fit_transform(X_train_count)

# check the content of a document vector
print(X_train_gram12.shape)
print(X_train_gram12[:10].toarray())
print(X_train_tfidf.shape)
print(X_train_tfidf[:10].toarray())

print("check the size of the constructed vocabulary")
print(len(unigram_bool_vectorizer.vocabulary_))
print(len(gram12_count_vectorizer.vocabulary_))
print(len(unigram_tfidf_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_bool_vectorizer.vocabulary_.items())[:10])
print(list(gram12_count_vectorizer.vocabulary_.items())[:10])
print(list(unigram_tfidf_vectorizer.vocabulary_.items())[:10])

# check word index in vocabulary
print(unigram_bool_vectorizer.vocabulary_.get('restaurant'))
print(gram12_count_vectorizer.vocabulary_.get('restaurant'))
print(unigram_tfidf_vectorizer.vocabulary_.get('restaurant'))

(55, 21)
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
(55, 321)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.23916021 ... 0.23916021 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
check the size of the constructed vocabulary
321
21
321
[('sister', 245), ('ate', 16),

In [9]:
# The vectorizer can do "fit" and "transform"
# fit is a process to collect unique tokens into the vocabulary
# transform is a process to convert each document to vector based on the vocabulary

X_test_tfidf = unigram_tfidf_vectorizer.transform(X_test)
X_test_stem = stem_vectorizer.transform(X_test)
X_test_bool = unigram_bool_vectorizer.transform(X_test)
#sel_X_train_bool = sel.fit_transform(X_test_bool)

X_test_count = unigram_count_vectorizer.transform(X_test)
X_test_gram12 = gram12_count_vectorizer.transform(X_test)
X_test_stem_vec = stem_vectorizer.transform(X_test)

X_test_VT = VT.transform(X_test_count)
# print out #examples and #features in the test set
print(X_test_gram12.shape)
print(X_test_tfidf.shape)
print(X_test_bool.shape)
print(X_test_VT.shape)




(37, 21)
(37, 321)
(37, 321)
(37, 1)


# Step 4: Train a MNB classifier

In [10]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf= MultinomialNB()


# use the training data to train the MNB model
nb_clf.fit(X_train_gram12,y_train)
nb_clf.fit(X_train_gram12,z_train)

nb_clf.fit(X_train_tfidf,y_train)
nb_clf.fit(X_train_tfidf,z_train)

nb_clf.fit(X_train_bool,y_train)
nb_clf.fit(X_train_bool,z_train)

nb_clf.fit(X_train_stem,y_train)
nb_clf.fit(X_train_stem,z_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel_X_train_bool = sel.fit_transform(X_train_bool)

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel_X_train_bool = sel.fit_transform(X_train_bool)

# Step 4.1 Interpret a trained MNB model

In [11]:
## interpreting naive Bayes models
## by consulting the sklearn documentation you can also find out how to print the coef_ for naive Bayes 
## which are the conditional probabilities
## http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

# the code below will print out the conditional prob of the word "worthless" in each category
# sample output
# -8.98942647599 -> logP('worthless'|very negative')
# -11.1864401922 -> logP('worthless'|negative')
# -12.3637684625 -> logP('worthless'|neutral')
# -11.9886066961 -> logP('worthless'|positive')
# -11.0504454621 -> logP('worthless'|very positive')
# the above output means the word feature "worthless" is indicating "very negative" 
# because P('worthless'|very negative) is the greatest among all conditional probs

stem_vectorizer.vocabulary_.get('owners')
for i in range(0,1):
  print(nb_clf.coef_[i][stem_vectorizer.vocabulary_.get('owners')])

stem_vectorizer.vocabulary_.get('owners')
for i in range(0,1):
  print(nb_clf.coef_[i][gram12_count_vectorizer.vocabulary_.get('owners')])




[[-3.99513791 -4.40060302 -3.48431229 -3.14784005 -4.40060302 -3.99513791
  -3.99513791 -4.40060302 -3.99513791 -3.30199073 -4.40060302 -3.99513791
  -2.52880084 -3.99513791 -3.30199073 -3.14784005 -3.70745584 -5.0937502
  -3.99513791 -3.99513791 -3.70745584 -3.70745584 -3.99513791 -4.40060302
  -3.99513791 -3.01430866 -3.99513791 -3.99513791 -3.48431229 -2.09801793
  -4.40060302 -4.40060302 -4.40060302 -3.99513791 -4.40060302 -3.99513791
  -5.0937502  -5.0937502  -3.99513791 -3.01430866]]
[[-3.99513791 -4.40060302 -3.48431229 -3.14784005 -4.40060302 -3.99513791
  -3.99513791 -4.40060302 -3.99513791 -3.30199073 -4.40060302 -3.99513791
  -2.52880084 -3.99513791 -3.30199073 -3.14784005 -3.70745584 -5.0937502
  -3.99513791 -3.99513791 -3.70745584 -3.70745584 -3.99513791 -4.40060302
  -3.99513791 -3.01430866 -3.99513791 -3.99513791 -3.48431229 -2.09801793
  -4.40060302 -4.40060302 -4.40060302 -3.99513791 -4.40060302 -3.99513791
  -5.0937502  -5.0937502  -3.99513791 -3.01430866]]


In [12]:
# sort the conditional probability for category 0 "very negative"
# print the words with highest conditional probs
# these can be words popular in the "very negative" category alone, or words popular in all cateogires

feature_ranks = sorted(zip(nb_clf.coef_[0], unigram_tfidf_vectorizer.get_feature_names()))
very_negative_features = feature_ranks[-5:]
print(very_negative_features)

[(-3.147840051751449, 'asked'), (-3.0143086591269266, 'believe'), (-3.0143086591269266, 'bring'), (-2.5288008433452256, 'applied'), (-2.0980179272527715, 'birthday')]


In [13]:
# sort the conditional probability for category 0 "postive"
# print the words with highest conditional probs
# these can be words popular in the "positive" category alone, or words popular in all cateogires

positive = sorted(zip(nb_clf.coef_[0], unigram_tfidf_vectorizer.get_feature_names()))
positive = feature_ranks[-10:]

real = sorted(zip(nb_clf.coef_[0], unigram_tfidf_vectorizer.get_feature_names()))
real = feature_ranks[-10:]
print(positive)
print(real)

[(-3.484312288372662, 'acknowledge'), (-3.484312288372662, 'better'), (-3.3019907315787074, 'appealing'), (-3.3019907315787074, 'ask'), (-3.147840051751449, 'agreed'), (-3.147840051751449, 'asked'), (-3.0143086591269266, 'believe'), (-3.0143086591269266, 'bring'), (-2.5288008433452256, 'applied'), (-2.0980179272527715, 'birthday')]
[(-3.484312288372662, 'acknowledge'), (-3.484312288372662, 'better'), (-3.3019907315787074, 'appealing'), (-3.3019907315787074, 'ask'), (-3.147840051751449, 'agreed'), (-3.147840051751449, 'asked'), (-3.0143086591269266, 'believe'), (-3.0143086591269266, 'bring'), (-2.5288008433452256, 'applied'), (-2.0980179272527715, 'birthday')]


In [14]:
# pretty print of top and bottom features

# This is a function I found from stackexchange, and adapted a little bit
# The purpose is to print the top and bottom features nicely
# https://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers

# You can find many useful scripts from stackexchange or GitHub
# Most tasks are not so unique, so someone in this world might have done something similar and shared their code

def show_most_and_least_informative_features(vectorizer, clf, class_idx=0, n=10):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[class_idx], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[-n:])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [15]:
# show most positive features (category 4)
print("positive-negative")
show_most_and_least_informative_features(unigram_tfidf_vectorizer, nb_clf, class_idx=0, n=10)
print("true-false")
show_most_and_least_informative_features(unigram_tfidf_vectorizer, nb_clf, class_idx=0, n=10)

# show most positive features (category 4)
print("positive-negative")
show_most_and_least_informative_features(gram12_count_vectorizer, nb_clf, class_idx=0, n=10)
print("true-false")
show_most_and_least_informative_features(gram12_count_vectorizer, nb_clf, class_idx=0, n=10)

positive-negative
	-5.0938	atmosphere     		-3.4843	acknowledge    
	-5.0938	boat           		-3.4843	better         
	-5.0938	box            		-3.3020	appealing      
	-4.4006	6pm            		-3.3020	ask            
	-4.4006	air            		-3.1478	agreed         
	-4.4006	ambiance       		-3.1478	asked          
	-4.4006	appetizer      		-3.0143	believe        
	-4.4006	beef           		-3.0143	bring          
	-4.4006	bit            		-2.5288	applied        
	-4.4006	bland          		-2.0980	birthday       
true-false
	-5.0938	atmosphere     		-3.4843	acknowledge    
	-5.0938	boat           		-3.4843	better         
	-5.0938	box            		-3.3020	appealing      
	-4.4006	6pm            		-3.3020	ask            
	-4.4006	air            		-3.1478	agreed         
	-4.4006	ambiance       		-3.1478	asked          
	-4.4006	appetizer      		-3.0143	believe        
	-4.4006	beef           		-3.0143	bring          
	-4.4006	bit            		-2.5288	applied        
	-4.4006	bland       

In [16]:
#print("y")
#nb_clfy.coef_[0][1]
#print("z")
#nb_clfz.coef_[0][1]


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X_new = SelectKBest(chi2, k=1).fit_transform(X_train_stem, y_train)
X_new.shape
print("X_new \n",X_new)
X_newtf = SelectKBest(chi2, k=1).fit_transform(X_train_tfidf, y_train)
X_newtf.shape
print("X_newtf \n", X_newtf)

X_new 
   (0, 0)	1
  (10, 0)	1
  (14, 0)	1
  (18, 0)	1
  (22, 0)	1
  (23, 0)	1
  (26, 0)	1
  (33, 0)	1
  (37, 0)	1
  (42, 0)	2
  (52, 0)	1
  (53, 0)	1
  (54, 0)	1
X_newtf 
   (0, 0)	0.07755537782497592
  (10, 0)	0.17793928209058169
  (14, 0)	0.14792327783504614
  (18, 0)	0.18684314815978417
  (22, 0)	0.31937437619961495
  (23, 0)	0.11552358499673741
  (26, 0)	0.10165322847399401
  (33, 0)	0.08023216920914446
  (37, 0)	0.2545624471187513
  (42, 0)	0.41534253430612766
  (52, 0)	0.10528246827684114
  (53, 0)	0.18355573600372138
  (54, 0)	0.15588761705592066


# Exercise C

In [17]:
# calculate log ratio of conditional probs

# In this exercise you will calculate the log ratio 
# between conditional probs in the "very negative" category
# and conditional probs in the "very positive" category,
# and then sort and print out the top and bottom 10 words

# the conditional probs for the "very negative" category is stored in nb_clf.coef_[0]
# the conditional probs for the "very positive" category is stored in nb_clf.coef_[4]

# You can consult with similar code in week 4's sample script on feature weighting
# Note that in sklearn's MultinomialNB the conditional probs have been converted to log values.

# Your code starts here

ratios = (nb_clf.feature_log_prob_[0]/nb_clf.feature_log_prob_[1])
feature_ranks = sorted(zip(ratios,unigram_count_vectorizer.get_feature_names()))
bottom_features = feature_ranks[:10]
top_features = feature_ranks[10:]
print("Top Features:")
[print(x[0],x[1]) for x in reversed(top_features) ]
print("Bottom Features:")
[print(x[0],x[1]) for x in bottom_features]



# Your code ends here

Top Features:
1.4456088276651355 acknowledge
1.2511246597660637 asked
1.2223352667303482 applied
1.2111096288165546 believe
1.1716405021888427 bag
1.1429336430591281 birthday
1.1303063525298733 better
1.1055931218644974 ask
1.087272959628166 bread
1.087272959628166 blast
1.087272959628166 bento
1.087272959628166 began
1.087272959628166 beautiful
1.087272959628166 authentic
1.087272959628166 area
1.087272959628166 applebee
1.087272959628166 alfredo
1.0254565152108925 bring
0.9857833197145557 blue
0.9857833197145557 bad
0.9857833197145557 american
0.9857833197145557 5pm
0.9846801685983319 ate
0.936114818190439 appealing
0.924492384586033 bar
0.902119541778304 agreed
0.8949546904425452 blend
0.8949546904425452 blanking
0.8949546904425452 beef
0.8949546904425452 appetizer
Bottom Features:
0.7166936142089452 atmosphere
0.7166936142089452 boat
0.7166936142089452 box
0.7737010639774482 best
0.8295813606674715 6pm
0.8295813606674715 bit
0.8295813606674715 bland
0.857921494916488 amazing
0.8949

[None, None, None, None, None, None, None, None, None, None]

# Step 5: Test the MNB classifier

In [18]:
# test the classifier on the test data set, print accuracy score
print(X_test_bool.shape)
print(y_test.shape)

#print(X_train_gram12[:10].toarray())
print(X_test_bool[:10].toarray())

#nb_clf.score(X_test_bool,y_test)


#nb_clfy.score(X_test_vec,y_test)

print(X_test_gram12.shape)
print(y_test.shape)
print(X_train_gram12.shape)

print(y_train.shape)
print(z_train.shape)

print(X_test_gram12)



(37, 321)
(37,)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(37, 21)
(37,)
(55, 21)
(55,)
(55,)
  (4, 3)	1
  (4, 4)	1
  (10, 11)	1


In [19]:
# print confusion matrix (row: ground truth; col: prediction)
#X_train_tfidf = unigram_tfidf_vectorizer.fit_transform(X_train)
#X_train_bool = unigram_bool_vectorizer.fit_transform(X_train)
#X_train_count = unigram_count_vectorizer.fit_transform(X_train)
#X_train_gram12 = gram12_count_vectorizer.fit_transform(X_train)



from sklearn.metrics import confusion_matrix
#y_true = y_test[1]
y_pred = nb_clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
z_pred = nb_clf.fit(X_train_tfidf, z_train).predict(X_test_tfidf)
cmy=confusion_matrix(y_test, y_pred)
cmz=confusion_matrix(z_test, z_pred)

print("Test tfidf precision and recall" )
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))
print("Train tfidf CM for positive and negative review")
print(cmy)

print("Train tfidf CM for true and false review")
print(cmz)

y_predbool = nb_clf.fit(X_train_bool, y_train).predict(X_test_bool)
z_predbool = nb_clf.fit(X_train_bool, z_train).predict(X_test_bool)
cmy=confusion_matrix(y_test, y_predbool)
cmz=confusion_matrix(z_test, z_predbool)

print("Test boolean prcision and recall")
print(precision_score(y_test, y_predbool, average=None))
print(recall_score(y_test, y_predbool, average=None))
print(cmy)

print("Test tfidf CM for true and false review")
print(cmz)

y_count = nb_clf.fit(X_train_count, y_train).predict(X_test_count)
z_count = nb_clf.fit(X_train_count, z_train).predict(X_test_count)
cmycount=confusion_matrix(y_test, y_count)
cmzcount=confusion_matrix(z_test, y_count)

print(precision_score(y_test, y_count, average=None))
print(recall_score(y_test, y_count, average=None))

print("Test boolean CM for positive and negative review")
print(cmycount)

print("Test boolean CM for true and false review")
print(cmzcount)

y_pstem = nb_clf.fit(X_train_stem, y_train).predict(X_test_stem)
z_pstem = nb_clf.fit(X_train_stem, z_train).predict(X_test_stem)
cmystem=confusion_matrix(y_test, y_pstem)
cmzstem=confusion_matrix(z_test, z_pstem)

print(precision_score(y_test, y_pstem, average=None))
print(recall_score(y_test, y_pstem, average=None))

print("Test stem CM for positive and negative review")
print(cmystem)

print("Test stem CM for true and false review")
print(cmzstem)



Test tfidf precision and recall


NameError: name 'precision_score' is not defined

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = nb_clf.fit(X_train_bool, y_train).predict(X_test_bool)
z_pred = nb_clf.fit(X_train_bool, z_train).predict(X_test_bool)

cmyg12=confusion_matrix(y_test, y_pred)
cmzg12=confusion_matrix(z_test, z_pred)

print(precision_score(y_test, y_predg12, average=None))
print(recall_score(y_test, y_predg12, average=None))

print("Test bool NB CM for positive and negative review")
print(cmyg12)

print("Test bool NB CM for true and false review")
print(cmzg12)



In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print(X_train_bool.shape)

#y_true = y_test[1]
y_pred = nb_clf.fit(X_train_bool, y_train).predict(X_test_bool)
z_pred = nb_clf.fit(X_train_bool, z_train).predict(X_test_bool)
cmy=confusion_matrix(y_test, y_pred)
cmz=confusion_matrix(z_test, z_pred)
print("Test bool precision and recall y" )
target_names = ['0','1','2','3']
print(classification_report(y_test, y_pred, target_names=target_names))

print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))
print(accuracy_score(y_test, y_pred))
print(cmy)

print("\n \n")
print("Test bool precision and recall accuracy z" )
print(classification_report(z_test, z_pred, target_names=target_names))
print(precision_score(z_test, z_pred, average=None))
print(recall_score(z_test, z_pred, average=None))
print(accuracy_score(z_test, z_pred))

print(cmz)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print(X_train_tfidf.shape)

#y_true = y_test[1]
y_pred = nb_clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
z_pred = nb_clf.fit(X_train_tfidf, z_train).predict(X_test_tfidf)
cmy=confusion_matrix(y_test, y_pred)
cmz=confusion_matrix(z_test, z_pred)
print("Test tfidf precision and recall y" )
target_names = ['0','1','2','3']
print(classification_report(y_test, y_pred, target_names=target_names))

print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))
print(accuracy_score(y_test, y_pred))
print(cmy)

print("\n \n")
print("Test tfidf precision and recall accuracy z" )
print(classification_report(z_test, z_pred, target_names=target_names))
print(precision_score(z_test, z_pred, average=None))
print(recall_score(z_test, z_pred, average=None))
print(accuracy_score(z_test, z_pred))

print(cmz)



In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print(X_train_stem.shape)

#y_true = y_test[1]
y_pred = nb_clf.fit(X_train_stem, y_train).predict(X_test_stem)
z_pred = nb_clf.fit(X_train_stem, z_train).predict(X_test_stem)
cmy=confusion_matrix(y_test, y_pred)
cmz=confusion_matrix(z_test, z_pred)
print("Test stem precision and recall y" )
target_names = ['0','1','2','3']
print(classification_report(y_test, y_pred, target_names=target_names))

print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))
print(accuracy_score(y_test, y_pred))
print(cmy)

print("\n \n")
print("Test stem precision and recall accuracy z" )
print(classification_report(z_test, z_pred, target_names=target_names))
print(precision_score(z_test, z_pred, average=None))
print(recall_score(z_test, z_pred, average=None))
print(accuracy_score(z_test, z_pred))

print(cmz)



# BenoulliNB

In [None]:

from sklearn.naive_bayes import BernoulliNB

BNB = BernoulliNB()
print(X_train_bool.shape)

y_predb = BNB.fit(X_train_bool, y_train).predict(X_test_bool)
z_predb = BNB.fit(X_train_bool, z_train).predict(X_test_bool)

cmyb=confusion_matrix(y_test, y_predb)
cmzb=confusion_matrix(z_test, z_predb)

print("Test BNB precision and recall y" )
target_names = ['0','1','2','3']
print(classification_report(y_test, y_predb, target_names=target_names))

print(precision_score(y_test, y_predb, average=None))
print(recall_score(y_test, y_predb, average=None))
print(accuracy_score(y_test, y_predb))
print(cmyb)

print("\n \n")
print("Test BNB precision and recall accuracy z" )
print(classification_report(z_test, z_predb, target_names=target_names))
print(precision_score(z_test, z_predb, average=None))
print(recall_score(z_test, z_predb, average=None))
print(accuracy_score(z_test, z_predb))
print(cmzb)

print("\n \n")


print(X_train_stem.shape)

y_pred = BNB.fit(X_train_stem, y_train).predict(X_test_stem)
z_pred = BNB.fit(X_train_stem, z_train).predict(X_test_stem)

print(precision_score(y_test, y_predg12, average=None))
print(recall_score(y_test, y_predg12, average=None))

cmy=confusion_matrix(y_test, y_pred)
cmz=confusion_matrix(z_test, z_pred)

print("Test BNB/stem precision and recall y" )
target_names = ['0','1','2','3']
print(classification_report(y_test, y_pred, target_names=target_names))

print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))
print(accuracy_score(y_test, y_pred))
print(cmy)

print("\n \n")
print("Test BNB/stem precision and recall accuracy z" )
print(classification_report(z_test, z_pred, target_names=target_names))
print(precision_score(z_test, z_pred, average=None))
print(recall_score(z_test, z_pred, average=None))
print(accuracy_score(z_test, z_pred))

print(cmz)


In [None]:
print(X_train_tfidf.shape)

#y_true = y_test[1]
y_pred = nb_clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
z_pred = nb_clf.fit(X_train_tfidf, z_train).predict(X_test_tfidf)
cmy=confusion_matrix(y_test, y_pred)
cmz=confusion_matrix(z_test, z_pred)
print("Test tfidf precision and recall y" )
target_names = ['0','1','2','3']
print(classification_report(y_test, y_pred, target_names=target_names))

print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))
print(accuracy_score(y_test, y_pred))
print(cmy)

print("\n \n")
print("Test tfidf precision and recall accuracy z" )
print(classification_report(y_test, z_pred, target_names=target_names))
print(precision_score(z_test, z_pred, average=None))
print(recall_score(z_test, z_pred, average=None))
print(accuracy_score(z_test, z_pred))

print(cmz)

# Cross Validation

In [None]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary=False)),('nb', MultinomialNB())])
scoresy = cross_val_score(nb_clf_pipe, X, y, cv=10)
avgy=sum(scoresy)/len(scoresy)
print(avgy)
print(scoresy[:10])


scoresz = cross_val_score(nb_clf_pipe, X, z, cv=10)
avgz=sum(scoresz)/len(scoresz)
print(avgz)
print(scoresz[:10])

In [None]:
# run 3-fold cross validation to compare the performance of 
# (1) BernoulliNB (2) MultinomialNB with TF vectors (3) MultinomialNB with boolean vectors

# Your code starts here
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary=False)),('nb', BernoulliNB())])

scores = cross_val_score(nb_clf_pipe, X, y, cv=10)
avg=sum(scores)/len(scores)
print(avg)
print(scores[:10])


scoresz = cross_val_score(nb_clf_pipe, X, z, cv=10)
avgz=sum(scoresz)/len(scoresz)
print(avgz)
print(scoresz[:10])

# Your code ends here

# Step 5.1 Interpret the prediction result

In [None]:
## find the calculated posterior probability
posterior_probs = nb_clf.predict_proba(X_test_tfidf)

## find the posterior probabilities for the first test example
print(posterior_probs[0])

# find the category prediction for the first test example
y_pred = nb_clf.predict(X_test_tfidf)
print(y_pred[0])

# check the actual label for the first test example
print(y_test[0])

# Step 5.2 Error Analysis

In [None]:
# print out specific type of error for further analysis

# print out the very positive examples that are mistakenly predicted as negative
# according to the confusion matrix, there should be 53 such examples
# note if you use a different vectorizer option, your result might be different

err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]==1 and y_predtfidf[i]==1):
        print(X_test_tfidf[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]==1 and y_predg12[i]==1):
        print(X_test_bool[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

# Exercise D

In [None]:
# Can you find linguistic patterns in the above errors? 
# What kind of very positive examples were mistakenly predicted as negative?

# Can you write code to print out the errors that very negative examples were mistakenly predicted as very positive?
# Can you find lingustic patterns for this kind of errors?
# Based on the above error analysis, what suggestions would you give to improve the current model?

# Your code starts here
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]==4 and y_pred[i]==0):
        print(X_test_tfidf[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)
# Your code ends here

# Step 6: write the prediction output to file

In [None]:
y_pred=nb_clf.predict(X_test_tfidf)
print(y_pred)
output = open('/Users/kenmckee/Desktop/GS/S18/tm/HW6/prediction_outputHW6.csv', 'w')
for x, value in enumerate(y_pred):
  output.write(str(value) + '\n') 
output.close()

# Step 6.1 Prepare submission to Kaggle sentiment classification competition

In [None]:
print(X_train)

In [None]:
output = open('/Users/kenmckee/Desktop/GS/S18/tm/HW6/X.tsv', 'w')
for x, value in enumerate(X_train):
  output.write(str(value) + '\n') 
output.close()

output = open('/Users/kenmckee/Desktop/GS/S18/tm/HW6/testY.tsv', 'w')
for x, value in enumerate(y_train):
  output.write(str(value) + '\n') 
output.close()
 
output.close()
output = open('/Users/kenmckee/Desktop/GS/S18/tm/HW6/testZ.tsv', 'w')
for x, value in enumerate(z_train):
  output.write(str(value) + '\n') 
output.close()

In [None]:
########## submit to HW6 submission

# we are still using the model trained on 60% of the training data
# you can re-train the model on the entire data set 
#   and use the new model to predict the HW6 test data
# below is sample code for using a trained model to predict HW6 test data 
#    and format the prediction output for HW6 submission

# read in the test data
HW6_testX=p.read_csv("/Users/kenmckee/Desktop/GS/S18/tm/HW6//testX.tsv", delimiter='\t') 
HW6_idsY=p.read_csv("/Users/kenmckee/Desktop/GS/S18/tm/HW6//testY.tsv", delimiter='\t') 
HW6_testZ=p.read_csv("/Users/kenmckee/Desktop/GS/S18/tm/HW6//testZ.tsv", delimiter='\t') 

#print(HW6_idsY)

# vectorize the test examples using the vocabulary fitted from the 60% training data
HW6_X_test_vec=unigram_tfidf_vectorizer.transform(HW6_testX)

# predict using the NB classifier that we built
HW6_pred=nb_clf.fit(X_train_tfidf, y_train).predict(HW6_X_test_vec)

# combine the test example ids with their predictions
HW6_submission=zip(HW6_idsY, HW6_pred)

print(HW6_X_test_vec)

# prepare output file
outf=open('/Users/kenmckee/Desktop/GS/S18/tm/HW6/HW6_submission.csv', 'w')

# write header
outf.write('PhraseId,Sentiment\n')

# write predictions with ids to the output file
for x, value in enumerate(HW6_submission): outf.write(str(value[0]) + ',' + str(value[1]) + '\n')

# close the output file
outf.close()

# Exercise E

In [None]:
# generate your HW6 submissions with boolean representation and TF representation
# read in the test data
HW6_test=p.read_csv("/Users/kenmckee/Desktop/GS/S18/tm/HW6/test.tsv", delimiter='\t') 

# preserve the id column of the test examples
HW6_ids=HW6_test['PhraseId'].values

# read in the text content of the examples
HW6_X_test=HW6_test['Phrase'].values

# vectorize the test examples using the vocabulary fitted from the 60% training data
HW6_X_test_vec=unigram_tfidf_vectorizer.transform(HW6_X_test)

# predict using the NB classifier that we built
HW6_pred=nb_clf.fit(X_train_vec, y_train).predict(HW6_X_test_vec)

# combine the test example ids with their predictions
HW6_submission=zip(HW6_ids, HW6_pred)

# prepare output file
outf=open('/Users/kenmckee/Desktop/GS/S18/tm/HW6/HW6_submission.csv', 'w')

# write header
outf.write('PhraseId,Sentiment\n')

# write predictions with ids to the output file
for x, value in enumerate(HW6_submission): outf.write(str(value[0]) + ',' + str(value[1]) + '\n')

# close the output file
outf.close()
# submit to HW6

# report your scores here
# which model gave better performance in the hold-out test
# which model gave better performance in the HW6 test