# Lab 05 Text Data



In [1]:
import numpy as np
import pandas as df

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression




### Download Movie Reviews
We used the following commands to download and unpack the movie review data.

In [2]:
# note: you only need to execute this cell once!
#!wget -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P data
#!tar xzf data/aclImdb_v1.tar.gz -C data
#!rm -r data/aclImdb/train/unsup

In [3]:
from sklearn.datasets import load_files

# Heads-up: reading every review file from disk takes a few minutes.
reviews_train = load_files("data/aclImdb/train/")
# The returned bunch carries the documents in .data and their labels in .target.
text_train = reviews_train.data
y_train = reviews_train.target
print("type of text_train: {}".format(type(text_train)))
print("length of text_train: {}".format(len(text_train)))


type of text_train: <class 'list'>
length of text_train: 25000


In [4]:
# The reviews are bytes objects that still contain HTML line breaks; swap each tag for a space.
text_train = [review.replace(b"<br />", b" ") for review in text_train]

In [5]:
# Sanity check: the <br /> cleanup rewrites each document but must not change how many there are.
print(len(text_train))

25000


In [6]:
# Load the test split with the same loader used for the training split.
reviews_test = load_files("data/aclImdb/test/")
text_test = reviews_test.data
y_test = reviews_test.target
print("Number of documents in test data: {}".format(len(text_test)))
# bincount tallies how many documents carry each integer label.
print("Samples per class (test): {}".format(np.bincount(y_test)))
# Apply the same <br /> cleanup that was applied to the training documents.
text_test = [review.replace(b"<br />", b" ") for review in text_test]

Number of documents in test data: 25000
Samples per class (test): [12500 12500]


### Question 0

Use CountVectorizer to transform the text data by completing the following code. Assign the transformed training and test data to X_train and X_test. Note that we use <b>min_df</b> of 10 and <b>max_df</b> of .5. These settings help keep things from getting too slow. You can experiment with different values -- higher minimums and lower maximums can speed things up, potentially with a reduction in accuracy.

In [7]:
# Drop terms found in fewer than 10 documents or in more than half of them.
vect = CountVectorizer(min_df=10, max_df=.5)
# The vocabulary is learned from the training reviews only; fit() returns the vectorizer itself.
vect.fit(text_train)
X_train = vect.transform(text_train)
X_test = vect.transform(text_test)

print("X_train:\n{}".format(repr(X_train)))

X_train:
<25000x18497 sparse matrix of type '<class 'numpy.int64'>'
	with 2848822 stored elements in Compressed Sparse Row format>


### Question 1
Fit a LogisticRegression to X_train and y_train. Use the optional parameter C=.1
Print the training and test accuracy.

In [8]:
from sklearn.linear_model import LogisticRegression

# C=.1 as the exercise asks (in scikit-learn a smaller C means stronger regularization).
logreg = LogisticRegression(C=.1)
logreg.fit(X_train, y_train)
# score() reports mean accuracy on the given split.
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test score: {:.2f}".format(logreg.score(X_test, y_test)))



Training set score: 0.967
Test score: 0.88


### Question 2
Make a MultinomialNB classifier for X_train and y_train. Print training and test accuracy.

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with its default settings, trained on the same count features.
nbModel = MultinomialNB()
nbModel.fit(X_train, y_train)
print("Training set score: {:.3f}".format(nbModel.score(X_train, y_train)))
print("Test score: {:.2f}".format(nbModel.score(X_test, y_test)))

Training set score: 0.876
Test score: 0.83


### Question 3
Try changing CountVectorizer so that it constructs unigrams, bigrams and trigrams. You do this by creating a new vectorizer with the parameter ngram_range = (1,3). Now transform text_train and text_test, and report new train and test results for the two models you built above.
(It might be convenient to use the function <b>applyModel</b>, defined below.)

In [11]:
# Same frequency cut-offs as the first vectorizer, now with ngram_range=(1, 3).
# NOTE: this rebinds vect, X_train and X_test, so every later cell that uses
# these names sees the n-gram features rather than the unigram ones.
vect = CountVectorizer(min_df=10, max_df=.5, ngram_range=(1, 3))
vect.fit(text_train)
X_train = vect.transform(text_train)
X_test = vect.transform(text_test)



In [12]:
def applyModel(model, name, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split and print its training and test scores.

    model   -- object providing fit(X, y) and score(X, y).
    name    -- label placed at the start of each printed line.
    X_train, y_train -- data passed to fit() and to the first score() call.
    X_test, y_test   -- data passed to the second score() call.

    Nothing is returned; both scores are printed with three decimals.
    """
    # Score through whatever fit() hands back rather than assuming it is `model`.
    fitted = model.fit(X_train, y_train)
    train_score = fitted.score(X_train, y_train)
    print("{} Training score: {:.3f}".format(name, train_score))
    test_score = fitted.score(X_test, y_test)
    print("{} Test score: {:.3f}".format(name, test_score))



In [13]:
# Fresh, unfitted estimators; applyModel fits each one on the current X_train / y_train.
logreg = LogisticRegression(C=.1)
nbModel = MultinomialNB()
for label, estimator in (("logreg", logreg), ("naive bayes", nbModel)):
    applyModel(estimator, label, X_train, y_train, X_test, y_test)








logreg Training score: 0.999
logreg Test score: 0.897
naive bayes Training score: 0.924
naive bayes Test score: 0.871


### Question 4
For both of the above models, sort the features by their coefficients, and print the top 10 features and their coefficients.

In [15]:
N_SHOWN = 20  # how many features to list at each end of the ranking


def _feature_names(vectorizer):
    """Return the vectorizer's feature names on both old and new scikit-learn."""
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # get_feature_names_out() replaces it. Fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def _rank_features(names, weights):
    """Pair each feature name with its weight, sorted from highest to lowest weight."""
    # Plain str/float keep the printed tuples free of NumPy scalar reprs.
    pairs = ((str(name), float(weight)) for name, weight in zip(names, weights))
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)


def _report_extremes(title, ranked, n=N_SHOWN):
    """Print the n lowest and n highest entries of `ranked`; return them as (low, high)."""
    # max(...) keeps n == 0 from selecting the whole list via ranked[-0:].
    low = ranked[max(len(ranked) - n, 0):]
    high = ranked[:n]
    print(title)
    print("Lowest coefficients:")
    for pair in low:
        print(pair)
    print("Highest coefficients:")
    for pair in high:
        print(pair)
    return low, high


# Both models were fitted on the same features, so fetch the vocabulary once.
feature_names = _feature_names(vect)

coefs = logreg.coef_[0]
sorted_coefs = _rank_features(feature_names, coefs)
low, high = _report_extremes("Logistic Regression", sorted_coefs)

# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1.
# For a two-class model it was feature_log_prob_[1:], so row 1 reproduces the
# values the former coef_[0] returned.
coefs = nbModel.feature_log_prob_[1]
sorted_coefs = _rank_features(feature_names, coefs)
low, high = _report_extremes("Multinomial NB", sorted_coefs)

Logistic Regression
Lowest coefficients:
('unfortunately', -0.541858013713471)
('bad', -0.5464447135239142)
('avoid', -0.5552750228561513)
('annoying', -0.5691010503211021)
('lame', -0.5871430788862058)
('mess', -0.5915906804708063)
('lacks', -0.5999421801520495)
('the worst', -0.6050191333668404)
('dull', -0.6101205556935515)
('horrible', -0.6402199135044385)
('worse', -0.6412519970608491)
('terrible', -0.6416185026508155)
('disappointing', -0.679593345185616)
('poor', -0.6809414852586397)
('disappointment', -0.7376499566778868)
('poorly', -0.7459827842801316)
('waste', -0.8597800622899489)
('boring', -0.8789592423906969)
('worst', -0.9532294508642182)
('awful', -0.9539919970848368)
Highest coefficients:
('excellent', 0.7647372780175421)
('perfect', 0.7124699796130035)
('wonderful', 0.6594738585259878)
('superb', 0.5847949349929705)
('amazing', 0.5633187250922497)
('enjoyable', 0.5616770687000522)
('great', 0.5217275899987744)
('today', 0.503113552150552)
('brilliant', 0.4957537061751