In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_score, train_test_split
#data handling/modeling
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import scipy.stats as stats

# visualization
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
tweets = ["These debates are boring", "we want more debates", "debates are useful"]
target = [0, 1, 1]

In [3]:
vect = CountVectorizer(stop_words="english")
# Init signature: CountVectorizer(self, input=u'content', encoding=u'utf-8', decode_error=u'strict', 
#                                 strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, 
#                                 stop_words=None, token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), 
#                                 analyzer=u'word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, 
#                                 binary=False, dtype=<type 'numpy.int64'>)
# Docstring:
# Convert a collection of text documents to a matrix of token counts

# This implementation produces a sparse representation of the counts using
# scipy.sparse.coo_matrix.

# If you do not provide an a-priori dictionary and you do not use an analyzer
# that does some kind of feature selection then the number of features will
# be equal to the vocabulary size found by analyzing the data.

# stop_words : string {'english'}, list, or None (default)
#     If 'english', a built-in stop word list for English is used.

#     If a list, that list is assumed to contain stop words, all of which
#     will be removed from the resulting tokens.
#     Only applies if ``analyzer == 'word'``.

#     If None, no stop words will be used. max_df can be set to a value
#     in the range [0.7, 1.0) to automatically detect and filter stop
#     words based on intra corpus document frequency of terms.


tweet_X = vect.fit_transform(tweets)
print(tweet_X.shape)
print(tweet_X)

(3, 4)
  (0, 1)	1
  (0, 0)	1
  (1, 1)	1
  (1, 3)	1
  (2, 1)	1
  (2, 2)	1


In [4]:
tweet_X.toarray()

array([[1, 1, 0, 0],
       [0, 1, 0, 1],
       [0, 1, 1, 0]])

In [5]:
vect.get_feature_names()

[u'boring', u'debates', u'useful', u'want']

In [6]:
pd.DataFrame(tweet_X.toarray(), columns=vect.get_feature_names())

Unnamed: 0,boring,debates,useful,want
0,1,1,0,0
1,0,1,0,1
2,0,1,1,0


In [7]:
from glob import glob

In [8]:


def load_data(path, target):
    reviews = []
    for file in glob(path):
        review = open(file).read()
        reviews.append({
                "target": target,
                "review": review
            })
    return reviews

In [9]:
reviews = load_data("../data/review_polarity/txt_sentoken/neg/*", "neg") + \
    load_data("../data/review_polarity/txt_sentoken/pos/*", "pos")


In [10]:
data = pd.DataFrame(reviews)

In [11]:
print(data.shape)
print(data.head)
data.sample(10)

(2000, 2)
<bound method DataFrame.head of                                                  review target
0     plot : two teen couples go to a church party ,...    neg
1     the happy bastard's quick movie review \ndamn ...    neg
2     it is movies like these that make a jaded movi...    neg
3      " quest for camelot " is warner bros . ' firs...    neg
4     synopsis : a mentally unstable man undergoing ...    neg
5     capsule : in 2176 on the planet mars police ta...    neg
6     so ask yourself what " 8mm " ( " eight millime...    neg
7     that's exactly how long the movie felt to me ....    neg
8     call it a road trip for the walking wounded . ...    neg
9     plot : a young french boy sees his parents kil...    neg
10    best remembered for his understated performanc...    neg
11    janeane garofalo in a romantic comedy -- it wa...    neg
12    and now the high-flying hong kong style of fil...    neg
13    a movie like mortal kombat : annihilation work...    neg
14    she was

Unnamed: 0,review,target
156,senseless ( r ) marlon wayans is a very talent...,neg
163,robin williams is a comedic genus . \nthat is ...,neg
278,it would be hard to choose the best american p...,neg
1664,if you've ever perused my college comedy diary...,pos
1602,"everyone's heard about this movie , and more s...",pos
196,"susan granger's review of "" america's sweethea...",neg
321,studio 54 attracted so many weird and bizarre ...,neg
1695,who would have thought ? \njim carrey does dra...,pos
60,one of the most respected names in american in...,neg
333,"as a hot-shot defense attorney , kevin lomax (...",neg


In [12]:
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['target'], test_size=0.2, random_state=42)

In [13]:
print(X_train.head())
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

968    while watching loser , it occurred to me that ...
240    georges polti once wrote a paper called " the ...
819    sylvester stallone has made some crap films in...
692    attention moviegoers : you are about to enter ...
420    plot : something about a bunch of kids going i...
Name: review, dtype: object
(1600,)
(400,)
(1600,)
(400,)


In [14]:
vect = CountVectorizer(stop_words='english')  # instantiate the model
X_train_vect = vect.fit_transform(X_train)

In [15]:
X_train_vect

<1600x35944 sparse matrix of type '<type 'numpy.int64'>'
	with 390595 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()   # define the logistic regression
logreg.fit(X_train_vect, y_train)   # we fit it
# outcome_pred_class_log = logreg.predict(X_test)   # we make (class) predictions based on the data that we get

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
X_test_vect = vect.transform(X_test)   # this is an important step

In [18]:
logreg.score(X_test_vect, y_test)

0.83250000000000002

In [19]:
y_pred=logreg.predict(X_test_vect)
print(y_pred.shape)
print(y_pred)

(400,)
['pos' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos'
 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos'
 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'pos'
 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos' 'neg' 'neg'
 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg'
 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg'
 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'pos'
 'neg' 'pos' 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'pos'
 'pos' 'pos' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos'
 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos'
 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'neg'
 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' '

In [20]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(y_test, y_pred)   # What is the confusion matrix telling us here? Instructor explained but
                                   # I seemed to have missed it

array([[164,  35],
       [ 32, 169]])

In [21]:
print(y_pred)
print(y_test)
print(X_test_vect)

['pos' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos'
 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos'
 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'pos'
 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos' 'neg' 'neg'
 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg'
 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg'
 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'pos'
 'neg' 'pos' 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'pos'
 'pos' 'pos' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos'
 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos'
 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'neg'
 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'p

In [22]:
# The word 'movie' adds noise because it occurs in all sentences in the review and is therefore useless.
vect = TfidfVectorizer(stop_words='english')  # instantiate the model
X_train_vect = vect.fit_transform(X_train)
print(X_train_vect.shape)
print(X_train_vect)

(1600, 35944)
  (0, 13169)	0.0410632747207
  (0, 3997)	0.0328509013804
  (0, 7413)	0.0386364198572
  (0, 7776)	0.0416695334775
  (0, 25002)	0.0365302536074
  (0, 13350)	0.033408605374
  (0, 15525)	0.053547502712
  (0, 4083)	0.053547502712
  (0, 10886)	0.0471631734538
  (0, 14283)	0.0637076695209
  (0, 15517)	0.0326744975201
  (0, 9552)	0.0487179440674
  (0, 21275)	0.0487179440674
  (0, 16827)	0.0286764969936
  (0, 35377)	0.029080527868
  (0, 12789)	0.053547502712
  (0, 8420)	0.0344504797388
  (0, 19738)	0.0228859477727
  (0, 14341)	0.0202193617644
  (0, 18262)	0.0423336148093
  (0, 7815)	0.0430677226632
  (0, 2733)	0.0336048882216
  (0, 31894)	0.0405055707271
  (0, 718)	0.0471631734538
  (0, 28225)	0.0218179997652
  :	:
  (1599, 14341)	0.0231149680283
  (1599, 23939)	0.0176607068953
  (1599, 17848)	0.0314867107273
  (1599, 9490)	0.0296897872448
  (1599, 33124)	0.0227247477411
  (1599, 32110)	0.0351210236127
  (1599, 15444)	0.0695457137845
  (1599, 29230)	0.0399153421782
  (1599, 15344)

In [23]:
from sklearn.feature_selection import SelectKBest

In [24]:
kbest = SelectKBest(k=2000)
print(kbest)

SelectKBest(k=2000, score_func=<function f_classif at 0x11725df50>)


In [25]:
X_train_best = kbest.fit_transform(X_train_vect, y_train)
print(X_train_best.shape)
print(X_train_best)

(1600, 2000)
  (0, 1946)	0.0179293172722
  (0, 1827)	0.0193954605291
  (0, 643)	0.0241477439235
  (0, 258)	0.022800976228
  (0, 321)	0.135227444851
  (0, 1902)	0.0308464534379
  (0, 60)	0.0378614463125
  (0, 1568)	0.0410632747207
  (0, 133)	0.0149246554176
  (0, 1035)	0.0270514345596
  (0, 676)	0.0275049894885
  (0, 1866)	0.019526188197
  (0, 1860)	0.0390588267783
  (0, 31)	0.0327155550084
  (0, 1518)	0.032125271724
  (0, 1037)	0.0340148641308
  (0, 1160)	0.00874229915048
  (0, 144)	0.0268711363183
  (0, 661)	0.0131524238845
  (0, 784)	0.0725696389225
  (0, 1268)	0.265063690659
  (0, 1596)	0.0193696067692
  (0, 1796)	0.0221177268503
  (0, 735)	0.0148838694652
  (0, 663)	0.0412798937154
  :	:
  (1599, 245)	0.0334865323579
  (1599, 1284)	0.0216294445495
  (1599, 686)	0.0280888317761
  (1599, 644)	0.0446450507338
  (1599, 460)	0.0239579111362
  (1599, 805)	0.0463063564139
  (1599, 221)	0.0424863628227
  (1599, 1112)	0.0304637440019
  (1599, 1361)	0.034124104905
  (1599, 1039)	0.0240698390

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
logreg = LogisticRegression()   # define the logistic regression
logreg.fit(X_train_best, y_train);   # we fit it

In [28]:
X_test_vect = vect.transform(X_test)   # this is an important step
X_test_best = kbest.transform(X_test_vect)

In [29]:
logreg.score(X_test_best, y_test)

0.8075

In [30]:
y_pred = logreg.predict(X_test_best)

In [31]:
from sklearn.metrics import confusion_matrix, classification_report

In [32]:
confusion_matrix(y_test, y_pred)

array([[157,  42],
       [ 35, 166]])