In [27]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_score, train_test_split
#data handling/modeling
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import scipy.stats as stats

# visualization
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
tweets = ["These debates are boring", "we want more debates", "debates are useful"]
target = [0, 1, 1]

In [10]:
vect = CountVectorizer(stop_words="english")
tweet_X = vect.fit_transform(tweets)

In [11]:
tweet_X.toarray()

array([[1, 1, 0, 0],
       [0, 1, 0, 1],
       [0, 1, 1, 0]], dtype=int64)

In [12]:
vect.get_feature_names()

['boring', 'debates', 'useful', 'want']

In [13]:
pd.DataFrame(tweet_X.toarray(), columns=vect.get_feature_names())

Unnamed: 0,boring,debates,useful,want
0,1,1,0,0
1,0,1,0,1
2,0,1,1,0


In [3]:
from glob import glob

In [4]:


def load_data(path, target):
    reviews = []
    for file in glob(path):
        review = open(file).read()
        reviews.append({
                "target": target,
                "review": review
            })
    return reviews

In [5]:
reviews = load_data("../data/review_polarity/txt_sentoken/neg/*", "neg") + \
    load_data("../data/review_polarity/txt_sentoken/pos/*", "pos")

In [6]:
data = pd.DataFrame(reviews)

In [7]:
data.sample(10)

Unnamed: 0,review,target
483,movies about teenagers and teenage culture rar...,neg
935,the plot of big momma's house is martin lawren...,neg
787,my friend here in film school just made a two ...,neg
360,capsule : dumb dud of an entry in the body hea...,neg
761,weighed down by tired plot lines and spielberg...,neg
62,there are two things the american film industr...,neg
1423,seen at the 21st portland international film f...,pos
45,when it comes to the average teenage romantic ...,neg
1036,dora ( fernanda montenegro ) sits behind a mak...,pos
659,i came to an epiphany while watching the bache...,neg


In [8]:
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['target'], test_size=0.2, random_state=42)

In [9]:
X_train.head()

968    while watching loser , it occurred to me that ...
240    georges polti once wrote a paper called " the ...
819    sylvester stallone has made some crap films in...
692    attention moviegoers : you are about to enter ...
420    plot : something about a bunch of kids going i...
Name: review, dtype: object

In [13]:
vect = CountVectorizer(stop_words='english')  # instantiate the model
X_train_vect = vect.fit_transform(X_train)

In [14]:
X_train_vect

<1600x35944 sparse matrix of type '<type 'numpy.int64'>'
	with 390595 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()   # define the logistic regression
logreg.fit(X_train_vect, y_train)   # we fit it
# outcome_pred_class_log = logreg.predict(X_test)   # we make (class) predictions based on the data that we get

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
X_test_vect = vect.transform(X_test)   # this is an import step

In [19]:
logreg.score(X_test_vect, y_test)

0.83250000000000002

In [20]:
y_pred=logreg.predict(X_test_vect)

In [21]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(y_test, y_pred)   # What is the confusion matrix telling us here? Instructor explained but
                                   # I seemed to have missed it

array([[164,  35],
       [ 32, 169]])

In [26]:
print(y_pred)
print(y_test)
print(X_test_vect)

['pos' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos'
 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos'
 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'pos'
 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos' 'neg' 'neg'
 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg'
 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg'
 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'pos'
 'neg' 'pos' 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'pos'
 'pos' 'pos' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos'
 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos'
 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'neg'
 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'p

In [None]:
# The word 'movie' adds noise because it occurs in all sentences in the review and is therefore useless.
vect = TdidfVectorizer(stop_words='english')  # instantiate the model
X_train_vect = vect.fit_transform(X_train)

In [29]:
from sklearn.feature_selection import SelectKBest

In [30]:
kbest = SelectKBest(k=2000)

In [32]:
X_train_best = kbest.fit_transform(X_train_vect, y_train)

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
logreg = LogisticRegression()   # define the logistic regression
logreg.fit(X_train_best, y_train);   # we fit it

In [36]:
X_test_vect = vect.transform(X_test)   # this is an import step
X_test_best = kbest.transform(X_test_vect)

In [38]:
logreg.score(X_test_best, y_test)

0.81000000000000005

In [39]:
y_pred = logreg.predict(X_test_best)

In [40]:
from sklearn.metrics import confusion_matrix, classification_report

In [41]:
confusion_matrix(y_test, y_pred)

array([[160,  39],
       [ 37, 164]])