In [45]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

In [10]:
# Read the amazon_reviews.csv file hosted on GitHub into a DataFrame.
amazon_df = pd.read_csv(
    'https://raw.githubusercontent.com/nealcaren/CSSS-CABD/master/files/amazon_reviews.csv'
)

In [11]:
# Split review text (features) and the 'Positive Review' label into train and
# test portions, with 80% of the rows going to training.
# A fixed random_state makes the shuffle repeatable, so the scores computed
# from this split can be reproduced after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(amazon_df['Text'],
                                                    amazon_df['Positive Review'],
                                                    train_size=.8,
                                                    random_state=42)


In [12]:
# Big Vocab: vectorizer limited to max_features=5000
big_vector = CountVectorizer(max_features=5000)

# Learn the vocabulary and build the training TF matrix in a single pass.
# fit_transform gives the same result as fit() followed by transform() on the
# same data, without tokenizing the training text twice.
tf_train = big_vector.fit_transform(X_train)


In [18]:
# Logistic regression with default settings, trained on the training TF matrix.
# The fit call is left as the last expression so the cell displays the model.
lr = LogisticRegression()
lr.fit(X=tf_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
# test accuracy of model on training data

# Predict once and reuse the predictions for both metrics.
train_pred = lr.predict(tf_train)

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(accuracy_score(y_train, train_pred))
confusion_matrix(y_train, train_pred)

0.864425


array([[10910,  3529],
       [ 1894, 23667]])

In [21]:
# test accuracy of model on test data

# Build the TF matrix for the test data using the vectorizer fitted on the
# training data, so train and test share the same vocabulary.
tf_test = big_vector.transform(X_test)

# Predicted values come from the lr model trained on the training data;
# predict once and reuse the predictions for both metrics.
test_pred = lr.predict(tf_test)

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(accuracy_score(y_test, test_pred))

confusion_matrix(y_test, test_pred)

0.7934


array([[2405, 1223],
       [ 843, 5529]])

In [22]:
# Small Vocab: min_df threshold of .01 plus the built-in English stop-word list
small_vector = CountVectorizer(min_df=.01,
                               stop_words='english')

# Learn the vocabulary and build the training TF matrix in a single pass.
# fit_transform gives the same result as fit() followed by transform() on the
# same data, without tokenizing the training text twice.
tf_train = small_vector.fit_transform(X_train)


In [23]:
# Fresh default logistic regression, trained on the current training TF matrix.
# The fit call is left as the last expression so the cell displays the model.
lr = LogisticRegression()
lr.fit(X=tf_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# test accuracy of model on training data

# Predict once and reuse the predictions for both metrics.
train_pred = lr.predict(tf_train)

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(accuracy_score(y_train, train_pred))
confusion_matrix(y_train, train_pred)

0.783175


array([[ 8490,  5949],
       [ 2724, 22837]])

In [26]:
# test accuracy of model on test data

# Build the TF matrix for the test data using the vectorizer fitted on the
# training data, so train and test share the same vocabulary.
tf_test = small_vector.transform(X_test)

# Predicted values come from the lr model trained on the training data;
# predict once and reuse the predictions for both metrics.
test_pred = lr.predict(tf_test)

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(accuracy_score(y_test, test_pred))

confusion_matrix(y_test, test_pred)

0.762


array([[2018, 1610],
       [ 770, 5602]])

In [36]:
# Vectorizer -> logistic regression pipeline. make_pipeline names each step
# after its lowercased class name, which is what the "countvectorizer__"
# prefix in the grid keys refers to.
pipe = make_pipeline(CountVectorizer(), LogisticRegression())

# Candidate vectorizer settings: four min_df thresholds, each tried with the
# English stop-word list and with an empty stop-word list.
param_grid = {
    "countvectorizer__min_df": [.005, .01, .02, .05],
    "countvectorizer__stop_words": ['english', []],
}

# 3-fold cross-validated search over every combination in param_grid.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3)
grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'countvectorizer__stop_words': ['english', []], 'countvectorizer__min_df': [0.005, 0.01, 0.02, 0.05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [42]:
# Tabulate the grid-search results and show score plus the two searched
# parameters, sorted by mean test score (lowest first).
results = pd.DataFrame(grid.cv_results_)
results[[
    'mean_test_score',
    'param_countvectorizer__min_df',
    'param_countvectorizer__stop_words',
]].sort_values(by='mean_test_score')

Unnamed: 0,mean_test_score,param_countvectorizer__min_df,param_countvectorizer__stop_words
6,0.7064,0.05,english
4,0.75265,0.02,english
7,0.75375,0.05,[]
2,0.770775,0.01,english
5,0.779125,0.02,[]
0,0.783325,0.005,english
3,0.790375,0.01,[]
1,0.79695,0.005,[]


In [None]:
# Vectorizer -> k-nearest-neighbours pipeline; the grid keys address each step
# through its "<step name>__<parameter>" form.
pipe = make_pipeline(CountVectorizer(), KNeighborsClassifier())

# Candidate settings: four min_df thresholds crossed with five neighbour counts.
param_grid = {
    "countvectorizer__min_df": [.005, .01, .02, .05],
    "kneighborsclassifier__n_neighbors": [1, 2, 3, 5, 10],
}

# 3-fold cross-validated search over every combination in param_grid.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3)
grid.fit(X_train, y_train)

In [None]:
# Tabulate the grid-search results, sorted by mean test score (lowest first).
results = pd.DataFrame(grid.cv_results_)

# cv_results_ stores each searched parameter in a column named
# 'param_<grid key>', so the raw param_grid keys must be prefixed before they
# can be used to select columns. Building the list with a comprehension also
# avoids `list + dict.keys()`, which raises TypeError on Python 3 where
# keys() returns a view rather than a list.
vs_of_interest = ['mean_test_score'] + ['param_' + name for name in param_grid]
results[vs_of_interest].sort_values('mean_test_score')