In [8]:
import pandas as pd

import boto3
import io

from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer



### Settings

In [4]:
# Name of the S3 bucket we read from and write to
# (the code visible in this notebook only reads from it).
bucket = "movie-rating-sagemaker-mark"

# Object key of the preprocessed input CSV inside the bucket.
# The key has no prefix, i.e. the file sits at the bucket root.
prepped_csv_file = "movie_data.csv"

### Loading data

In [7]:
# Download the preprocessed CSV object from S3 and parse it into a DataFrame.
s3 = boto3.client('s3')
file_obj = s3.get_object(Bucket=bucket, Key=prepped_csv_file)
file_body = file_obj['Body']

# Read the whole object into an in-memory buffer and hand that to pandas.
csv_buffer = io.BytesIO(file_body.read())
df = pd.read_csv(csv_buffer, encoding='latin1')

# Drop the references to the stream and the temporary buffer; only `df` is needed from here on.
del file_body, csv_buffer

### Text-cleaning functions

In [9]:
def preprocessor(text):
    """Normalise a raw review string for bag-of-words vectorisation.

    Steps, in order:
      1. remove anything that looks like an HTML/XML tag (``<...>``),
      2. lower-case the text,
      3. collapse every run of non-word characters into a single space.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces produced by step 3 are
        kept (e.g. ``'Hi!'`` becomes ``'hi '``); splitting on whitespace
        afterwards ignores them.
    """
    # Raw strings: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a DeprecationWarning/SyntaxWarning on modern Python.
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\W+', ' ', text.lower())
    return text

# Clean every review in place: strip markup, lower-case, collapse punctuation.
df['review'] = df['review'].map(preprocessor)

In [10]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

### Create train and test partitions

In [11]:
# Number of leading rows used for training; every remaining row is the test set.
n_train = 40000

# NOTE: `df.loc[:40000]` is a *label* slice and includes both endpoints, so
# combined with `df.loc[40000:]` the row labelled 40000 would end up in both
# partitions (train/test leakage). Slicing the underlying arrays by position
# is half-open, which gives two disjoint partitions.
reviews = df['review'].values
sentiments = df['sentiment'].values

X_train, X_test = reviews[:n_train], reviews[n_train:]
Y_train, Y_test = sentiments[:n_train], sentiments[n_train:]

### Train logistic regression model

In [12]:
# The reviews were already cleaned and lower-cased by `preprocessor`, so the
# vectorizer's own accent stripping / lower-casing / preprocessing is disabled.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Settings shared by both grids: unigrams, no stop-word removal, whitespace tokenizer.
vect_params = {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [None],
               'vect__tokenizer': [tokenizer]}

# L2-regularised logistic regression over three regularisation strengths.
clf_params = {'clf__penalty': ['l2'],
              'clf__C': [1.0, 10.0, 100.0]}

param_grid = [
    # Grid 1: vectorizer defaults for IDF weighting and normalisation.
    {**vect_params, **clf_params},
    # Grid 2: IDF weighting and normalisation switched off.
    {**vect_params,
     'vect__use_idf': [False],
     'vect__norm': [None],
     **clf_params},
]

lr_tfidf = Pipeline(steps=[('vect', tfidf),
                           ('clf', LogisticRegression(random_state=0))])

# 5-fold cross-validated search over all candidates, using every available core.
gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)
gs_lr_tfidf.fit(X_train, Y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 25.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [None], 'vect__tokenizer': [<function tokenizer at 0x7f91c0171ea0>], 'clf__penalty': ['l2'], 'clf__C': [1.0, 10.0, 100.0]}, {'vect__ngram_range': [(1, 1)], 'vect__stop_words': [None], 'vect__tokenizer': [<function tokenizer at 0x7f91c0171ea0>], 'vect__use_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, sco

### Output results and predict test set

In [14]:
# Summarise the grid search: winning parameter combination and its mean CV accuracy.
best_params = gs_lr_tfidf.best_params_
cv_accuracy = gs_lr_tfidf.best_score_
print('Best parameter set: %s ' % best_params)
print('CV Accuracy: %.3f' % cv_accuracy)

# Score the best pipeline found by the search on the held-out test partition.
clf = gs_lr_tfidf.best_estimator_
test_accuracy = clf.score(X_test, Y_test)
print('Test Accuracy: %.3f' % test_accuracy)

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7f91c0171ea0>} 
CV Accuracy: 0.902
Test Accuracy: 0.903
