In [3]:
import graphlab as gl
import matplotlib.pyplot as plt
%matplotlib inline
from graphlab.toolkits.feature_engineering import *

In [2]:
reviews = gl.load_sframe("yelp_dataset_challenge_academic_dataset/reviews/")

## Feature Generation

- Lets start with a simple Bag of words feature set 
- Could try TFIDF in the second pass

In [7]:
word_counter = WordCounter('text', to_lower=True, output_column_prefix='bow')
bow_reviews = word_counter.fit_transform(reviews)

In [9]:
# Selecting only the relevant columns
bow_reviews = bow_reviews['review_id', 'stars', 'bow.text']

## Generating the class labels

In [10]:

# Creating the sentiment 'class' 
bow_reviews['class'] = bow_reviews['stars'].apply(lambda x: 1 if x >= 3 else 0)

## Choosing Non-Burger reviews
- For a start we will train our model on reviews which are not related to burgers
- Only about 67000 out of 2.2 million reviews mention 'burger' explicitly
    - To do this well though it might be better to stem the words and then look for burger, as this will not match 'Burgers' o

In [16]:
# Selecting reviews with burgers in them
br = bow_reviews[bow_reviews['bow.text'].apply(lambda x: 'burger' in x)]

In [18]:
# Choosing only reviews which don't have the word 'burger' in them 
bow_reviews = bow_reviews[bow_reviews['bow.text'].apply(lambda x: 'burger' not in x)]

## Doing a K-Fold CV split

In [None]:
bow_reviews = gl.cross_validation.shuffle(bow_reviews, random_seed=100)

In [35]:
# 3- Fold cross validation
bow_reviews_kfold = gl.toolkits.cross_validation.KFold(bow_reviews, 3)

In [81]:
# First trying out a simple random split to understand model tuning 
train_data, test_data = bow_reviews.random_split(0.7)


In [82]:
test_data, validation_data = test_data.random_split(0.5)

## Developing Preliminary model

### Todo's
- Trying out custom evaluator
- Trying out step sizes
- Trying out stopping criteria
- Trying out l1_penalty

In [84]:
params = {'target':'class', 'features':[['bow.text']], 'l1_penalty':[0.001, 0.005, 0.01, 0.05, 0.1, 0.2], 'l2_penalty':0,
          'step_size':[0.01, 0.05, 0.1, 0.5, 1, 5, 10], 
          'convergence_threshold':[0.0001, 0.0005, 0.001, 0.005, 0.01], 
          'max_iterations':10,
          'class_weights':'auto'}

In [None]:
model_search = gl.toolkits.model_parameter_search.random_search.create((train_data, validation_data),
                                                              gl.logistic_classifier.create, 
                                                                     model_parameters=params,
                                                                     return_model=True)

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Jun-06-2016-00-46-5200000' ready for execution


In [86]:
model_search.get_best_params()


{'class_weights': 'auto',
 'convergence_threshold': 0.0001,
 'features': ['bow.text'],
 'l1_penalty': 0.005,
 'l2_penalty': 0,
 'max_iterations': 10,
 'step_size': 1,
 'target': 'class'}

In [87]:
model_search.get_results()

model_id,class_weights,convergence_threshold,features,l1_penalty,l2_penalty,max_iterations,step_size
9,auto,0.005,[bow.text],0.005,0,10,0.01
8,auto,0.0005,[bow.text],0.001,0,10,5.0
1,auto,0.0005,[bow.text],0.2,0,10,0.5
0,auto,0.001,[bow.text],0.001,0,10,0.05
3,auto,0.0005,[bow.text],0.01,0,10,10.0
2,auto,0.0001,[bow.text],0.005,0,10,1.0
5,auto,0.005,[bow.text],0.005,0,10,0.05
4,auto,0.005,[bow.text],0.001,0,10,0.01
7,auto,0.001,[bow.text],0.05,0,10,0.1
6,auto,0.0001,[bow.text],0.005,0,10,0.05

target,training_accuracy,validation_accuracy
class,0.947676360486,0.910730530855
class,0.949428225614,0.911787363686
class,0.946619784915,0.909987263809
class,0.947508531972,0.910540842911
class,0.947871471467,0.910877635791
class,0.94959109368,0.911857044972
class,0.94750770523,0.910544714094
class,0.947676360486,0.910730530855
class,0.946815722638,0.910060816278
class,0.94750770523,0.910544714094


In [88]:
models = model_search.get_models()

In [111]:
# Zero coefficients 
models[1].coefficients[models[1].coefficients['value']==0]

name,index,class,value,stderr
bow.text,tr,1,0.0,
bow.text,probaly,1,0.0,
bow.text,eindruck,1,0.0,
bow.text,hdoas.,1,0.0,
bow.text,"marriott,",1,0.0,
bow.text,mccalls,1,0.0,
bow.text,sharable.,1,0.0,
bow.text,*trying*,1,0.0,
bow.text,dobra,1,0.0,
bow.text,"standards""",1,0.0,


In [124]:
model_search.get_models

<bound method ModelSearchJob.get_models of Model parameter search
-------------------------
Strategy            : random
Num. combinations   : 10

Current status
-------------------------
Completed           : 10
Running             : 0
Pending             : 0
Failed              : 0
Canceled            : 0

Jobs
-------------------------
Model-Parameter-Search-Jun-06-2016-00-46-5200000-a0e17

Help
-------------------------
Get status          : self.get_status()
Get exceptions      : self.get_metrics()
Get a single job    : self.jobs[i]>

In [121]:
params = {'target':'class', 'features':[['bow.text']], 'l1_penalty':[0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5], 
          'l2_penalty':[0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5],
          'step_size':[0.01, 0.05, 0.1, 0.5, 1, 5, 10], 
          'convergence_threshold':[0.0001, 0.0005, 0.001, 0.005, 0.01], 
          'max_iterations':20,
          'class_weights':'auto'}

In [None]:
model_search_2 = gl.toolkits.model_parameter_search.random_search.create((train_data, validation_data),
                                                              gl.logistic_classifier.create, 
                                                                     model_parameters=params,
                                                                     return_model=True)

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Jun-06-2016-03-52-5500000' ready for execution


In [118]:
model_search_2.get_best_params()

{'class_weights': 'auto',
 'convergence_threshold': 0.005,
 'features': ['bow.text'],
 'l1_penalty': 0.1,
 'l2_penalty': 0,
 'max_iterations': 10,
 'step_size': 1,
 'target': 'class'}

In [119]:
model_search_2.get_results()

model_id,class_weights,convergence_threshold,features,l1_penalty,l2_penalty,max_iterations,step_size
9,auto,0.0001,[bow.text],0.001,0,10,0.1
8,auto,0.001,[bow.text],0.1,0,10,5.0
1,auto,0.005,[bow.text],0.1,0,10,0.05
0,auto,0.001,[bow.text],0.01,0,10,0.1
3,auto,0.0001,[bow.text],0.2,0,10,10.0
2,auto,0.001,[bow.text],0.01,0,10,0.01
5,auto,0.005,[bow.text],0.1,0,10,1.0
4,auto,0.001,[bow.text],0.05,0,10,10.0
7,auto,0.001,[bow.text],0.05,0,10,0.05
6,auto,0.0005,[bow.text],0.001,0,10,0.1

target,training_accuracy,validation_accuracy
class,0.946823163311,0.910060816278
class,0.949420784941,0.911771878956
class,0.94750191804,0.910552456459
class,0.946821509828,0.910060816278
class,0.94785576338,0.910885378156
class,0.947675533744,0.910730530855
class,0.949579519299,0.911860916154
class,0.94786733776,0.910877635791
class,0.947503571523,0.910556327641
class,0.946823163311,0.910060816278


In [120]:
model_search_2.

task_name,status,start_time,run_time,exception,exception_message,exception_traceback
_train_test_model-0-0,Completed,2016-06-06 04:01:32,273.106339931,,,
_train_test_model-0-1,Completed,2016-06-06 04:06:07,246.978356123,,,
_train_test_model-0-2,Completed,2016-06-06 04:10:16,234.822746038,,,
_train_test_model-0-3,Completed,2016-06-06 04:14:13,279.109241009,,,
_train_test_model-0-4,Completed,2016-06-06 04:18:54,279.047322035,,,
_train_test_model-0-5,Completed,2016-06-06 04:23:35,262.02560997,,,
_train_test_model-0-6,Completed,2016-06-06 04:27:59,247.518553019,,,
_train_test_model-0-7,Completed,2016-06-06 04:32:09,240.823469877,,,
_train_test_model-0-8,Completed,2016-06-06 04:36:12,272.415170908,,,
_train_test_model-0-9,Completed,2016-06-06 04:40:46,247.941239834,,,

job_name
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...
Model-Parameter-Search-Ju n-06-2016-03-52-55000 ...


In [None]:
model_search = gl.toolkits.cross_validation.cross_val_score(bow_reviews_kfold, 
                                                            gl.logistic_classifier.create,
                                                           params)

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.job: Creating a LocalAsync environment called 'async'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Cross-Validation-Jun-05-2016-22-55-30-105918--5508351' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Cross-Validation-Jun-05-2016-22-55-30-105918--5508351' scheduled.


In [43]:
model_search.show()

Canvas is accessible via web browser at the URL: http://localhost:62354/index.html
Opening Canvas in default web browser.


In [48]:
model_search.get_parameters()['_train_test_model-0-1']

{'evaluator': <function graphlab.toolkits.model_parameter_search._model_parameter_search_evaluators.default_evaluator>,
 'folds': <graphlab.toolkits.cross_validation.KFold at 0x128da8190>,
 'metadata': {'fold_id': 1, 'model_id': 1},
 'model_factory': <function graphlab.toolkits.classifier.logistic_classifier.create>,
 'model_parameters': {'features': ['bow.text'],
  'l1_penalty': 0.01,
  'l2_penalty': 0,
  'target': 'class'},
 'return_model': True}

AttributeError: 'LocalAsynchronousJob' object has no attribute 'get_best_params'