In [1]:
import graphlab as gl
import matplotlib.pyplot as plt
%matplotlib inline
from graphlab.toolkits.feature_engineering import *

This non-commercial license of GraphLab Create is assigned to mkkedia@dons.usfca.edu and will expire on May 31, 2017. For commercial licensing options, visit https://dato.com/buy/.


[INFO] graphlab.cython.cy_server: GraphLab Create v1.10 started. Logging: /tmp/graphlab_server_1465340887.log


In [3]:
reviews = gl.load_sframe("yelp_dataset_challenge_academic_dataset/reviews/")

## Feature Generation

- Lets start with a simple Bag of words feature set 
- Could try TFIDF in the second pass

In [4]:
def generate_features(featureGenerator, reviews):
    """
    Generate features based on a feature generator from graphlab 
    """
    feat_reviews = featureGenerator.fit_transform(reviews)
    # Selecting only the relevant columns
    return feat_reviews['review_id', 'stars', 'bow.text', 'business_id']

    

In [5]:
word_counter = WordCounter('text', to_lower=True, output_column_prefix='bow')
bow_reviews = generate_features(word_counter, reviews) 

## Generating the class labels

In [6]:
# Creating the sentiment 'class' 
bow_reviews['class'] = bow_reviews['stars'].apply(lambda x: 1 if x >= 3 else 0)

## Choosing Non-Burger reviews
- For a start we will train our model on reviews which are not related to burgers
- Only about 67000 out of 2.2 million reviews mention 'burger' explicitly
    - To do this well though it might be better to stem the words and then look for burger, as this will not match 'Burgers' o

In [7]:
# Selecting reviews with burgers in them
br = bow_reviews[bow_reviews['bow.text'].apply(lambda x: 'burger' in x)]

In [8]:
# Choosing only reviews which don't have the word 'burger' in them 
review_dataset = bow_reviews[bow_reviews['bow.text'].apply(lambda x: 'burger' not in x)]

## Doing a Test Train CV split

In [9]:
review_dataset = gl.cross_validation.shuffle(review_dataset, random_seed=100)

In [35]:
# 3- Fold cross validation
#bow_reviews_kfold = gl.toolkits.cross_validation.KFold(bow_reviews, 3)


In [10]:
# First trying out a simple random split to understand model tuning 
train_data, test_data = review_dataset.random_split(0.7)


In [11]:
test_data, validation_data = test_data.random_split(0.5)

## Trying out the standard Logistic regression model 
- This is the model developed by sentiment analysis wrapper in graphlab create


In [12]:
review_dataset.head(1)

review_id,stars,bow.text,business_id,class
a2XM5TEgVjOH0kMmMZGXxQ,5,"{'and': 2, 'frequenting': 1, 'love': 1, 'often' ...",CWoXwnoxdFihXB1KhH9goA,1


In [14]:
# Trying out a simple model based on the model created by the inbuilt sentiment analysis function
simple_model = gl.logistic_classifier.create(train_data, 
                              target='class', 
                              features=['bow.text'], 
                              l2_penalty=0.2, 
                              max_iterations=20, 
                              class_weights='auto', 
                              validation_set=validation_data)

In [None]:
## Use model to generate test accuracy
## Find top 5 burger places 

## Test Set Evaluation 

In [14]:
def test_set_evaluation(model, test_data):
    """
    @param model: a trained model
    @param test_data: An SFrame with the same columns as the training set of the model
    
    return: Returns a tuple with (precision, recall, f1_score, auc, confusion matrix)
    """
    test_predictions = model.classify(test_data)
    cm = gl.evaluation.confusion_matrix(test_data['class'], test_predictions['class'])
    precision = gl.evaluation.precision(test_data['class'], test_predictions['class'])
    recall = gl.evaluation.recall(test_data['class'], test_predictions['class'])
    f1_score = gl.evaluation.f1_score(test_data['class'], test_predictions['class'])
    auc = gl.evaluation.auc(test_data['class'], test_predictions['class'])
    
    return (precision, recall, f1_score, auc, cm)
    

In [18]:
(precision, recall, f1_score, auc, cm) = test_set_evaluation(simple_model, test_data)

In [19]:
precision, recall, f1_score, auc

(0.9457232223643568,
 0.9397601100625884,
 0.9427322366131913,
 0.7643023535193608)

In [20]:
# Confusion matrix
cm

target_label,predicted_label,count
0,1,13917
0,0,51741
1,0,15544
1,1,242491


## Choosing only meaningful businesses for estimating burger review sentiment

In [22]:
#br = bow_reviews.join(br, on='review_id')
br_count = br.groupby('business_id',gl.aggregate.COUNT('review_id'))
br_count['Count'].sketch_summary()
# Considering only businesses with atleast 30 reviews
len(br_ic)

In [23]:
br_ic = br_count[br_count['Count']>30]

In [24]:
len(br_ic)

386

In [45]:
br_ic.head(1)

business_id,Count
0Zec6RiFV6h6TbOFb7xu3A,62


In [26]:
br_ic.save("yelp_dataset_challenge_academic_dataset/burger_businesses")

## Predicting sentiment on test set

In [74]:
def get_avg_business_review_sentiment_for_burgers(model,br_test_set):
    """
    Construct a dataframe which has avg review sentiment for reviews mentioning burger with the 
    class (positive:1, negative:0) of the reviews and the avg sentiment
    
    
    """
    # Make a test set
    # Do predictions
    br_predictions = model.classify(br_test_set['review_id', 'stars', 'bow.text', 'business_id', 'class'])
    # Join predictions back to test set 
    br_predictions['review_id'] = br_test_set['review_id']
    br_predictions = br_predictions.join(br_test_set, on='review_id')
    
    avg_business_ratings = br_predictions.groupby(['business_id','class'],
                                                  {'count':gl.aggregate.COUNT('probability'),
                                                   'avg':gl.aggregate.AVG('probability')})

    return avg_business_ratings.to_dataframe()
    

In [42]:
# Constructing the smaller set of businesses for sentiment will be rated 
br_test_set = br_ic.join(br, on='business_id')
# Avg Business ratings as a data frame for reviews 
abr = get_avg_business_review_sentiment_for_burgers(simple_model, br_test_set)

In [44]:
abr.head()

Unnamed: 0,business_id,class,count,avg
0,HBRZ8oaROD3wLCr-8ssRxQ,0,10,0.794774
1,svoahtxi05CncY954Vu-uw,0,20,0.902001
2,PzYjXRKgDwjVg1YicWuYLg,0,26,0.946388
3,DZUuB-afy0GEzlPdDHvuFg,0,2,0.988462
4,qQNjHyoFL3_0xugLf8sM2A,0,4,0.809672


In [None]:
businesses = gl.load_sframe('yelp_dataset_challenge_academic_dataset/business/')


## Picking the top 5 burger places

In [86]:
def get_top_k_places(abr, k, businesses):
    """
    Pick out the top 5 places from the list, choosing only the avg of positive reviews 
    """
    top_k_places = abr[abr['class']==1].sort_values('avg', ascending=False)[0:k]
    return gl.SFrame(top_k_places).join(businesses, on='business_id').to_dataframe().sort_values('avg', ascending=False)


In [47]:
get_top_k_places(abr, 5, businesses)

business_id,class,count,avg,attributes,categories
NGJDjdiDJHmN2xxU7KauuA,1,24,0.998111163774,"{'Accepts Credit Cards': 'true', 'Good for Kids': ...","[Hotels & Travel, Arts & Entertainment, Casinos, ..."
lzl3cfgVEGMtezF8oRqXDA,1,30,0.998426230521,"{'Alcohol': 'full_bar', 'Noise Level': 'avera ...","[Pubs, Bars, American (Traditional), Nightl ..."
tteHjtbjQfWnJgFe4jPoMA,1,27,0.999548307098,"{'Take-out': 'true', 'Accepts Credit Cards': ...","[American (New), Gluten- Free, Restaurants] ..."
xY1sPHTA2RGVFlh5tZhs9g,1,38,0.99888256404,"{'Alcohol': 'full_bar', 'Noise Level': 'loud', ...","[Arts & Entertainment, American (New), ..."
VgLiSW1iGkpzIEXOgvUBEw,1,25,0.998912402513,"{'Take-out': 'true', 'Accepts Credit Cards': ...","[Food, Juice Bars & Smoothies, Breakfast & ..."

city,full_address,hours,latitude,longitude
Las Vegas,Flamingo Las Vegas Hotel & Casino\n3555 Las Vegas ...,"{'Monday': {'close': '00:00', 'open': ...",36.115673,-115.172557
Las Vegas,"8168 Las Vegas Blvd S\nSoutheast\nLas Vegas, ...","{'Monday': {'close': '00:00', 'open': ...",36.0407804215,-115.171651706
Madison,"2611 Monroe St\nDudgeon- Monroe\nMadison, WI 5 ...","{'Monday': {'close': '22:00', 'open': ...",43.0574842,-89.4280617
Scottsdale,"7135 E Camelback Rd\nSte 125\nScottsdale, AZ 8 ...","{'Monday': {'close': '23:00', 'open': ...",33.5020064299,-111.928296379
Las Vegas,8680 W Warm Springs Rd\nSte ...,"{'Monday': {'close': '21:00', 'open': ...",36.0562823585,-115.280428139

name,neighborhoods,open,review_count,stars,state,type,acity
Flamingo Las Vegas Hotel & Casino ...,[The Strip],True,2024,2.5,NV,business,Las Vegas
Steiner's A Nevada Style Pub ...,[Southeast],True,200,4.0,NV,business,Las Vegas
Jacs Dining and Tap House,[Dudgeon-Monroe],True,165,4.0,WI,business,Madison
Culinary Dropout,[],True,871,3.5,AZ,business,Phoenix
SkinnyFATS,"[Southwest, Spring Valley] ...",True,520,4.5,NV,business,Las Vegas


### Notes
- Visual inspection of businesses
    - Mirage Hotel is a casino and there are a lot of reviews which are 'positive' but also mention burgers
    - Steiner's A Nevada Style Pub also seems to have great burgers 
    - The Barrymore seems to be highly rated for its burgers too
    - Thirsty Lion Gastropub (the one in phoenix) seems to be popular for burgers
    - Yard House LINQ seems to have fantastic burgers too
    

## Trying out hyperparameter search

### Todo's
- Try out custom evaluator

<p>using random search instead of grid search because of [this](http://blog.dato.com/how-to-evaluate-machine-learning-models-part-4-hyperparameter-tuning) article</p>

In [57]:
#import scipy
#scipy.stats.distributions.expon(.1).rvs()

2.5156201230985706

In [58]:
params = {'target':'class', 'features':[['bow.text']], 
          'l1_penalty':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5], 
          'l2_penalty':[0,0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5],
          'step_size':[0.01, 0.05, 0.1, 0.5, 1, 5, 10], 
          'convergence_threshold':[0.0001, 0.0005, 0.001, 0.005, 0.01], 
          'max_iterations':20,
          'class_weights':'auto'}

In [59]:
model_search = gl.toolkits.model_parameter_search.random_search.create((train_data, validation_data),
                                                              gl.logistic_classifier.create, 
                                                                     model_parameters=params,
                                                                     return_model=True)

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.job: Creating a LocalAsync environment called 'async'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Jun-06-2016-23-35-5600000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Jun-06-2016-23-35-5600000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-Jun-06-2016-23-35-5600000' already exists. Renaming the job to 'Model-Parameter-Search-Jun-06-2016-23-35-5600000-4c4ba'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Jun-06-2016-23-35-5600000-4c4ba' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Jun-06-2016-23-35-5600000-4c4ba' scheduled.


In [60]:
model_search.get_best_params()


{'class_weights': 'auto',
 'convergence_threshold': 0.005,
 'features': ['bow.text'],
 'l1_penalty': 5,
 'l2_penalty': 0,
 'max_iterations': 20,
 'step_size': 0.05,
 'target': 'class'}

In [61]:
model_search.get_results()

model_id,class_weights,convergence_threshold,features,l1_penalty,l2_penalty,max_iterations,step_size
9,auto,0.0005,[bow.text],0.001,0.5,20,0.01
8,auto,0.0001,[bow.text],0.001,0.0,20,5.0
1,auto,0.001,[bow.text],0.05,1.0,20,0.01
0,auto,0.005,[bow.text],5.0,0.0,20,0.05
3,auto,0.0001,[bow.text],0.005,0.001,20,1.0
2,auto,0.001,[bow.text],5.0,0.01,20,5.0
5,auto,0.0005,[bow.text],0.1,0.01,20,0.01
4,auto,0.005,[bow.text],0.5,0.05,20,0.01
7,auto,0.0005,[bow.text],5.0,1.0,20,5.0
6,auto,0.005,[bow.text],0.005,1.0,20,5.0

target,training_accuracy,validation_accuracy
class,0.960329250676,0.91668548694
class,0.958203771997,0.916215237924
class,0.960327265484,0.916679299453
class,0.960073160935,0.916929892678
class,0.958316927929,0.916221425411
class,0.957988709553,0.916530799764
class,0.960325280293,0.91668548694
class,0.960316677795,0.916697861914
class,0.957988709553,0.916530799764
class,0.958204433727,0.916215237924


In [63]:
models = model_search.get_models()

In [67]:
# Zero coefficients for 5527 parameters
len(models[0].coefficients[models[0].coefficients['value']==0])

5527

In [68]:
# Best model 
models[0]

Class                         : LogisticClassifier

Schema
------
Number of coefficients        : 1583113
Number of examples            : 1511189
Number of classes             : 2
Number of feature columns     : 1
Number of unpacked features   : 1583112

Hyperparameters
---------------
L1 penalty                    : 5.0
L2 penalty                    : 0.0

Training Summary
----------------
Solver                        : fista
Solver iterations             : 20
Solver status                 : TERMINATED: Iteration limit reached.
Training time (sec)           : 352.334

Settings
--------
Log-likelihood                : 82825.7082

Highest Positive Coefficients
-----------------------------
bow.text[first....well]       : 2.2536
bow.text[corporate?".]        : 2.2536
bow.text[wait....quite]       : 2.2536
bow.text[first.***]           : 2.2536
bow.text[payless!!]           : 2.2509

Lowest Negative Coefficients
----------------------------
bow.text[instructs)]          : -6.3873
bow.tex

### Test set evaluation for Best Model

- Precision 0.961
- Recall 0.9331
- AUC 0.824 

In [17]:
(precision, recall, f1_score, auc, cm) = test_set_evaluation(models[0], test_data)

In [18]:
precision, recall, f1_score, auc

(0.9614606405267884, 0.9331685852842291, 0.9471033735825469, 0.824585288853495)

### Finding the top 5 places for burgers

In [75]:
abr = get_avg_business_review_sentiment_for_burgers(models[0],br_test_set)

In [87]:
top_places = get_top_k_places(abr, 5, businesses)

In [93]:
top_places['name']

2        The Cosmopolitan of Las Vegas
1            Jacs Dining and Tap House
0    Flamingo Las Vegas Hotel & Casino
4                        The Barrymore
3                            DW Bistro
Name: name, dtype: object

In [81]:
top_places

business_id,class,count,avg,attributes,categories
NGJDjdiDJHmN2xxU7KauuA,1,21,0.949581092722,"{'Accepts Credit Cards': 'true', 'Good for Kids': ...","[Hotels & Travel, Arts & Entertainment, Casinos, ..."
tteHjtbjQfWnJgFe4jPoMA,1,27,0.954053463108,"{'Take-out': 'true', 'Accepts Credit Cards': ...","[American (New), Gluten- Free, Restaurants] ..."
AtjsjFzalWqJ7S9DUFQ4bw,1,36,0.966875100082,"{'Take-out': 'true', 'Accepts Credit Cards': ...","[Hotels & Travel, Arts & Entertainment, Casinos, ..."
y6NFyThUU2wtTNa0rX7VrA,1,42,0.946957801976,"{'Take-out': 'true', 'Accepts Credit Cards': ...","[American (New), Caribbean, Restaurants] ..."
Gvw2ewUbqq7PWkxKwjqifA,1,29,0.949579583608,"{'Take-out': 'true', 'Accepts Credit Cards': ...","[American (New), Restaurants] ..."

city,full_address,hours,latitude,longitude
Las Vegas,Flamingo Las Vegas Hotel & Casino\n3555 Las Vegas ...,"{'Monday': {'close': '00:00', 'open': ...",36.115673,-115.172557
Madison,"2611 Monroe St\nDudgeon- Monroe\nMadison, WI 5 ...","{'Monday': {'close': '22:00', 'open': ...",43.0574842,-89.4280617
Las Vegas,"3708 Las Vegas Blvd S\nThe Strip\nLas Vegas, ...","{'Monday': {'close': '00:00', 'open': ...",36.1099146155,-115.174642483
Las Vegas,6115 S Ft Apache Rd\nSte 112\nSpring Valley\nLas ...,"{'Tuesday': {'close': '21:00', 'open': ...",36.0771122954,-115.298295021
Las Vegas,99 Convention Center Dr\nThe Strip\nLas Ve ...,"{'Monday': {'close': '22:00', 'open': ...",36.1318966939,-115.162421151

name,neighborhoods,open,review_count,stars,state,type,acity
Flamingo Las Vegas Hotel & Casino ...,[The Strip],True,2024,2.5,NV,business,Las Vegas
Jacs Dining and Tap House,[Dudgeon-Monroe],True,165,4.0,WI,business,Madison
The Cosmopolitan of Las Vegas ...,[The Strip],True,2988,4.0,NV,business,Las Vegas
DW Bistro,[Spring Valley],True,806,4.5,NV,business,Las Vegas
The Barrymore,[The Strip],True,651,4.0,NV,business,Las Vegas
