In [1]:
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

import import_ipynb
import feature_engineering

import warnings
with warnings.catch_warnings():  
    warnings.filterwarnings("ignore", category = FutureWarning)
    import tensorflow as tf
    
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

importing Jupyter notebook from feature_engineering.ipynb
importing Jupyter notebook from data_acquisition.ipynb


In [2]:
# Load the raw Yelp data through the feature_engineering notebook module.
# get_yelp_data() returns five objects, unpacked here in this order:
# ratings, business, checkin, user, tips (presumably DataFrames — TODO confirm).
ratings, business, checkin, user, tips = feature_engineering.get_yelp_data()

100%|██████████| 6685900/6685900 [00:47<00:00, 139737.89it/s]
100%|██████████| 192609/192609 [00:03<00:00, 62395.71it/s]
100%|██████████| 161950/161950 [00:01<00:00, 127828.20it/s]
100%|██████████| 1637138/1637138 [00:20<00:00, 78707.92it/s]
100%|██████████| 1223094/1223094 [00:04<00:00, 266029.38it/s]


In [3]:
# Feature engineering, delegated to the feature_engineering notebook module.
# NOTE(review): each call rebinds the very name it takes as input (`user`,
# `business`, `ratings`), so re-running this cell passes already-augmented
# objects back into the same functions. Run it once per fresh data load.
user = feature_engineering.add_user_features(user, ratings, tips)
business = feature_engineering.add_item_features(business, checkin)
ratings = feature_engineering.add_features_to_ratings(ratings, user, business)
# 0.1 is presumably the held-out (test) fraction — TODO confirm against
# feature_engineering.train_test_split.
ratings_train, ratings_test = feature_engineering.train_test_split(ratings, 0.1)

100%|██████████| 23793/23793 [00:12<00:00, 1929.77it/s]
100%|██████████| 25/25 [00:36<00:00,  1.64it/s]


In [130]:
def wide_and_deep_columns(column_list):
    """Build the feature columns for the wide-and-deep regressor.

    Parameters
    ----------
    column_list : sequence
        Accepted for backward compatibility with existing callers. The set of
        feature columns is fixed below and this argument is not read.

    Returns
    -------
    tuple of (list, list)
        ``(wide_columns, deep_columns)``. ``wide_columns`` is empty because no
        linear (wide) features are configured. ``deep_columns`` holds one
        ``tf.contrib.layers.real_valued_column`` per continuous feature, in
        the order listed in ``continuous_feature_names``.
    """
    # Column names of the continuous features fed to the deep (DNN) part.
    # Each name appears exactly once, so no column object is built twice.
    continuous_feature_names = [
        "review_count_x",    # user review count
        "review_count_y",    # business review count
        "average_stars",     # user average stars
        "stars",             # business average stars
        "fans_norm",
        "friends_norm",
        "compliment_count",
        "elite_count",
        "user_lifetime",
        "total_hours",
        "total_checkins",
        "age_of_business",
    ]

    # TODO: no categorical (sparse) columns are wired into the wide part yet,
    # so the linear half of the model currently has no inputs.
    wide_columns = []

    deep_columns = [
        tf.contrib.layers.real_valued_column(name)
        for name in continuous_feature_names
    ]

    return wide_columns, deep_columns

In [124]:
# Replace every missing value with 0 in both the training and the test split.
ratings_train, ratings_test = (
    split.fillna(0) for split in (ratings_train, ratings_test)
)

In [120]:
# Directory where the estimator writes and restores its checkpoints.
model_dir = "model/wide_and_deep"

# Build the feature columns once and unpack both halves, instead of calling
# wide_and_deep_columns() twice just to index [0] and [1].
wide_columns, deep_columns = wide_and_deep_columns(ratings.columns)

# Combined linear + DNN regressor; the DNN part has two hidden layers
# of 100 and 50 units.
wide_deep_model = tf.contrib.learn.DNNLinearCombinedRegressor(
    model_dir=model_dir,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50],
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a949e6eb8>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_protocol': None, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'model/wide_and_deep'}


In [116]:
def input_fn(df):
    """Convert a ratings DataFrame into estimator inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``continuous_columns`` and
        ``categorical_columns`` below, plus the ``'rating'`` label column.

    Returns
    -------
    tuple of (dict, tensor)
        ``feature_cols`` maps each column name to a ``tf.constant`` (continuous
        columns) or to a ``tf.SparseTensor`` with one entry per row and dense
        shape ``[n_rows, 1]`` (categorical columns). ``label`` is a
        ``tf.constant`` built from ``df['rating']``.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``; the message lists all
        missing names at once.
    """
    continuous_columns = ['review_count_x', 'review_count_y', 'average_stars', 'stars',
                          'fans_norm', 'friends_norm', 'compliment_count', 'elite_count',
                          'user_lifetime', 'total_hours', 'total_checkins', 'age_of_business']

    categorical_columns = ['Restaurants', 'Shopping', 'Home Services', 'Health & Medical',
                           'Food', 'Beauty & Spas', 'Local Services', 'Automotive', 'Nightlife',
                           'Event Planning & Services', 'Professional Services', 'Real Estate',
                           'Active Life', 'Fashion', 'Arts & Entertainment', 'Doctors', 'Bars',
                           'Hotels & Travel', 'Hair Salons', 'Fast Food', 'Financial Services',
                           'Auto Repair', 'Home & Garden', 'American (Traditional)', 'Coffee & Tea']

    # Fail early with one message naming every absent column, rather than on
    # the first failing lookup part-way through building tensors.
    required = continuous_columns + categorical_columns + ['rating']
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError("input_fn: DataFrame is missing columns: %s" % missing)

    # The sparse indices and dense shape depend only on the row count, so build
    # them once instead of once per categorical column.
    n_rows = len(df)
    indices = [[row, 0] for row in range(n_rows)]
    dense_shape = [n_rows, 1]

    continuous_cols = {k: tf.constant(df[k].values) for k in continuous_columns}
    categorical_cols = {
        k: tf.SparseTensor(indices=indices, values=df[k].values, dense_shape=dense_shape)
        for k in categorical_columns
    }

    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)
    label = tf.constant(df['rating'].values)

    return feature_cols, label

In [121]:
def train_input_fn():
    """Zero-argument input function: features and label built from ``ratings_train``."""
    train_features_and_label = input_fn(ratings_train)
    return train_features_and_label

def eval_input_fn():
    """Zero-argument input function: features and label built from ``ratings_test``."""
    eval_features_and_label = input_fn(ratings_test)
    return eval_features_and_label

In [122]:
# Train, evaluate and predict with the wide-and-deep regressor.
# The estimator in this notebook is bound to `wide_deep_model`; the name `m`
# used here before is not defined anywhere in the notebook, so the cell
# raised NameError on a fresh kernel.
# NOTE(review): later cells repeat the fit / evaluate / predict calls on the
# same estimator — keep only one copy so a Run All trains once.
wide_deep_model.fit(input_fn=train_input_fn, steps=200)
results = wide_deep_model.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))
predictions = wide_deep_model.predict(input_fn=eval_input_fn)

KeyboardInterrupt: 

In [79]:
# Scratch check: build the feature dict and label for the test split directly.
# NOTE(review): nothing else in this notebook reads `feature_cols` or `label`;
# consider deleting this cell.
feature_cols, label = input_fn(ratings_test)

In [131]:
# Displays the result of add_features_to_ratings without assigning it.
# NOTE(review): `ratings` was already rebound to the output of this same
# function in an earlier cell, so this applies it a second time; the recorded
# output shows `_x` / `_y` suffixed column names (e.g. average_stars_x,
# total_checkins_y). Presumably a debugging leftover — TODO confirm and remove.
feature_engineering.add_features_to_ratings(ratings, user, business)

Unnamed: 0,review_id,user_id,business_id,rating,date,review_count_x,average_stars_x,compliment_hot_x,compliment_more_x,compliment_profile_x,...,Hotels & Travel_y,Hair Salons_y,Fast Food_y,Financial Services_y,Auto Repair_y,Home & Garden_y,American (Traditional)_y,Coffee & Tea_y,total_checkins_y,age_of_business_y
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,2013-05-07 04:34:36,10,2.00,0.000000,0.000000,0.000000,...,,,,,,,,,1600.0,3178.0
1,oy8f3bxyl7zZJFDQ5edtIA,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,2013-03-27 14:17:13,10,2.00,0.000000,0.000000,0.000000,...,,,,,,,,,1600.0,3178.0
2,8F9500ycq3mvpjf0glbFFg,tH0uKD-vNwMoEc3Xk3Cbdg,ujmEBvifdJM6h6RLv4wQIg,3.0,2012-08-17 21:58:24,1184,3.87,0.331926,0.022804,0.006757,...,,,,,,,,,1600.0,3178.0
3,qH-fr3sCKO0WoryJy44SGQ,renPzRDqMZpMaHiCD_e1_A,ujmEBvifdJM6h6RLv4wQIg,5.0,2015-07-18 11:40:36,485,3.82,0.177320,0.016495,0.028866,...,,,,,,,,,1600.0,3178.0
4,SnDqhw1IsB34BW1J1Er4mw,renPzRDqMZpMaHiCD_e1_A,ujmEBvifdJM6h6RLv4wQIg,4.0,2015-04-06 14:45:57,485,3.82,0.177320,0.016495,0.028866,...,,,,,,,,,1600.0,3178.0
5,_2_WR2PZHWt_N5IhkjFWbw,QJI9OSEn6ujRCtrX06vs1w,ujmEBvifdJM6h6RLv4wQIg,4.0,2011-03-14 00:23:12,1982,3.61,0.645812,0.049445,0.078708,...,,,,,,,,,1600.0,3178.0
6,jxjvu8zMuIIItx7r49EE4A,m-BZLIIh5PCAKnzH0qj_0Q,ujmEBvifdJM6h6RLv4wQIg,3.0,2015-02-22 02:52:26,734,3.63,0.183924,0.016349,0.010899,...,,,,,,,,,1600.0,3178.0
7,h_-p8Fs8Kf9dGKFnySVxpA,Fv0e9RIV9jw5TX3ctA1WbA,ujmEBvifdJM6h6RLv4wQIg,2.0,2012-07-08 22:12:41,858,3.81,7.386946,1.679487,2.716783,...,,,,,,,,,1600.0,3178.0
8,tRxXnnmVNeriXw6JgfeqhA,k4M43lXJuQMpQW65DTqzIQ,ujmEBvifdJM6h6RLv4wQIg,4.0,2012-02-09 03:44:59,463,3.67,0.522678,0.077754,0.062635,...,,,,,,,,,1600.0,3178.0
9,8Edv5pKe5lOBoN5UZTUI1w,RBXSJA372ilErzNwz0jXvQ,ujmEBvifdJM6h6RLv4wQIg,4.0,2017-02-09 06:45:18,12,3.67,0.000000,0.000000,0.000000,...,,,,,,,,,1600.0,3178.0


In [86]:
# Inspect the wide (linear) feature columns. Pass the column index, as the
# model-construction cell does, rather than the DataFrame itself.
wide_and_deep_columns(ratings.columns)[0]

[_SparseColumnKeys(column_name='review_id', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=(0, 1), num_oov_buckets=0, vocab_size=2, default_value=-1), combiner='sum', dtype=tf.string),
 _SparseColumnKeys(column_name='user_id', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=(0, 1), num_oov_buckets=0, vocab_size=2, default_value=-1), combiner='sum', dtype=tf.string),
 _SparseColumnKeys(column_name='business_id', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=(0, 1), num_oov_buckets=0, vocab_size=2, default_value=-1), combiner='sum', dtype=tf.string),
 _SparseColumnKeys(column_name='rating', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=(0, 1), num_oov_buckets=0, vocab_size=2, default_value=-1), combiner='sum', dtype=tf.string),
 _SparseColumnKeys(column_name='date', 

In [126]:
# Train the regressor for 200 steps on the training split. As the last
# expression of the cell, the returned estimator is echoed as the cell output.
wide_deep_model.fit(input_fn = train_input_fn, steps = 200)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model/wide_and_deep/model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into model/wide_and_deep/model.ckpt.
INFO:tensorflow:loss = 291058.03, step = 1
INFO:tensorflow:global_step/sec: 24.6819
INFO:tensorflow:loss = 451.11353, step = 101 (4.050 sec)
INFO:tensorflow:Saving checkpoints for 200 into model/wide_and_deep/model.ckpt.
INFO:tensorflow:Loss for final step: 214.20366.


DNNLinearCombinedRegressor(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._RegressionHead object at 0x1a949e63c8>, 'linear_feature_columns': [], 'linear_optimizer': None, 'joint_linear_weights': False, 'dnn_feature_columns': [_RealValuedColumn(column_name='review_count_x', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='review_count_y', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='average_stars', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='stars', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='fans_norm', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='friends_norm', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='compliment_count', dimension=1, de

In [127]:
# Evaluate on the test split for a single step; `results` is stored but not
# displayed in this cell (the recorded log reports global_step and loss).
results = wide_deep_model.evaluate(input_fn = eval_input_fn, steps = 1)

# Predict on the same test split; the result is materialised with list()
# in the next cell.
predictions = wide_deep_model.predict(input_fn = eval_input_fn)

INFO:tensorflow:Starting evaluation at 2019-12-08T21:19:44Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model/wide_and_deep/model.ckpt-200
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-12-08-21:19:45
INFO:tensorflow:Saving dict for global step 200: global_step = 200, loss = 196.96843
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model/wide_and_deep/model.ckpt-200
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [129]:
# Materialise and display every prediction.
# NOTE(review): this dumps the full prediction list into the notebook output;
# showing only the first few values would keep the notebook readable.
# NOTE(review): the recorded output contains values such as 90.14 and -20.14,
# while the rating values visible in this notebook lie between 1.0 and 5.0 —
# the model likely needs feature scaling and/or more training; TODO investigate.
list(predictions)

[-3.2903492,
 1.3618993,
 6.1883764,
 35.32661,
 -6.6277437,
 4.1021943,
 13.422633,
 6.070941,
 -0.7448741,
 9.0258665,
 15.274947,
 -3.2463772,
 6.0869436,
 30.128344,
 52.394886,
 4.7410207,
 5.3426695,
 4.882965,
 15.792027,
 8.654857,
 8.716699,
 -2.9073575,
 36.275196,
 7.3808775,
 6.1253996,
 17.268648,
 2.8330066,
 6.977158,
 5.115753,
 10.039898,
 5.5943375,
 9.820549,
 -9.092191,
 6.5817785,
 27.33782,
 -1.8554965,
 15.585582,
 4.1789675,
 -20.139746,
 90.1436,
 -0.900632,
 9.220676,
 15.945239,
 57.671925,
 -4.183235,
 -8.421681,
 -4.784896,
 11.63626,
 3.1336925,
 9.7518015,
 7.715623,
 3.7785327,
 2.7081168,
 5.2513,
 13.921277,
 12.480388,
 8.614917,
 17.8439,
 0.9932693,
 3.2426631,
 0.18297087,
 7.1383986,
 5.827576,
 4.966444,
 5.760176,
 25.848497,
 0.34216103,
 0.5261029,
 2.2954967,
 6.155335,
 0.2762631,
 4.5622797,
 1.8304456,
 0.017728668,
 -0.3144304,
 8.268209,
 22.726976,
 18.61616,
 12.40285,
 10.722749,
 14.876422,
 9.332862,
 8.601043,
 7.618396,
 9.343854,