### Wide & Deep Learning using Tensorflow

In [1]:
import os
import time
import json
import pickle
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm

import import_ipynb
import feature_engineering

import warnings
with warnings.catch_warnings():  
    warnings.filterwarnings("ignore", category = FutureWarning)
    import tensorflow as tf
    
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import precision_score, recall_score, roc_auc_score

importing Jupyter notebook from feature_engineering.ipynb
importing Jupyter notebook from data_acquisition.ipynb


#### 1. Reading Data

In [166]:
# fetch data objects using defined functions

# ratings, business, checkin, user, tips = feature_engineering.get_yelp_data()
# user = feature_engineering.add_user_features(user, ratings, tips)
# business = feature_engineering.add_item_features(business, checkin)
# ratings = feature_engineering.add_features_to_ratings(ratings, user, business)
# ratings_train, ratings_validation, ratings_test = feature_engineering.train_validation_test_split(years = 1)
# ratings_recommend = feature_engineering.user_recommendation_options(ratings_train)

In [3]:
# fetch data objects from saved pickle files

user = pickle.load(open("data/user_feature_set.pkl", "rb"))
business = pickle.load(open("data/business_feature_set.pkl", "rb"))

ratings_train = pickle.load(open("data/ratings_train_1_years.pkl", "rb"))
ratings_validation = pickle.load(open("data/ratings_validation_1_years.pkl", "rb"))
ratings_test = pickle.load(open("data/ratings_test_1_years.pkl", "rb"))
ratings_recommend = pickle.load(open("data/ratings_recommendation_list.pkl", "rb"))

#### 2. Data Preprocessing

In [3]:
# wide and deep column name format
column_names = {
    'Restaurants': 'restaurants', 'Food': 'food', 'Fast Food': 'fast_food', 
    'Nightlife': 'nightlife', 'American (Traditional)': 'american', 'Bars': 'bars', 
    'Mexican': 'mexican', 'Sandwiches': 'sandwiches', 'Pizza': 'pizza', 'Burgers': 'burgers'
}

continuous_columns = [
    'review_count_x', 'review_count_y', 'average_stars', 'stars', 'fans_norm', 'friends_norm', 'elite_count',
    'compliment_count', 'compliment_score', 'user_lifetime', 'total_hours',  'total_checkins', 'age_of_business'
]
    
categorical_columns = [
    'user_id', 'business_id', 'restaurants', 'food', 'fast_food', 'nightlife', 
    'american', 'bars', 'mexican', 'sandwiches', 'pizza', 'burgers'
]

In [4]:
# fix business category column format
def format_columns(df, column_names):
    df = df.rename(columns = column_names)
    
    for column in df.columns[17:27]:
        df[column] = df[column].apply(lambda x: str(x))
    
    return df

#### 3. Define Wide & Deep Columns

In [5]:
def wide_and_deep_columns():
    # build categorical columns
    user_id = tf.contrib.layers.sparse_column_with_hash_bucket("user_id", hash_bucket_size = 50000)
    business_id = tf.contrib.layers.sparse_column_with_hash_bucket("business_id", hash_bucket_size = 25000)

    # categorical columns for presence of business category
    restaurants = tf.contrib.layers.sparse_column_with_keys(column_name = "restaurants", keys = ['0', '1'])
    food = tf.contrib.layers.sparse_column_with_keys(column_name = "food", keys = ['0', '1'])
    fast_food = tf.contrib.layers.sparse_column_with_keys(column_name = "fast_food", keys = ['0', '1'])
    nightlife = tf.contrib.layers.sparse_column_with_keys(column_name = "nightlife", keys = ['0', '1'])
    american = tf.contrib.layers.sparse_column_with_keys(column_name = "american", keys = ['0', '1'])
    bars = tf.contrib.layers.sparse_column_with_keys(column_name = "bars", keys = ['0', '1'])
    mexican = tf.contrib.layers.sparse_column_with_keys(column_name = "mexican", keys = ['0', '1'])
    sandwiches = tf.contrib.layers.sparse_column_with_keys(column_name = "sandwiches", keys = ['0', '1'])
    pizza = tf.contrib.layers.sparse_column_with_keys(column_name = "pizza", keys = ['0', '1'])
    burgers = tf.contrib.layers.sparse_column_with_keys(column_name = "burgers", keys = ['0', '1'])
    
    # build continuous columns
    review_count_user = tf.contrib.layers.real_valued_column("review_count_x")
    review_count_business = tf.contrib.layers.real_valued_column("review_count_y")
    average_stars_user = tf.contrib.layers.real_valued_column("average_stars")
    average_stars_business = tf.contrib.layers.real_valued_column("stars")
    
    fans = tf.contrib.layers.real_valued_column("fans_norm")
    friends = tf.contrib.layers.real_valued_column("friends_norm")
    
    elite_count = tf.contrib.layers.real_valued_column("elite_count")
    user_lifetime = tf.contrib.layers.real_valued_column("user_lifetime")
    compliment_count = tf.contrib.layers.real_valued_column("compliment_count")
    compliment_score = tf.contrib.layers.real_valued_column("compliment_score")
    
    total_hours = tf.contrib.layers.real_valued_column("total_hours")
    total_checkins = tf.contrib.layers.real_valued_column("total_checkins")
    age_of_business = tf.contrib.layers.real_valued_column("age_of_business")
    
    # build wide columns
    wide_columns = [
        user_id, 
        business_id,
        restaurants,
        food,
        fast_food,
        nightlife,
        american,
        bars,
        mexican,
        sandwiches,
        pizza,
        burgers
    ]
    
    # build deep columns
    deep_columns = [
        review_count_user, 
        review_count_business, 
        average_stars_user, 
        average_stars_business, 
        fans, 
        friends, 
        compliment_count,
        compliment_score,
        elite_count, 
        user_lifetime,
        total_hours, 
        total_checkins, 
        age_of_business
    ]
    
    return wide_columns, deep_columns

#### 4. Define Tensors

In [6]:
def input_fn(df):
    for k in continuous_columns:
        df[k] = pd.to_numeric(df[k])
    
    # creating tensors for continuous and categorical columns
    continuous_cols = {k: tf.constant(df[k].values) for k in continuous_columns}
    categorical_cols = {k: tf.SparseTensor(
        indices = [[i, 0] for i in range(df[k].size)],
        values = df[k].values,
        dense_shape = [df[k].size, 1]) for k in categorical_columns}
    
    # combining all feature columns
    feature_cols = dict(list(continuous_cols.items()) + list(categorical_cols.items()))
    
    # specifying target variable
    df['rating'] = df['rating'].apply(lambda x: int(x))
    label = tf.constant(df['rating'].values)

    return feature_cols, label

In [7]:
def train_input_fn():
    return input_fn(ratings_train)

def validation_input_fn():
    return input_fn(ratings_validation)

def test_input_fn():
    return input_fn(ratings_test)

def recommend_input_fn():
    return input_fn(ratings_recommend)

#### 5. Model Training

In [8]:
def model_train(ratings_train, dnn_units = [100, 50], restart = True):
    model_dir = "model/wide_and_deep"
    if restart == True:
        if os.path.isdir('model/wide_and_deep'):
            shutil.rmtree(os.path.abspath('model/wide_and_deep'))
    
    # specifying parameters for multi-class classifier
    wide_deep_model = tf.contrib.learn.DNNLinearCombinedClassifier(
        model_dir = model_dir,
        n_classes = 6,
        linear_feature_columns = wide_and_deep_columns()[0],
        dnn_feature_columns = wide_and_deep_columns()[1], dnn_hidden_units = dnn_units)
    
    # fitting model by reducing loss over 400 steps
    wide_deep_model.fit(input_fn = train_input_fn, steps = 400)
    
    return wide_deep_model

#### 6. Hyperparameter Tuning

In [12]:
def model_tune(ratings_train, ratings_validation):
    # tuning hidden layers and nodes hyperparameter
    dnn_units = [[100, 50], [500, 250, 50], [1000, 500, 100, 50]]

    tune_results = pd.DataFrame(columns = ['model_config', 'results', 'recall', 'precision'])
    for i in range(len(dnn_units)):
        wide_deep_model = model_train(ratings_train, dnn_units[i])
        results = wide_deep_model.evaluate(input_fn = validation_input_fn, steps = 1)
        predictions = wide_deep_model.predict(input_fn = validation_input_fn, as_iterable = False)
        recall, precision, auc = model_eval(predictions, ratings_validation)
        
        tune_results = tune_results.append({'model_config': dnn_units[i], 'results': results, 'recall': recall, 
                                            'precision': precision, 'auc': auc}, ignore_index = True)
        
    return tune_results 

#### 7. Model Predictions

In [10]:
def model_predict(wide_deep_model, ratings_test):
    results = wide_deep_model.evaluate(input_fn = test_input_fn, steps = 1) 
    predictions = wide_deep_model.predict(input_fn = test_input_fn, as_iterable = False)
    
    return predictions

#### 8. Model Evaluation

In [None]:
def multiclass_roc_auc_score(y_test, y_pred, average = "macro"):
    lb = LabelBinarizer()
    lb.fit(y_test)
    
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)

    return roc_auc_score(y_test, y_pred, average = average)

In [2]:
def model_eval(predictions, ratings_test):
    recall = recall_score(ratings_test['rating'], predictions, average = 'macro')
    precision = precision_score(ratings_test['rating'], predictions, average = 'macro')
    auc = multiclass_roc_auc_score(ratings_test['rating'], predictions, average = 'macro')
    
    return recall, precision, auc

#### 9. Generating Recommendations

In [13]:
def model_recommend(wide_deep_model, ratings_recommend):
    ratings_recommend = ratings_recommend.merge(user, on = 'user_id')
    ratings_recommend = ratings_recommend.merge(business, on = 'business_id')
    ratings_recommend = format_columns(ratings_recommend, column_names)
    ratings_recommend['rating'] = 0
    
    predictions = wide_deep_model.predict(input_fn = recommend_input_fn, as_iterable = False)
    
    return predictions

#### 10. Function Calls

In [14]:
ratings_train = format_columns(ratings_train, column_names)
ratings_validation = format_columns(ratings_validation, column_names)
ratings_test = format_columns(ratings_test, column_names)

In [26]:
start = time.time()
tune_results = model_tune(ratings_train, ratings_validation)
end = time.time()
print(tune_results)
print('\nTime Elapsed = '+str(np.round(end - start, 4))+' secs')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a3c3d8908>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_protocol': None, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'model/wide_and_deep'}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Sav

  'precision', 'predicted', average, warn_for)


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into model/wide_and_deep/model.ckpt.
INFO:tensorflow:loss = 35.016827, step = 2
INFO:tensorflow:global_step/sec: 2.40201
INFO:tensorflow:loss = 1.4141937, step = 202 (79.249 sec)
INFO:tensorflow:global_step/sec: 2.6535
INFO:tensorflow:global_step/sec: 3.08069
INFO:tensorflow:loss = 1.3574953, step = 402 (65.146 sec)
INFO:tensorflow:Saving checkpoints for 402 into model/wide_and_deep/model.ckpt.
INFO:tensorflow:Loss for final step: 1.3574953.
INFO:tensorflow:Starting evaluation at 2019-12-13T01:42:19Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model/wide_and_deep/model.ckpt-402
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-12-13-01:42:21


In [27]:
tune_results.head()

Unnamed: 0,model_config,results,recall,precision
0,"[100, 50]","{'loss': 4.1252217, 'accuracy': 0.4388489, 'gl...",0.210637,0.233751
1,"[500, 250, 50]","{'loss': 1.3746114, 'accuracy': 0.46354917, 'g...",0.200863,0.169706
2,"[1000, 500, 100, 50]","{'loss': 1.404131, 'accuracy': 0.46179056, 'gl...",0.206687,0.259839


In [None]:
# To choose best model based on hyperparameter tune results

In [134]:
start = time.time()
wide_deep_model = model_train(ratings_train)
end = time.time()
print('\nTime Elapsed = '+str(np.round(end - start, 4))+' secs')
print(wide_deep_model)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a3fb0e898>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_protocol': None, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'model/wide_and_deep'}


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  after removing the cwd from sys.path.


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into model/wide_and_deep/model.ckpt.
INFO:tensorflow:loss = 455.41846, step = 2
INFO:tensorflow:global_step/sec: 8.30053
INFO:tensorflow:loss = 3.375916, step = 202 (22.911 sec)
INFO:tensorflow:Saving checkpoints for 202 into model/wide_and_deep/model.ckpt.
INFO:tensorflow:Loss for final step: 3.375916.


In [206]:
start = time.time()
predictions = model_predict(wide_deep_model, ratings_test)
end = time.time()
print('\nTime Elapsed = '+str(np.round(end - start, 4))+' secs')

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  after removing the cwd from sys.path.


NotFittedError: Couldn't find trained model at model/wide_and_deep.

In [204]:
start = time.time()
test_recall, test_precision, test_auc = model_eval(predictions, ratings_test)
end = time.time()
print('Recall = ', test_recall)
print('Precision = ', test_precision)
print('\nTime Elapsed = '+str(np.round(end - start, 4))+' secs')

TypeError: cannot unpack non-iterable numpy.float64 object

In [184]:
start = time.time()
recommendations = model_recommend(wide_deep_model, ratings_recommend)
end = time.time()
print(recommendations)
print('\nTime Elapsed = '+str(np.round(end - start, 4))+' secs')

KeyError: 'review_count_x'