In [36]:
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

import import_ipynb
import feature_engineering

import surprise
from surprise import dataset
from surprise import BaselineOnly

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [7]:
# fetch data objects using defined functions

# ratings, business, checkin, user, tips = feature_engineering.get_yelp_data()
# user = feature_engineering.add_user_features(user, ratings, tips)
# business = feature_engineering.add_item_features(business, checkin)
# ratings = feature_engineering.add_features_to_ratings(ratings, user, business)
# ratings_train, ratings_validation, ratings_test = feature_engineering.train_validation_test_split(years = 1)
# ratings_recommend = feature_engineering.user_recommendation_options(ratings_train)

In [40]:
# fetch data objects from saved pickle files

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (or fails) instead of being left to the garbage collector.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this on
    files produced by this project, never on untrusted input.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


ratings_train = _load_pickle("data/ratings_train_1_years.pkl")
ratings_validation = _load_pickle("data/ratings_validation_1_years.pkl")
ratings_test = _load_pickle("data/ratings_test_1_years.pkl")
ratings_recommend = _load_pickle("data/ratings_recommendation_list.pkl")

In [41]:
def baseline_model_train(ratings_train):
    """Fit a Surprise ``BaselineOnly`` model on the given ratings.

    Parameters
    ----------
    ratings_train : pandas.DataFrame
        Must contain the columns ``user_id``, ``business_id`` and ``rating``;
        ratings are read on a 1-5 scale.

    Returns
    -------
    surprise.BaselineOnly
        The model after fitting on the full training set built from
        ``ratings_train``.
    """
    # Surprise reads the three columns positionally as (user, item, rating).
    rating_columns = ['user_id', 'business_id', 'rating']
    scale_reader = surprise.Reader(rating_scale = (1,5))
    surprise_data = surprise.Dataset.load_from_df(ratings_train[rating_columns], scale_reader)
    full_trainset = surprise_data.build_full_trainset()

    # Baseline estimates are computed with the 'als' method for 20 epochs.
    baseline = surprise.BaselineOnly(bsl_options = {'method': 'als', 'n_epochs': 20})
    baseline.fit(full_trainset)
    return baseline

In [50]:
def baseline_model_predict(model, ratings_test):
    """Predict a rating for every (user, business) pair in ``ratings_test``.

    Parameters
    ----------
    model : object
        A fitted model exposing ``predict(user_id, business_id)`` whose result
        has ``uid``, ``iid`` and ``est`` attributes.
    ratings_test : pandas.DataFrame
        Must contain the columns ``user_id`` and ``business_id``. Its index may
        be anything; rows are processed in positional order.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``business_id`` and ``pred_rating``, one row per
        row of ``ratings_test`` in the same order, with a fresh 0..n-1 index.
    """
    columns = ['user_id', 'business_id', 'pred_rating']

    # Iterate over the column *values* rather than looking rows up with
    # ``series[i]``: that lookup is label-based and breaks (KeyError or wrong
    # row) when the frame does not carry a default 0..n-1 index, e.g. after a
    # train/test split.
    records = []
    for user_id, business_id in zip(ratings_test['user_id'], ratings_test['business_id']):
        pred = model.predict(user_id, business_id)
        records.append((pred.uid, pred.iid, pred.est))

    # Build the frame once instead of growing it cell by cell with ``.loc``.
    return pd.DataFrame(records, columns = columns)

In [44]:
def multiclass_roc_auc_score(y_test, y_pred, average = "macro"):
    """Compute ROC AUC for multiclass labels via label binarization.

    A ``LabelBinarizer`` is fitted on ``y_test`` only, so the set of classes is
    taken from the true labels. Both ``y_test`` and ``y_pred`` are then
    transformed with that binarizer and passed to ``roc_auc_score``.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels (hard labels, not scores).
    average : str, default "macro"
        Forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    The value returned by ``roc_auc_score`` for the binarized inputs.
    """
    binarizer = LabelBinarizer().fit(y_test)

    truth_indicator = binarizer.transform(y_test)
    predicted_indicator = binarizer.transform(y_pred)

    return roc_auc_score(truth_indicator, predicted_indicator, average = average)

In [83]:
def basline_model_eval(predictions, ratings_test):
    """Score rounded rating predictions against the true test ratings.

    Predicted ratings are rounded to the nearest whole rating so they can be
    compared with the ``rating`` column as class labels.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Must contain a ``pred_rating`` column. It is NOT modified: rounding is
        applied to a local copy so the caller keeps the continuous estimates.
    ratings_test : pandas.DataFrame
        Must contain a ``rating`` column with the true labels.
        NOTE(review): rows are assumed to line up positionally with
        ``predictions`` -- confirm against how ``predictions`` was built.

    Returns
    -------
    tuple
        ``(recall, precision, auc)``, each macro-averaged.
    """
    y_true = ratings_test['rating']

    # Cast to float first so rounding also works when the column has object
    # dtype, then round on a new Series instead of overwriting the caller's
    # column.
    y_pred = predictions['pred_rating'].astype(np.double).round()

    recall = recall_score(y_true, y_pred, average = 'macro')
    precision = precision_score(y_true, y_pred, average = 'macro')
    auc = multiclass_roc_auc_score(y_true, y_pred, average = 'macro')

    return recall, precision, auc

In [84]:
# Fit the baseline model on the training ratings.
model = baseline_model_train(ratings_train)

Estimating biases using als...


In [85]:
# Predict a rating for every (user, business) pair in the test ratings.
predictions = baseline_model_predict(model, ratings_test)

In [92]:
# Macro-averaged recall, precision and ROC AUC of the predictions on the test ratings.
test_recall, test_precision, test_auc = basline_model_eval(predictions, ratings_test)

  'precision', 'predicted', average, warn_for)


In [89]:
# Show the macro-averaged ROC AUC obtained on the test ratings.
test_auc

0.510582970731233

In [None]:
def model_coverage(ratings_train, ratings_recommend):
    """Return the share of training items that appear among the recommendations.

    Parameters
    ----------
    ratings_train : pandas.DataFrame
        Must contain a ``business_id`` column; its distinct values form the
        denominator.
    ratings_recommend : pandas.DataFrame
        Must contain a ``business_id`` column; its distinct values form the
        numerator.

    Returns
    -------
    float
        ``distinct recommended items / distinct training items`` rounded to
        four decimal places.

    Raises
    ------
    ValueError
        If ``ratings_train`` contains no items, since coverage is undefined.
    """
    items_train = len(ratings_train.business_id.unique())
    items_recommended = len(ratings_recommend.business_id.unique())

    if items_train == 0:
        raise ValueError("ratings_train contains no items; coverage is undefined")

    # The original divided by the undefined name ``items_recommend``, which
    # raised NameError on every call.
    item_coverage = np.round(items_recommended / items_train, 4)

    return item_coverage