In [73]:
import pandas as pd
import numpy as np
import datetime
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import matplotlib.pyplot as plt
from surprise.prediction_algorithms.algo_base import AlgoBase
from surprise.prediction_algorithms.baseline_only import BaselineOnly 
from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise.model_selection.validation import cross_validate
from surprise.model_selection.search import GridSearchCV
from surprise.model_selection import train_test_split
import recmetrics

# Item-based CF
from surprise import Dataset, Reader
from surprise.model_selection.validation import cross_validate
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from collections import defaultdict

## Read from local path

In [75]:
sample_test = pd.read_csv('sample_test.csv')

In [6]:
sample_train = pd.read_csv('sample_train.csv')

In [4]:
business = pd.read_json('../yelp_dataset/business.json', lines = True)

In [7]:
user_id = pd.DataFrame(sample_train['user_id'].unique(), columns = ['user_id'])
user_id.reset_index(inplace = True)
user_id = user_id.rename(columns={'index': 'user_idx'})

In [8]:
business_id = pd.DataFrame(business['business_id'].unique(), columns = ['business_id'])
business_id.reset_index(inplace = True)
business_id = business_id.rename(columns={'index': 'business_idx'})

In [9]:
def merge(table1, table2, table3):
    temp = pd.merge(table1, table2, how='left')
    table = pd.merge(temp, table3, how='left')
    return table

In [186]:
ntrain = merge(sample_train, user_id, business_id)
ntest = merge(sample_test, user_id, business_id)

In [187]:
baseline_train = ntrain[['user_idx','business_idx','rating']]

In [188]:
baseline_test = ntest[['user_idx','business_idx','rating']]

## Baseline Model

In [189]:
reader = Reader(rating_scale=(0,5))
train = Dataset.load_from_df(ntrain[['user_idx','business_idx','rating']],reader)
train = train.build_full_trainset()
test = Dataset.load_from_df(ntest[['user_idx','business_idx','rating']],reader)
# test = test.build_full_trainset().build_testset()

In [190]:
bsl_options = {'method':'sgd', 'learning_rate':0.0001}
baseline_algo = BaselineOnly(bsl_options=bsl_options)
baseline_algo.fit(train)
# Baseline = baseline_algo.test(test)

Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x15595b210>

In [192]:
baseline_train['Predicted_bias'] = baseline_train.apply(lambda x: baseline_algo.predict(uid = x['user_idx'], iid = x['business_idx'])[3], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [196]:
baseline_test['Predicted_bias'] = baseline_test.apply(lambda x: baseline_algo.predict(uid = x['user_idx'], iid = x['business_idx'])[3], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [198]:
baseline_all = pd.concat([baseline_test, baseline_train])

In [65]:
print("RMSE (baseline): ", np.sqrt(np.mean( (baseline_test.Predicted_bias - baseline_test.rating)**2) ))

RMSE (baseline):  1.356915572415814


In [78]:
print("MAE (baseline): ", np.sqrt(np.mean(np.abs(baseline_test.Predicted_bias - baseline_test.rating))))

MAE (baseline):  1.0596398540571268


In [201]:
business_idx_bias = list()
for i in baseline_test['user_idx'].unique():
    business_idx_bias.extend(get_users_predictions(i, 10, baseline_test))

In [289]:
print("Coverage (baseline): ", len(set(business_idx_bias))/len(baseline_all['business_idx'].unique()))

Coverage (baseline):  0.27569864777849323


## Model Based

In [278]:
spark = SparkSession.builder.getOrCreate()

In [279]:
baseline_train = baseline_train[['business_idx', 'user_idx', 'rating']]
baseline_test = baseline_test[['business_idx', 'user_idx', 'rating']]

In [280]:
train_samp = spark.createDataFrame(baseline_train)
test_samp = spark.createDataFrame(baseline_test)
model = ALS(maxIter=3, regParam = 0.001, userCol='user_idx', 
            itemCol='business_idx', 
            ratingCol='rating').fit(train_samp)

In [282]:
predictions_test = model.transform(test_samp)
df_mb_test = predictions_test.toPandas()
predictions_train = model.transform(train_samp)
df_mb_train = predictions_train.toPandas()

In [83]:
eval_rmse = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
result = eval_rmse.evaluate(predictions_test.na.drop())
print(f'Model Based RMSE = {result}')

Model Based RMSE = 1.3671240046863096


In [84]:
eval_mae = RegressionEvaluator(metricName='mae', labelCol='rating', predictionCol='prediction')
result = eval_mae.evaluate(predictions_test.na.drop())
print(f'Model Based MAE = {result}')

Model Based MAE = 0.8245764107061594


In [283]:
df_mb = pd.concat([df_mb_train, df_mb_test])

In [286]:
def get_users_predictions(user_id, n, model):
    recommended_items = model[model['user_idx'] == user_id][['prediction', 'business_idx']]
    recommended_items = recommended_items.sort_values('prediction', ascending=False)    
    if len(recommended_items) >= 10:
        recommended_items = recommended_items.head(n)        
    return recommended_items['business_idx'].tolist()

In [287]:
business_idx_mb = list()
for i in df_mb['user_idx'].unique():
    business_idx_mb.extend(get_users_predictions(i, 10, df_mb))

In [290]:
print('Coverage(Model Based):', len(set(business_idx_mb))/len(df_mb['business_idx'].unique()))

Coverage(Model Based): 0.7800386349001932
