# Modelling

In [6]:
# import databricks.koalas as ks
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import time
import json
import os
import pickle
from collections import Counter

import pyspark as ps

import pandas_profiling
# df.profile_report(style={'full_width':True})
# df.profile_report(title='Pandas Profiling Report')

import matplotlib.pyplot as plt

plt.style.use('seaborn-pastel')
font = {'size':16}
import seaborn as sns

import scipy.stats as scs

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.simplefilter('ignore')

## Reading in AZNV dataframe

In [4]:
AZNV_df = pd.read_pickle('../data/AZNV_df.p')
print(AZNV_df.shape)
AZNV_df.head(2)

(1818490, 32)


Unnamed: 0,business_id,name_x,address,city,state,postal_code,latitude,longitude,stars_x,review_count_x,is_open,attributes,categories,hours,review_id,user_id,stars_y,useful,funny,cool,text,date,name_y,review_count_y,yelping_since,friends,fans,average_stars,likes,compliments,elite_years,hotel?
0,vx4YAA02Qz6khRD1fZ1MFA,Ping Pang Pong,4000 W Flamingo Rd,Las Vegas,NV,89103,36.116901,-115.193512,3.5,1005,1,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","[Cantonese, Seafood, Dim Sum, Restaurants, Chi...","{'Monday': '10:0-3:0', 'Tuesday': '10:0-3:0', ...",FwiJ7DQ3a9cBwp1OYlP5VA,A0kENtCCoVT3m7T35zb2Vg,2.0,0,0,0,There are so many good reviews of this place o...,2010-04-02 20:37:54,Anastasia,348,2009-02-15 16:45:34,"BBY1Alonk5V0OLgk_G0rNQ, TY7QgNKJEhvo1zCULlT9Rg...",9,3.66,619,38,3,0
1,glTBzDHv9wNhEsZa4bQrOA,CatHouse Boutique Nightclub,3900 Las Vegas Blvd S,Las Vegas,NV,89119,36.094822,-115.173216,3.0,178,0,"{'OutdoorSeating': 'False', 'BusinessAcceptsCr...","[Restaurants, Nightlife, Lounges, American (Ne...","{'Monday': '17:0-4:0', 'Tuesday': '17:0-4:0', ...",7ynnTyXiuIWHDkktHsdm4Q,A0kENtCCoVT3m7T35zb2Vg,2.0,1,1,0,"Restaurant-only review; based on that, they wo...",2009-02-16 14:53:11,Anastasia,348,2009-02-15 16:45:34,"BBY1Alonk5V0OLgk_G0rNQ, TY7QgNKJEhvo1zCULlT9Rg...",9,3.66,619,38,3,0


## Create dataframe for model

In [5]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise import NormalPredictor
from surprise import Reader
from surprise.model_selection import cross_validate

start = time.time()

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(AZNV_df[['user_id', 'business_id', 'stars_y']], reader)
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005, 0.01],
              'reg_all': [0.2, 0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=-1)

gs.fit(data)

end = time.time()
print('this process took {:.3f} seconds'.format(end-start))

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])





this process took 3374.864 seconds
1.1775836998163294
{'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.2}
