In [1]:
import os
import sys
from pyspark import SparkContext, SparkConf
import json
import itertools
import math
import numpy as np
import time
from itertools import combinations
import xgboost as xgb
import pandas as pd
from sklearn.metrics import mean_squared_error

In [2]:
# Spark application settings for this notebook: a local master using all cores.
appName = 'assignment3'
master = 'local[*]'
conf = SparkConf().setAppName(appName).setMaster(master)
# getOrCreate reuses an already-running context, so re-executing this cell does
# not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
sc.setLogLevel("INFO")
# Bare expression so the notebook displays the context summary.
sc

In [41]:
def _parse_csv_triple(line):
    """Split one CSV line on commas and return its first three fields, stripped.

    All three fields are returned as strings; no numeric conversion is done.
    Raises IndexError if the line has fewer than three comma-separated fields.
    """
    fields = line.split(',')
    return (fields[0].strip(), fields[1].strip(), fields[2].strip())


def _read_csv_triples(path):
    """Load a headered CSV file as an RDD of 3-tuples of strings.

    The first line of the file is taken as the header, and every line equal to
    it is dropped before the remaining lines are parsed.
    """
    lines = sc.textFile(path)
    header = lines.first()
    return lines.filter(lambda row: row != header).map(_parse_csv_triple)


def read_test_data(test_path):
    """Return the CSV at ``test_path`` as an RDD of string 3-tuples.

    Shares its implementation with ``read_train_data`` so the two readers
    cannot drift apart.
    """
    return _read_csv_triples(test_path)


def read_train_data(train_path):
    """Return the CSV at ``train_path`` as an RDD of string 3-tuples.

    Shares its implementation with ``read_test_data`` so the two readers
    cannot drift apart.
    """
    return _read_csv_triples(train_path)



# Each source is read through Spark, reduced to the fields used later, collected
# to the driver, and wrapped directly in a pandas DataFrame.
business_data = pd.DataFrame(
    sc.textFile('./data/business.json')
    .map(json.loads)
    .map(lambda rec: (rec['business_id'], rec['review_count'], rec['stars']))
    .collect(),
    columns=['business_id', 'review_count', 'stars'],
)
user_data = pd.DataFrame(
    sc.textFile('./data/user-002.json')
    .map(json.loads)
    .map(lambda rec: (rec['user_id'], rec['review_count'], rec['average_stars']))
    .collect(),
    columns=['user_id', 'review_count', 'average_stars'],
)
# The check-in feature is the sum of all values in each record's 'time' mapping.
checkin_data = pd.DataFrame(
    sc.textFile('./data/checkin.json')
    .map(json.loads)
    .map(lambda rec: (rec['business_id'], sum(rec['time'].values())))
    .collect(),
    columns=['business_id', 'total_checkins'],
)
test_data = pd.DataFrame(
    read_test_data('./data/yelp_val.csv').collect(),
    columns=['user_id', 'business_id', 'rating'],
)
train_data = pd.DataFrame(
    read_train_data('./data/yelp_train.csv').collect(),
    columns=['user_id', 'business_id', 'rating'],
)

In [42]:
# Attach check-in totals to every business; the left join keeps businesses that
# have no check-in record (their total_checkins becomes NaN).
business_data = business_data.merge(checkin_data, how='left', on='business_id')

# Enrich both rating tables with user features first, then business features.
# review_count exists in both lookups, so the second merge suffixes it:
# review_count_x comes from the user table, review_count_y from the business table.
train_data = (
    train_data
    .merge(user_data, how='left', on='user_id')
    .merge(business_data, how='left', on='business_id')
)
test_data = (
    test_data
    .merge(user_data, how='left', on='user_id')
    .merge(business_data, how='left', on='business_id')
)

# user_map = {}
# business_map = {}
# for i, x in enumerate(train_data.user_id.values):
#     user_map[x] = i

# for i, x in enumerate(train_data.business_id.values):
#     business_map[x] = i
    
# train_data.user_id = train_data.user_id.apply(lambda x: user_map[x])
# train_data.business_id = train_data.business_id.apply(lambda x: business_map[x])
# test_data.user_id = test_data.user_id.apply(lambda x: user_map[x] if x in user_map else None)
# test_data.business_id = test_data.business_id.apply(lambda x: business_map[x] if x in business_map else None)

# test_data = test_data.dropna()
# train_data.rating = train_data.rating.apply(lambda x: float(x))
# test_data.business_id = test_data.business_id.astype(int)
# test_data.rating = train_data.rating.apply(lambda x: float(x))

In [43]:
# Check the column dtypes of the merged test frame. `rating` is still object
# (string) here because the CSV readers return every field as stripped text.
test_data.dtypes

user_id            object
business_id        object
rating             object
review_count_x      int64
average_stars     float64
review_count_y      int64
stars             float64
total_checkins    float64
dtype: object

In [44]:
# Numeric columns used as model inputs. total_checkins may be NaN for businesses
# without a check-in record (it comes from a left join).
feature_columns = ['review_count_x', 'average_stars', 'review_count_y', 'stars', 'total_checkins']

# A list (not a tuple) is the unambiguous way to select several columns with .loc.
train_x = train_data.loc[:, feature_columns].values
test_x = test_data.loc[:, feature_columns].values

# `rating` is read from the CSV as text (object dtype), so cast it explicitly:
# the regression targets and the metric inputs must be floats, not strings.
train_y = train_data['rating'].astype(float).values
test_y = test_data['rating'].astype(float).values

In [45]:
# Only the objective is set explicitly; every other hyperparameter is left at
# the library default.
model_params = {'objective': 'reg:squarederror'}
model = xgb.XGBRegressor(**model_params)

In [46]:
# Train on the training matrix; fit() returns the estimator, which the notebook
# displays as this cell's output.
model.fit(X=train_x, y=train_y)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [47]:
# Score the feature matrix built from the validation file with the fitted model.
pred = model.predict(test_x)

In [48]:
# Mean squared error of the predictions against the held-out ratings
# (this is MSE, not RMSE — take the square root if RMSE is wanted).
mean_squared_error(y_true=test_y, y_pred=pred)

0.9674248316924059

In [50]:
# Put each (user_id, business_id) key next to its prediction. Both frames carry
# the same default positional index, so the column-wise concat lines up row by row.
id_columns = test_data.loc[:, ['user_id', 'business_id']]
pred_column = pd.DataFrame({'pred': pred})
output = pd.concat([id_columns, pred_column], axis=1)

In [66]:
def _format_prediction(value):
    """Render one prediction as text, capping anything above 5 at '5.0'."""
    if value > 5:
        return '5.0'
    return str(value)


# Replace the numeric predictions with their string form, ready for CSV output.
output['pred'] = output['pred'].map(_format_prediction)

In [69]:
# Collapse every row (all values are strings at this point) into one
# comma-separated line, then turn the resulting Series into a plain list.
csv_rows = output.apply(','.join, axis=1)
output = csv_rows.tolist()

In [70]:
def save_output(output, path):
    """Write prediction rows to a CSV file at ``path``.

    Args:
        output: iterable of strings, one already comma-joined row per item.
        path: destination file path; an existing file is overwritten.

    A fixed header line is written first. Each row is then written on its own
    line: a newline is appended unless the row already ends with one, so rows
    are never run together on a single line.
    """
    # The context manager closes the file even if a write raises.
    with open(path, 'wt') as output_file:
        output_file.write('user_id, business_id, prediction\n')
        for line in output:
            output_file.write(line if line.endswith('\n') else line + '\n')
    return


# Persist the formatted prediction rows next to the notebook.
save_output(output, path='output2_2.csv')

In [None]:
def flatten(x):
    """Flatten a tuple by one level.

    Non-tuple inputs are returned unchanged. For a tuple, each element that is
    itself exactly a ``tuple`` is spliced into the result; every other element
    is kept as-is. Only one level is unpacked, so deeper nesting survives.
    """
    if type(x) is not tuple:
        return x
    return tuple(
        item
        for element in x
        for item in (element if type(element) is tuple else (element,))
    )
    
# NOTE(review): this cell was never executed (its marker is `In [None]`) and
# cannot run as written. business_rdd, checkin_rdd and user_rdd are not defined
# anywhere in the visible notebook — TODO confirm where they were meant to come from.
# Intended step, as far as the code shows: join businesses with their check-ins
# by key and flatten the joined value tuple by one level.
business_rdd = business_rdd.join(checkin_rdd).map(lambda x: (x[0], flatten(x[1])))
# NOTE(review): the next statement does not parse — `x[]` is an empty subscript,
# and the indented lines after it are a dangling continuation with no leading
# expression. train_data is also a pandas DataFrame at this point in the
# notebook, not an RDD. The intended join chain needs to be rewritten or removed.
user_rdd = user_rdd.join(train_data.map(lambda x: x[]))
    join(train_data).map(lambda x: (x[0], flatten(x[1]))).\
    map(lambda x: (x[1][-1], x[1]+(x[0],))).\
    join(user_rdd).map(lambda x: (x[0], flatten(x[1]))).take(3)