In [12]:
#Load libraries and functions
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Fetch the review, business and user tables from the project repository.
ratings_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/review_p.csv')
business_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/business_p.csv')
user_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/user_p.csv')

# Give every business and every user an integer surrogate key equal to its
# row index plus one.
business_pd = business_pd.assign(businessId=business_pd.index + 1)
user_pd = user_pd.assign(userId=user_pd.index + 1)

# Attach both surrogate keys to each review, then reduce the table to the
# rating triple plus a constant 'visited' marker.
# 'stars_x' is the left-hand (review-side) 'stars' column: merge adds the
# '_x' suffix when a joined table carries a column of the same name.
ratings_pd = (
    ratings_pd
    .merge(business_pd, how='left', on='business_id')
    .merge(user_pd, how='left', on='user_id')
    .loc[:, ['businessId', 'stars_x', 'userId']]
    .rename(columns={'stars_x': 'stars'})
    .assign(visited=1)
)

# Wide user x business table of star ratings; (user, business) pairs that
# have no review are filled with 0.
ratings_df = ratings_pd.pivot(index='userId', columns='businessId', values='stars').fillna(0)

# Centre every user's row on that user's mean rating. The mean is taken over
# the whole row, so the zero-filled (unreviewed) cells contribute to it.
rating_matrix = ratings_df.values
user_ratings_mean = rating_matrix.mean(axis=1)
rating_bias = rating_matrix - user_ratings_mean[:, np.newaxis]

# Reduced ("thin") SVD of the mean-centred rating matrix.
U, sigma, Vt = np.linalg.svd(rating_bias, full_matrices=False)

# Singular values laid out as a square diagonal matrix.
sigma_diag = np.diag(sigma)

# Multiply the factors back together and restore each user's mean.
# NOTE(review): every singular value is kept, so this product reproduces
# rating_bias up to floating-point rounding and predicted_ratings is
# essentially the original rating matrix. If a low-rank approximation was
# intended, only the leading components should be kept -- confirm.
predicted_ratings = U @ sigma_diag @ Vt + user_ratings_mean[:, np.newaxis]
# Reshape the predicted rating matrix from wide (user x business) to long
# (one row per user/business pair) with columns userId, businessId, predict.
#
# predicted_ratings has the same shape and row/column order as ratings_df
# (it is computed from ratings_df.values), so the pivot's index and columns
# hold the real userId / businessId of every row and column. Rebuilding the
# IDs from array position (position + 1) is only correct when every ID from
# 1..n occurs in the ratings; any user or business without a review would
# shift all following labels and make the later join on (userId, businessId)
# attach data to the wrong pairs.
predicted_ratings = pd.DataFrame(predicted_ratings, columns=ratings_df.columns.to_numpy())
predicted_ratings['userId'] = ratings_df.index.to_numpy()

predicted_ratings_CtoR = pd.melt(
    predicted_ratings,
    id_vars='userId',
    var_name='businessId',
    value_name='predict',
)

# The frame's column labels mix business IDs with the string 'userId', so
# melt yields businessId as generic objects; cast it back to the dtype of the
# pivot's column labels so it matches the key it will be joined against.
predicted_ratings_CtoR['businessId'] = predicted_ratings_CtoR['businessId'].astype(ratings_df.columns.dtype)

# Left-join the observed reviews onto every predicted (user, business) pair
# and keep only the columns needed downstream. Pairs that do not occur in
# ratings_pd end up with a missing 'visited' value.
predicted_ratings_CtoR = predicted_ratings_CtoR.merge(
    ratings_pd,
    how='left',
    on=['userId', 'businessId'],
)[['businessId', 'userId', 'predict', 'visited']]

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
rest_train, rest_test = train_test_split(
    predicted_ratings_CtoR,
    test_size=0.2,
    random_state=42,
)


In [15]:
# Add the business attributes to every training row (left join on businessId).
# The result is written back to rest_train, so running this cell a second
# time merges the business columns again.
rest_train = rest_train.merge(
    business_pd,
    how='left',
    on='businessId',
)

In [16]:
# Add the business attributes to every test row (left join on businessId).
# The result is written back to rest_test, so running this cell a second
# time merges the business columns again.
rest_test = rest_test.merge(
    business_pd,
    how='left',
    on='businessId',
)