In [1]:
#Load libraries and functions
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#Load datasets
ratings_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/review_p.csv')
business_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/business_p.csv')
user_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/user_p.csv')

business_pd['businessId'] = business_pd.index + 1
user_pd['userId'] = user_pd.index + 1

ratings_pd = ratings_pd.merge(business_pd, how = 'left', left_on = 'business_id', right_on = 'business_id')
ratings_pd = ratings_pd.merge(user_pd, how = 'left', left_on = 'user_id', right_on = 'user_id')
ratings_pd = ratings_pd[['businessId', 'stars_x', 'userId']]
ratings_pd.columns = ['businessId', 'stars','userId']
ratings_pd['visited'] = 1

ratings_df = ratings_pd.pivot(index = 'userId', columns ='businessId', values = 'stars').fillna(0)

rating_matrix = ratings_df.values
user_ratings_mean = np.mean(rating_matrix, axis = 1)
rating_bias = rating_matrix - user_ratings_mean.reshape(-1, 1)

U, sigma, Vt = np.linalg.svd(rating_bias, full_matrices=False)

#Get diagonal sigma
sigma_diag = np.diag(sigma)

#Recalculate ratings
predicted_ratings = np.dot(np.dot(U, sigma_diag), Vt) + user_ratings_mean.reshape(-1, 1)

rmse = mean_squared_error(predicted_ratings, rating_matrix)**0.5
print("RMSE : " + str(rmse))

#Lets loop through reducing k value
r, c = rating_bias.shape
k = min(r, c)
for i in range(5, 40, 5):

    # take columns less than k from U
    U_p = U[:,:k]
    # take rows less than k from V
    V_p = Vt[:k,:]
    # build the new S matrix with top k diagnal elements
    S_p = np.zeros((k, k), int)
    for j in range(k):
        S_p[j][j] = sigma[j]
    
    #Recalculate ratings
    predicted_ratings = np.dot(np.dot(U_p, S_p), V_p) + user_ratings_mean.reshape(-1, 1)

    #Calculate error difference
    diffM = rating_matrix - predicted_ratings
    
    #Frobenius Norm
    frobeniusNorm = np.linalg.norm(diffM, 'fro')
    
    #Singular value ratio has to be 90%
    if (k == min(r, c)):
        sigma_ratio = round(sum(sigma**2)/sum(sigma**2),3)
    else:
        less_singular_values = sigma[ np.where( sigma >= i ) ]
        sigma_ratio = round(sum(less_singular_values**2)/sum(sigma**2),3)
    
    
    #RMSE
    rmse = mean_squared_error(predicted_ratings, rating_matrix)**0.5
    print("RMSE : " + str(round(rmse,3)) + 
          ' Frobenius Norm : ' + str(round(frobeniusNorm,3)) + 
          ' k-Value reduced by : ' + str(min(r, c) - k) + 
          ' Singlar Value Ratio : ' + str(sigma_ratio)
         )
    
    
    
    #Eliminate rows with low sigma value
    k = min(r, c) - sigma[ np.where( sigma < i ) ].size



#Number of dimensions
k = min(r, c) - 1011
# take columns less than k from U
U_p = U[:,:k]
# take rows less than k from V
V_p = Vt[:k,:]
# build the new S matrix with top k diagnal elements
S_p = np.zeros((k, k), int)
for j in range(k):
    S_p[j][j] = sigma[j]

#Recalculate ratings
predicted_ratings = np.dot(np.dot(U_p, S_p), V_p) + user_ratings_mean.reshape(-1, 1)


RMSE : 6.073432698591637e-16
RMSE : 0.003 Frobenius Norm : 26.956 k-Value reduced by : 0 Singlar Value Ratio : 1.0
RMSE : 0.006 Frobenius Norm : 54.874 k-Value reduced by : 171 Singlar Value Ratio : 0.985
RMSE : 0.018 Frobenius Norm : 172.516 k-Value reduced by : 648 Singlar Value Ratio : 0.956
RMSE : 0.03 Frobenius Norm : 292.556 k-Value reduced by : 1011 Singlar Value Ratio : 0.912
RMSE : 0.043 Frobenius Norm : 413.046 k-Value reduced by : 1291 Singlar Value Ratio : 0.857
RMSE : 0.054 Frobenius Norm : 526.038 k-Value reduced by : 1501 Singlar Value Ratio : 0.795
RMSE : 0.065 Frobenius Norm : 630.098 k-Value reduced by : 1661 Singlar Value Ratio : 0.729


In [2]:
predicted_ratings = pd.DataFrame(predicted_ratings)
predicted_ratings['userId'] = predicted_ratings.index + 1
predicted_ratings_CtoR = pd.melt(predicted_ratings, id_vars='userId')
predicted_ratings_CtoR.columns = ['userId', 'businessId','predict']
predicted_ratings_CtoR['businessId'] = predicted_ratings_CtoR['businessId'] + 1
predicted_ratings_CtoR['businessId'] = predicted_ratings_CtoR['businessId'].astype(int)

predicted_ratings_CtoR = predicted_ratings_CtoR.merge(ratings_pd, how='left', left_on=['userId','businessId'], right_on = ['userId','businessId'])
predicted_ratings_CtoR = predicted_ratings_CtoR[['businessId', 'userId', 'predict','visited']]

sample_data = predicted_ratings_CtoR[(predicted_ratings_CtoR['userId'] <= 3000)]

train_data, test_data = train_test_split(sample_data, test_size =0.3, random_state=0)

In [3]:
train_data = pd.merge(train_data[['businessId','predict','visited']],
                     business_pd[['businessId','is_open','review_count','stars']],
                     how='left', on='businessId')
train_data = train_data.fillna(0)
rest_train_X = train_data[['predict','is_open','review_count','stars']]
rest_train_y = train_data[['visited']]

test_data = pd.merge(test_data[['businessId','predict','visited']],
                     business_pd[['businessId','is_open','review_count','stars']],
                     how='left', on='businessId')
test_data = test_data.fillna(0)
rest_test_X = test_data[['predict','is_open','review_count','stars']]
rest_test_y = test_data[['visited']]

In [4]:
import gc
del [[predicted_ratings_CtoR]]
gc.collect()


42

In [5]:
import statsmodels.api as sm
logit_model=sm.Logit(rest_train_y,rest_train_X)
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.000533
         Iterations 17
                           Logit Regression Results                           
Dep. Variable:                visited   No. Observations:              4634700
Model:                          Logit   Df Residuals:                  4634696
Method:                           MLE   Df Model:                            3
Date:                Sat, 14 Jul 2018   Pseudo R-squ.:                  0.9494
Time:                        09:55:07   Log-Likelihood:                -2471.1
converged:                       True   LL-Null:                       -48862.
                                        LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
predict         21.6215      0.318     67.913      0.000      20.997      22.245
is_open         -1.712

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
#X = rest_train_X.values
#y = rest_train_y.values.ravel()

predictors = ['predict','is_open','stars']
outcome = ['visited']

logreg = LogisticRegression()
logreg.fit(train_data[predictors], train_data[outcome].values.ravel())


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
y_pred = logreg.predict(test_data[predictors])
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_data[predictors], test_data[outcome].values.ravel())))

Accuracy of logistic regression classifier on test set: 1.00


In [9]:
#Number of dimensions
k = min(r, c) - 1291
# take columns less than k from U
U_p = U[:,:k]
# take rows less than k from V
V_p = Vt[:k,:]
# build the new S matrix with top k diagnal elements
S_p = np.zeros((k, k), int)
for j in range(k):
    S_p[j][j] = sigma[j]

#Recalculate ratings
predicted_ratings = np.dot(np.dot(U_p, S_p), V_p) + user_ratings_mean.reshape(-1, 1)


predicted_ratings = pd.DataFrame(predicted_ratings)
predicted_ratings['userId'] = predicted_ratings.index + 1
predicted_ratings_CtoR = pd.melt(predicted_ratings, id_vars='userId')
predicted_ratings_CtoR.columns = ['userId', 'businessId','predict']
predicted_ratings_CtoR['businessId'] = predicted_ratings_CtoR['businessId'] + 1
predicted_ratings_CtoR['businessId'] = predicted_ratings_CtoR['businessId'].astype(int)

predicted_ratings_CtoR = predicted_ratings_CtoR.merge(ratings_pd, how='left', left_on=['userId','businessId'], right_on = ['userId','businessId'])
predicted_ratings_CtoR = predicted_ratings_CtoR[['businessId', 'userId', 'predict','visited']]

sample_data = predicted_ratings_CtoR[(predicted_ratings_CtoR['userId'] <= 3000)]

train_data, test_data = train_test_split(sample_data, test_size =0.3, random_state=0)

train_data = pd.merge(train_data[['businessId','predict','visited']],
                     business_pd[['businessId','is_open','review_count','stars']],
                     how='left', on='businessId')
train_data = train_data.fillna(0)
rest_train_X = train_data[['predict','is_open','review_count','stars']]
rest_train_y = train_data[['visited']]

test_data = pd.merge(test_data[['businessId','predict','visited']],
                     business_pd[['businessId','is_open','review_count','stars']],
                     how='left', on='businessId')
test_data = test_data.fillna(0)
rest_test_X = test_data[['predict','is_open','review_count','stars']]
rest_test_y = test_data[['visited']]

import gc
del [[predicted_ratings_CtoR]]
gc.collect()


98

In [10]:
import statsmodels.api as sm
logit_model=sm.Logit(rest_train_y,rest_train_X)
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.000885
         Iterations 17
                           Logit Regression Results                           
Dep. Variable:                visited   No. Observations:              4634700
Model:                          Logit   Df Residuals:                  4634696
Method:                           MLE   Df Model:                            3
Date:                Sat, 14 Jul 2018   Pseudo R-squ.:                  0.9161
Time:                        11:27:42   Log-Likelihood:                -4101.0
converged:                       True   LL-Null:                       -48862.
                                        LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
predict         23.0763      0.287     80.362      0.000      22.513      23.639
is_open         -1.624

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
#X = rest_train_X.values
#y = rest_train_y.values.ravel()

predictors = ['predict','is_open','stars']
outcome = ['visited']

logreg = LogisticRegression()
logreg.fit(train_data[predictors], train_data[outcome].values.ravel())


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
y_pred = logreg.predict(test_data[predictors])
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_data[predictors], test_data[outcome].values.ravel())))

Accuracy of logistic regression classifier on test set: 1.00
