In [1]:
#Load libraries and functions
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import gc
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#Sparsity finction
def isSparse(array) :
    m, n = array.shape
    counter = 0
  
    # Count number of zeros
    # in the matrix
    for i in range(0,m) :
        for j in range(0,n) :
            if (array[i][j] == 0) :
                counter = counter + 1
  
    return (counter * 100 / (m * n))


In [2]:
#Load datasets
ratings_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/review_p.csv')
business_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/business_p.csv')
user_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/user_p.csv')

#Since user id is char datatype convert to numeric
business_pd['businessId'] = business_pd.index + 1
user_pd['userId'] = user_pd.index + 1

In [3]:
#apply numeric value to ratings dataset
ratings_pd = ratings_pd.merge(business_pd, how = 'left', left_on = 'business_id', right_on = 'business_id')
ratings_pd = ratings_pd.merge(user_pd, how = 'left', left_on = 'user_id', right_on = 'user_id')
ratings_pd = ratings_pd[['businessId', 'stars_x', 'userId']]
ratings_pd.columns = ['businessId', 'stars','userId']

In [4]:
#Entire dataset mean
rawmean = ratings_pd['stars'].mean()

#Raw mean for user - user bias
user_rawmean = pd.DataFrame(ratings_pd.groupby(['userId'])['stars'].mean() - rawmean)
user_rawmean.columns = ['userBias']
user_rawmean['userId'] = user_rawmean.index
user_rawmean = user_rawmean.reset_index(drop=True)

#Raw mean for business - business bias
business_rawmean = pd.DataFrame(ratings_pd.groupby(['businessId'])['stars'].mean() - rawmean)
business_rawmean.columns = ['businessBias']
business_rawmean['businessId'] = business_rawmean.index
business_rawmean = business_rawmean.reset_index(drop=True)

In [5]:
#pivot dataframe userId as columns and businessId as columns
ratings_df = ratings_pd.pivot(index = 'userId', columns ='businessId', values = 'stars').fillna(0)
ratings_df['userId'] = ratings_df.index

#Convert ratings dataframe to matrix
rating_matrix = ratings_df.values

#Check how sparse dataset is
sparsity = isSparse(rating_matrix)
print('Ratings Dataset is : {:.2f}% Sparse'.format(sparsity))

Ratings Dataset is : 99.82% Sparse


In [6]:
#Calculate user-business ratings
user_visits = ratings_pd.groupby(['userId']).size().sort_values(ascending=True)

user_visits.head(10)

userId
1        1
24724    1
24723    1
24721    1
24719    1
24718    1
24715    1
24713    1
24712    1
24711    1
dtype: int64

In [7]:
user_visits.tail(10)

userId
1874     192
13757    208
37600    236
20546    241
1908     245
15052    247
41339    336
23696    337
35390    365
32526    466
dtype: int64

We are working with very sparse dataset. One of the reason data is sparse could be once user finds resturants that cater according to their taste most of them stick with same resturant.

To overcome the problem, we will be using baseline prediction method using dataset average and user/business bias.

In [8]:
#Convert user X business matrix to rows format
#Each rows with have user, business, rating
predicted_ratings_CtoR = pd.melt(ratings_df, id_vars='userId')
predicted_ratings_CtoR.columns = ['userId', 'businessId','stars']
predicted_ratings_CtoR['businessId'] = predicted_ratings_CtoR['businessId'].astype(int)

#Attach baseline average ratings
predicted_ratings_CtoR = predicted_ratings_CtoR.merge(user_rawmean, how='left', left_on=['userId'], right_on = ['userId'])
predicted_ratings_CtoR = predicted_ratings_CtoR.merge(business_rawmean, how='left', left_on=['businessId'], right_on = ['businessId'])

#Update ratings for only missing values
predicted_ratings_CtoR['stars'] = np.where(predicted_ratings_CtoR['stars'] == 0, 
                      rawmean + predicted_ratings_CtoR['userBias'] + predicted_ratings_CtoR['businessBias'], 
                      predicted_ratings_CtoR['stars'])

#Since ratings are between 1 and 5 change accordingly
predicted_ratings_CtoR.loc[predicted_ratings_CtoR['stars'] < 1, 'stars'] = 1
predicted_ratings_CtoR.loc[predicted_ratings_CtoR['stars'] > 5, 'stars'] = 5

In [9]:
#Convert predicted_ratings_CtoR back to user X business matrix
ratings_df = predicted_ratings_CtoR.pivot(index = 'userId', columns ='businessId', values = 'stars').fillna(0)

#remove predicted_ratings_CtoR dataframe to free up memory
del [[predicted_ratings_CtoR]]
gc.collect()

63

In [10]:
#Compute SVD martix factorization
rating_matrix = ratings_df.values
U, sigma, Vt = np.linalg.svd(rating_matrix, full_matrices=False)

#Get diagonal sigma
sigma_diag = np.diag(sigma)

#Recalculate ratings
predicted_ratings = np.dot(np.dot(U, sigma_diag), Vt)

rmse = mean_squared_error(predicted_ratings, rating_matrix)**0.5
print('RMSE : {:.6f}'.format(rmse))

RMSE : 0.000000


In [11]:
#Lets loop through reducing k value
r, c = rating_matrix.shape
k = min(r, c)
for i in range(5, 40, 5):

    # take columns less than k from U
    U_p = U[:,:k]
    # take rows less than k from V
    V_p = Vt[:k,:]
    # build the new S matrix with top k diagnal elements
    S_p = np.zeros((k, k), int)
    for j in range(k):
        S_p[j][j] = sigma[j]
    
    #Recalculate ratings
    predicted_ratings = np.dot(np.dot(U_p, S_p), V_p)

    #Calculate error difference
    diffM = rating_matrix - predicted_ratings
    
    #Frobenius Norm
    frobeniusNorm = np.linalg.norm(diffM, 'fro')
    
    #Singular value ratio has to be 90%
    if (k == min(r, c)):
        sigma_ratio = round(sum(sigma**2)/sum(sigma**2),3)
    else:
        less_singular_values = sigma[ np.where( sigma >= i ) ]
        sigma_ratio = round(sum(less_singular_values**2)/sum(sigma**2),3)
    
    
    #RMSE
    rmse = mean_squared_error(predicted_ratings, rating_matrix)**0.5
    print("RMSE : " + str(round(rmse,3)) + 
          ' Frobenius Norm : ' + str(round(frobeniusNorm,3)) + 
          ' k-Value reduced by : ' + str(min(r, c) - k) + 
          ' Singlar Value Ratio : ' + str(sigma_ratio)
         )
    
    
    
    #Eliminate rows with low sigma value
    k = min(r, c) - sigma[ np.where( sigma < i ) ].size

RMSE : 0.003 Frobenius Norm : 26.901 k-Value reduced by : 0 Singlar Value Ratio : 1.0
RMSE : 0.011 Frobenius Norm : 103.281 k-Value reduced by : 1335 Singlar Value Ratio : 1.0
RMSE : 0.021 Frobenius Norm : 203.767 k-Value reduced by : 1931 Singlar Value Ratio : 1.0
RMSE : 0.027 Frobenius Norm : 263.275 k-Value reduced by : 2121 Singlar Value Ratio : 1.0
RMSE : 0.031 Frobenius Norm : 295.559 k-Value reduced by : 2183 Singlar Value Ratio : 1.0
RMSE : 0.032 Frobenius Norm : 305.027 k-Value reduced by : 2195 Singlar Value Ratio : 1.0
RMSE : 0.032 Frobenius Norm : 307.623 k-Value reduced by : 2197 Singlar Value Ratio : 1.0


Single Value Ratio is high even when we remove 1931 dimensions is because dataset has 42K business and little over 2K users rating them. Also, using baseline rating method has direct influence on it.

In [12]:
#Number of dimensions
k = min(r, c) - 1931
# take columns less than k from U
U_p = U[:,:k]
# take rows less than k from V
V_p = Vt[:k,:]
# build the new S matrix with top k diagnal elements
S_p = np.zeros((k, k), int)
for j in range(k):
    S_p[j][j] = sigma[j]

#Recalculate ratings
predicted_ratings = np.dot(np.dot(U_p, S_p), V_p) 

In [13]:
#Apply visited column to original rating dataframe
ratings_pd['visited'] = 1

In [14]:
#Apply visited column to predicted values dataset
predicted_ratings = pd.DataFrame(predicted_ratings)
predicted_ratings['userId'] = predicted_ratings.index + 1
predicted_ratings_CtoR = pd.melt(predicted_ratings, id_vars='userId')
predicted_ratings_CtoR.columns = ['userId', 'businessId','predict']
predicted_ratings_CtoR['businessId'] = predicted_ratings_CtoR['businessId'] + 1
predicted_ratings_CtoR['businessId'] = predicted_ratings_CtoR['businessId'].astype(int)

predicted_ratings_CtoR = predicted_ratings_CtoR.merge(ratings_pd, how='left', left_on=['userId','businessId'], right_on = ['userId','businessId'])
predicted_ratings_CtoR = predicted_ratings_CtoR[['businessId', 'userId', 'predict','visited']]

In [15]:
#Create user profile
#User 32526 has rated about 466 resturants
sample_data = predicted_ratings_CtoR[(predicted_ratings_CtoR['userId'] == 32526)]

#Fill non-visited resturants with zero value
sample_data = sample_data.fillna(0)

#Split user dataset into visited and non-visited datasets
sample_visited = sample_data[sample_data['visited'] ==1]
sample_not_visited = sample_data[sample_data['visited'] ==0]

#Select random non visited resturants
#if visited resturants are higher than non-visited use non-visited count else visited count 
random_count = sample_not_visited.shape[0] if sample_visited.shape[0] > sample_not_visited.shape[0] else sample_visited.shape[0]

#Get ramdom sample of non-visited resturants and split into 2 sets
random_not_visited = sample_not_visited.sample(n=random_count)
random_not_visited1, random_not_visited2 = train_test_split(random_not_visited, test_size =0.2, random_state=0)

#apply one set to visited resturants so we get good mix of visited and non-visited resturants in training dataset
train_data = sample_visited.append(random_not_visited1)
test_data = random_not_visited2

In [16]:
#Apply latent features to train and test datasets
#In this we will be using business features 'is_open','review_count','stars' - critic rating
train_data = pd.merge(train_data[['businessId','predict','visited']],
                     business_pd[['businessId','is_open','review_count','stars']],
                     how='left', on='businessId')
train_data = train_data.fillna(0)
rest_train_X = train_data[['predict','is_open','review_count','stars']]
rest_train_y = train_data[['visited']]

test_data = pd.merge(test_data[['businessId','predict','visited']],
                     business_pd[['businessId','is_open','review_count','stars']],
                     how='left', on='businessId')
test_data = test_data.fillna(0)
rest_test_X = test_data[['predict','is_open','review_count','stars']]
rest_test_y = test_data[['visited']]

In [17]:
train_data.sample(n=10)

Unnamed: 0,businessId,predict,visited,is_open,review_count,stars
119,562,3.934777,1.0,1,364,4.5
558,1798,2.514741,0.0,1,23,2.5
815,19,4.277595,0.0,0,30,4.0
36,181,1.949122,1.0,1,45,3.0
187,852,4.955508,1.0,1,57,4.0
727,38,4.388528,0.0,1,23,4.0
764,2050,3.279529,0.0,0,3,3.0
766,1516,4.598737,0.0,0,6,4.5
189,859,5.069806,1.0,1,115,3.5
167,786,2.890985,1.0,1,3,2.0


In [18]:
test_data.sample(n=10)

Unnamed: 0,businessId,predict,visited,is_open,review_count,stars
49,582,3.873024,0.0,1,70,3.5
88,815,3.207791,0.0,0,29,3.0
69,441,3.231107,0.0,1,38,3.0
50,1115,3.347618,0.0,1,37,3.0
4,265,4.998498,0.0,1,3,5.0
26,1066,2.707991,0.0,0,14,2.5
58,404,3.50889,0.0,1,4,3.0
5,389,2.221737,0.0,0,17,2.0
46,1544,2.473845,0.0,1,5,2.0
27,1880,3.283354,0.0,0,5,3.0


In [19]:
#Clean up dataframe it has 93+ million records
del [[predicted_ratings_CtoR]]
gc.collect()

56

In [20]:
#Build logit model
logit_model=sm.Logit(rest_train_y,rest_train_X)
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.645280
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                visited   No. Observations:                  838
Model:                          Logit   Df Residuals:                      834
Method:                           MLE   Df Model:                            3
Date:                Sun, 15 Jul 2018   Pseudo R-squ.:                 0.06051
Time:                        22:04:36   Log-Likelihood:                -540.74
converged:                       True   LL-Null:                       -575.57
                                        LLR p-value:                 5.050e-15
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
predict          0.4765      0.117      4.071      0.000       0.247       0.706
is_open         -0.3171

In [21]:
#Sperate dependent and independent variables
predictors = ['predict','is_open','review_count','stars']
outcome = ['visited']

#Using the model get predictions
logreg = LogisticRegression()
logreg.fit(train_data[predictors], train_data[outcome].values.ravel())

y_pred = logreg.predict(test_data[predictors])
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_data[predictors], test_data[outcome].values.ravel())))


Accuracy of logistic regression classifier on test set: 0.50


In [22]:
test_data['suggested'] = y_pred
test_data[test_data['is_open']==1].sample(n=10)

Unnamed: 0,businessId,predict,visited,is_open,review_count,stars,suggested
91,2139,3.679797,0.0,1,10,3.5,0.0
40,1441,2.48847,0.0,1,14,2.0,0.0
48,125,2.281263,0.0,1,4,2.0,0.0
19,623,3.912534,0.0,1,14,3.5,0.0
0,93,4.316055,0.0,1,78,4.0,1.0
7,1182,4.424754,0.0,1,32,4.0,0.0
65,160,3.796974,0.0,1,10,3.5,0.0
35,272,4.205046,0.0,1,28,4.0,0.0
76,869,3.941187,0.0,1,71,3.5,1.0
93,231,3.781956,0.0,1,6,3.5,0.0


In [23]:
#movies.shape
#print(estimated_ratings_test_T.head())
#print(ratings.head())
TP = 0 #True Positive
TN = 0 #True Negative
FP = 0 #False Positive
FN = 0 #False Negative
    
max_rate = test_data[test_data['is_open']==1].shape[0]
for i in range(max_rate):        
    #print(str(estimated_ratings_test_T.iloc[i,0]) + ' ' +str(ratings_test.iloc[i,2]))
    if test_data.iloc[i,1] >= 3.0: #and test_data.iloc[i,5] >= 3.0:
        test_data.loc[i,'star_suggest']  = 1
    elif  test_data.iloc[i,1] < 3.0: #and test_data.iloc[i,5] < 3.0:
        test_data.loc[i,'star_suggest']  = 0 

test_data.head()

Unnamed: 0,businessId,predict,visited,is_open,review_count,stars,suggested,star_suggest
0,93,4.316055,0.0,1,78,4.0,1.0,1.0
1,731,3.999715,0.0,0,144,4.0,1.0,1.0
2,402,4.42383,0.0,0,14,4.0,1.0,1.0
3,1384,4.494038,0.0,0,21,4.0,1.0,1.0
4,265,4.998498,0.0,1,3,5.0,0.0,1.0


In [24]:
for i in range(max_rate):
        
    #print(str(estimated_ratings_test_T.iloc[i,0]) + ' ' +str(ratings_test.iloc[i,2]))
    if test_data[test_data['is_open']==1].iloc[i,6] == 1 and test_data[test_data['is_open']==1].iloc[i,7] == 1:
        TP += 1
    elif  test_data[test_data['is_open']==1].iloc[i,6] == 0 and test_data[test_data['is_open']==1].iloc[i,7] == 0:
        TN += 1
    elif test_data[test_data['is_open']==1].iloc[i,6] == 0 and test_data[test_data['is_open']==1].iloc[i,7] == 1:
        FP += 1
    elif test_data[test_data['is_open']==1].iloc[i,6] == 1 and test_data[test_data['is_open']==1].iloc[i,7] == 0:
        FN += 1    
    

    
conf = np.array([[TP,FP],[FN,TN]])
print(conf)

[[17 22]
 [ 0  4]]


In [25]:
accuracy = (conf[0,0]+ conf[1,1])/test_data[test_data['is_open']==1].shape[0]
print("Accuracy is: " + str(accuracy))
precision = TP/(TP+FP)
print("Precision is: " + str(precision))
recall = TP/(TP+FN)
print("Recall is: " + str(recall))
F1 = (2*precision*recall)/(precision+recall)
print("F1 score is: " + str(F1))

Accuracy is: 0.3181818181818182
Precision is: 0.4358974358974359
Recall is: 1.0
F1 score is: 0.6071428571428571


#### References
- https://www.youtube.com/watch?v=eZZQ3UTn484
- https://www.youtube.com/watch?v=0-o9VgOxe9Y&list=PLuKhJYywjDe96T2L0-zXFU5Up2jqXlWI9&index=11
- http://dataaspirant.com/2017/04/15/implement-logistic-regression-model-python-binary-classification/
- https://www.geeksforgeeks.org/check-given-matrix-sparse-not/