In [2]:
import numpy as np
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
#from pyspark import SparkSession
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating 
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import gc
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
#%matplotlib inline
#conf = SparkConf().setAppName('ListenerSummarizer')
#sc = SparkContext.getOrCreate(conf=conf)

#sqlContext = SQLContext(sc)



# Build (or reuse) the Spark session, pinned to a local master that uses every core.
# The master has to be set on the builder: once a session exists, a SparkConf handed
# to SparkContext.getOrCreate() is ignored because the running context is returned.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Reuse the session's own context rather than requesting a second one.
sc = spark.sparkContext
# Legacy SQL entry point; a later cell calls sqlContext.createDataFrame(...).
sqlContext = SQLContext(sc)

#Sparsity function
def isSparse(array) :
    """Return the percentage (0-100) of entries in a 2-D array that equal zero.

    Parameters
    ----------
    array : array-like
        Two-dimensional data (anything ``numpy.asarray`` turns into a 2-D array).

    Returns
    -------
    float
        ``100 * zero_entries / total_entries``. An array with no entries
        yields ``0.0`` instead of dividing by zero.

    Raises
    ------
    ValueError
        If the input is not two-dimensional (shape cannot unpack into m, n).
    """
    matrix = np.asarray(array)
    m, n = matrix.shape  # unpacking keeps the original "2-D only" contract
    total = m * n
    if total == 0:
        return 0.0

    # Vectorised count of zero cells; avoids an m*n Python-level loop.
    zero_count = int(np.count_nonzero(matrix == 0))
    return zero_count * 100 / total

In [3]:
# Download the business, user and review tables from the project repository.
business_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/business_p.csv')
user_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/user_p.csv')
ratings_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/review_p.csv')

# Give every business and user an integer key: its row index plus one.
business_pd = business_pd.assign(businessId=business_pd.index + 1)
user_pd = user_pd.assign(userId=user_pd.index + 1)

In [4]:
# Swap the string keys on each review for the integer businessId / userId keys.
# Left joins keep every review row. The review's own rating is selected as
# 'stars_x' (pandas' suffix for a left-hand column whose name clashes) and is
# renamed back to 'stars'.
ratings_pd = (
    ratings_pd
    .merge(business_pd, how='left', on='business_id')
    .merge(user_pd, how='left', on='user_id')
    [['businessId', 'userId', 'stars_x']]
    .rename(columns={'stars_x': 'stars'})
)

In [5]:
# Convert the pandas ratings frame (businessId, userId, stars) to a Spark DataFrame
# and print its first rows as a quick sanity check.
ratings_sp_df = sqlContext.createDataFrame(ratings_pd)
ratings_sp_df.show()

In [6]:
# Expose the Spark DataFrame as an RDD of rows (the mllib ALS API used below takes
# an RDD) and print the first row to confirm the field order.
ratings_rdd  = ratings_sp_df.rdd
print(ratings_rdd.first())

In [7]:
#Generate datasets
# Split the ratings into train / validation / test with 0.6 / 0.2 / 0.2 weights.
# A fixed seed keeps the split -- and every RMSE computed from it -- reproducible
# across kernel restarts; without it each run evaluates on different rows.
SPLIT_SEED = 42
rating_train, rating_val, rating_test = ratings_rdd.randomSplit([0.6, 0.2, 0.2], seed=SPLIT_SEED)

# Mark each split for caching so repeated actions reuse it instead of recomputing
# (cache() is lazy: the data is stored when the first action runs).
rating_train.cache()
rating_val.cache()
rating_test.cache()

#Sample results
print('Train set')
rating_train.take(5)

In [8]:
# Drop the rating from the test and validation rows, keeping only the two id
# fields (row[0], row[1]) that are later passed to model.predictAll.
rating_test_no_rate = rating_test.map(lambda x: (x[0], x[1]))
rating_val_no_rate = rating_val.map(lambda x: (x[0], x[1]))

# Mark both id-only RDDs for caching (lazy until the first action).
rating_test_no_rate.cache()
rating_val_no_rate.cache()

#Sample results
# The rows shown come from the test split, so label them as such
# (the label previously read 'Train set').
print('Test set')
rating_test_no_rate.take(5)

In [9]:
# Training the model: grid-search ALS rank / iteration count on the validation split
# and remember the combination with the lowest RMSE.
import math

# Start from +inf so the first model evaluated always becomes the incumbent.
# (Starting from 0 meant `error < min_error` could never be true.)
min_error = float('inf')
best_rank = -1
best_iterations = -1

# NOTE(review): ALS.train reads each row positionally as (user, product, rating),
# while the rows here are (businessId, userId, stars), so businesses occupy the
# "user" slot. Training and prediction use the same order, so the RMSE below is
# unaffected -- confirm the orientation before using any recommend* helper.
for rank in range(5, 6):
    for iterations in range(20, 25):
        #generate model
        model = ALS.train(rating_train, rank=rank, iterations=iterations)

        #generate predictions, keyed by the (id, id) pair
        predictions = model.predictAll(rating_val_no_rate).map(lambda r: ((r[0], r[1]), r[2]))

        #get actual vs predictions: join on the id pair -> (key, (actual, predicted))
        rates_and_preds = rating_val.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)

        #calculate error (root mean squared error over the joined pairs)
        error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

        #display output
        print('For rank %s and %s the RMSE is %s' % (rank, iterations, error))

        # Update the running minimum only when this model actually improves on it.
        if error < min_error:
            min_error = error
            best_rank = rank
            best_iterations = iterations

print('The best model was trained with rank %s' % best_rank)
print('Best iterations: %s (validation RMSE %s)' % (best_iterations, min_error))


In [10]:
# Download the raw tables again: ratings_pd was reduced to three columns by the
# earlier merge cell, so the pandas analysis below starts from fresh copies.
business_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/business_p.csv')
user_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/user_p.csv')
ratings_pd = pd.read_csv('https://raw.githubusercontent.com/ncooper76/DATA_643_Group/master/Final_Project/review_p.csv')

# Give every business and user an integer key: its row index plus one.
business_pd = business_pd.assign(businessId=business_pd.index + 1)
user_pd = user_pd.assign(userId=user_pd.index + 1)

In [11]:
# Swap the string keys on each review for the integer businessId / userId keys.
# Left joins keep every review row. The review's own rating is selected as
# 'stars_x' and renamed back to 'stars'.
ratings_pd = (
    ratings_pd
    .merge(business_pd, how='left', on='business_id')
    .merge(user_pd, how='left', on='user_id')
    [['businessId', 'stars_x', 'userId']]
    .rename(columns={'stars_x': 'stars'})
)

In [12]:
# Count the ratings submitted by each user, ordered from fewest to most.
user_visits = ratings_pd.groupby('userId').size().sort_values()

user_visits.head(10)

In [13]:
# user_visits is sorted ascending, so the tail shows the ten users with the most ratings.
user_visits.tail(10)

In [14]:
# Reshape the per-user counts into a frame with one row per user:
# columns are ratingCount (ratings submitted) and userId.
user_visits = (
    user_visits
    .rename('ratingCount')
    .reset_index()
    [['ratingCount', 'userId']]
)

# Distribution of activity: for each ratingCount, how many users have exactly that
# many ratings ('users'), listed with the most common counts first.
rating_visits = (
    user_visits
    .groupby('ratingCount')
    .size()
    .sort_values()
    .reset_index(name='users')
    [['users', 'ratingCount']]
    .sort_values(by='users', ascending=False)
)
rating_visits.head(20)

In [15]:
# Count the ratings received by each business, ordered from fewest to most.
business_visits = ratings_pd.groupby('businessId').size().sort_values()

business_visits.head(10)

In [16]:
# business_visits is sorted ascending, so the tail shows the ten most-rated businesses.
business_visits.tail(10)

In [17]:
# Pivot to a user x business matrix: one row per userId, one column per businessId,
# with 0 standing in for "no rating".
ratings_df = ratings_pd.pivot(index = 'userId', columns ='businessId', values = 'stars').fillna(0)

# Check how sparse the dataset is. This is measured on the ratings alone, before
# the userId helper column is added: that column is index + 1 and therefore never
# zero, so including it would understate the sparsity.
sparsity = isSparse(ratings_df.values)

# Keep userId as a regular column too; it is the id_vars key when the frame is melted.
ratings_df['userId'] = ratings_df.index

#Convert ratings dataframe to matrix
# NOTE: unchanged from before, rating_matrix carries userId as its last column.
rating_matrix = ratings_df.values

print('Ratings Dataset is : {:.2f}% Sparse'.format(sparsity))

In [18]:
# Global average rating over every review.
rawmean = ratings_pd['stars'].mean()

# User bias: each user's average rating minus the global average.
user_rawmean = (
    ratings_pd.groupby('userId')['stars'].mean()
    .sub(rawmean)
    .rename('userBias')
    .reset_index()
    [['userBias', 'userId']]
)

# Business bias: each business's average rating minus the global average.
business_rawmean = (
    ratings_pd.groupby('businessId')['stars'].mean()
    .sub(rawmean)
    .rename('businessBias')
    .reset_index()
    [['businessBias', 'businessId']]
)

In [19]:
# Unpivot the user x business matrix to long form: one (userId, businessId, stars)
# row per matrix cell.
predicted_ratings_CtoR = pd.melt(ratings_df, id_vars='userId',
                                 var_name='businessId', value_name='stars')
predicted_ratings_CtoR['businessId'] = predicted_ratings_CtoR['businessId'].astype(int)

# Bring in the per-user and per-business bias for every cell.
predicted_ratings_CtoR = (
    predicted_ratings_CtoR
    .merge(user_rawmean, how='left', on='userId')
    .merge(business_rawmean, how='left', on='businessId')
)

# Cells still at 0 (no rating) get the baseline estimate
# global mean + user bias + business bias; existing ratings are left untouched.
predicted_ratings_CtoR['stars'] = np.where(
    predicted_ratings_CtoR['stars'].eq(0),
    rawmean + predicted_ratings_CtoR['userBias'] + predicted_ratings_CtoR['businessBias'],
    predicted_ratings_CtoR['stars'],
)

# Ratings live on a 1-5 scale, so clamp any estimate that falls outside it.
predicted_ratings_CtoR['stars'] = predicted_ratings_CtoR['stars'].clip(lower=1, upper=5)