In [1]:
import numpy as np
import pandas as pd
import time as tm
import datetime as dt
import matplotlib.pyplot as plt
from collections import defaultdict
import csv
import ast
import json
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.knns import KNNWithMeans
from surprise.prediction_algorithms.knns import KNNWithZScore
from surprise.prediction_algorithms.knns import KNNBaseline
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.prediction_algorithms.matrix_factorization import SVDpp
from surprise.prediction_algorithms.matrix_factorization import NMF
from surprise.prediction_algorithms.slope_one import SlopeOne
from surprise.prediction_algorithms.co_clustering import CoClustering
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection.search import GridSearchCV

reviewsPath = 'data/reviews_ssc.csv'
df_reviews = pd.read_csv(reviewsPath, sep=',')
df_reviews['unixReviewTime'] = pd.to_numeric(df_reviews['unixReviewTime'], errors='coerce')
#print(df_reviews.head())

userLocationPath = 'data/user_locations.csv'
df_userLocations = pd.read_csv(userLocationPath, sep=',')
#print(df_userLocations.head())

userReviewsPath = 'data/filtered_reviews_by_user.csv'
df_userReviews = pd.read_csv(userReviewsPath, sep=',')
#print(df_userReviews.head())

print('done')

done


In [2]:
grouped = df_reviews.groupby('gPlusUserId', as_index=False)['rating']
means = grouped.mean()
mins = grouped.min()
maxs = grouped.max()

meanTime = df_reviews['unixReviewTime'].mean()
maxTime = df_reviews['unixReviewTime'].max()
minTime = df_reviews['unixReviewTime'].min()
#print(means[means['gPlusUserId'] == '100003840837471130074']['rating'])

meanRating = []
timeRating = []
meanTimeRating = []
diff = float(maxTime - minTime)
for i in range(df_reviews.shape[0]):
    a = float(df_reviews['unixReviewTime'][i]) if pd.notna(df_reviews['unixReviewTime'][i]) else maxTime
    b = (a - minTime) / diff
    mn = means[means['gPlusUserId'] == df_reviews['gPlusUserId'][i]]['rating']
    d = float(df_reviews['rating'][i] - mn)
    
    meanRating.append(d)
    timeRating.append(df_reviews['rating'][i] * b)
    if d >= 0 or b == 0.0:
        meanTimeRating.append(d * b)
    else:
        meanTimeRating.append(d / b)

print('done')

done


In [25]:
# raw ratings
reader = Reader(rating_scale=(1,5), skip_lines=1)
reviewsData = Dataset.load_from_df(df_reviews[['gPlusUserId', 'gPlusPlaceId', 'rating']], reader)
trainset, testset = train_test_split(reviewsData, test_size=.25)

algo = SVD(n_factors=200,n_epochs=50,lr_bu=0.005,lr_bi=0.005,lr_pu=0.005,
           lr_qi=0.001,reg_bu=0.05,reg_bi=0.02,reg_pu=0.05,reg_qi=0.05)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

print('done')

RMSE: 1.0093
done


In [26]:
# normalized by timestamp
df_reviews['timeRating'] = timeRating
#print(df_reviews['timeRating'])
reader = Reader(rating_scale=(0,5), skip_lines=1)
reviewsData = Dataset.load_from_df(df_reviews[['gPlusUserId', 'gPlusPlaceId', 'timeRating']], reader)
trainset, testset = train_test_split(reviewsData, test_size=.25)

algo = SVD(n_factors=200,n_epochs=50,lr_bu=0.005,lr_bi=0.005,lr_pu=0.005,
           lr_qi=0.001,reg_bu=0.05,reg_bi=0.02,reg_pu=0.05,reg_qi=0.05)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

print('done')

RMSE: 0.9574
done


In [27]:
# normalized by user mean
df_reviews['meanRating'] = meanRating
reader = Reader(rating_scale=(-4,5), skip_lines=1)
reviewsData = Dataset.load_from_df(df_reviews[['gPlusUserId', 'gPlusPlaceId', 'meanRating']], reader)
trainset, testset = train_test_split(reviewsData, test_size=.25)

algo = SVD(n_factors=200,n_epochs=50,lr_bu=0.005,lr_bi=0.005,lr_pu=0.005,
           lr_qi=0.001,reg_bu=0.05,reg_bi=0.02,reg_pu=0.05,reg_qi=0.05)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

print('done')

RMSE: 0.7507
done


In [28]:
# normalized by user mean and timestamp
df_reviews['meanTimeRating'] = meanTimeRating
reader = Reader(rating_scale=(-4,5), skip_lines=1)
reviewsData = Dataset.load_from_df(df_reviews[['gPlusUserId', 'gPlusPlaceId', 'meanTimeRating']], reader)
trainset, testset = train_test_split(reviewsData, test_size=.25)

algo = SVD(n_factors=200,n_epochs=50,lr_bu=0.005,lr_bi=0.005,lr_pu=0.005,
           lr_qi=0.001,reg_bu=0.05,reg_bi=0.02,reg_pu=0.05,reg_qi=0.05)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

print('done')

RMSE: 0.7566
done


In [29]:
def get_top_n(df,tr,n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))'''
    
    header = ['uid']
    # assumes i < 1000
    for i in range(1,n+1):
        header.append('iid{:03d}'.format(i))
    csv.register_dialect('myDialect', delimiter = ',', quoting=csv.QUOTE_NONE, skipinitialspace=True)
    baseFilename = 'top{}'.format(n)
    timeString = dt.datetime.fromtimestamp(tm.time()).strftime('_%y%m%d_%H%M%S')
    
    with open(baseFilename+timeString+'.csv', 'w') as f:
        writer = csv.writer(f, dialect='myDialect')
        writer.writerow(header)

        print('fitting model...')
        algo = SVD(n_factors=200,n_epochs=50,lr_bu=0.005,lr_bi=0.005,lr_pu=0.005,
           lr_qi=0.001,reg_bu=0.05,reg_bi=0.02,reg_pu=0.05,reg_qi=0.05)
        algo.fit(tr)
        
        print('filling csv...')
        usersRemaining = tr.n_users
        for u in tr.all_users():
            predictedRatings = []
            rawUid = tr.to_raw_uid(u)
            userRatings = df.loc[df['gPlusUserId'] == rawUid]['gPlusPlaceId'].tolist()
            for i in tr.all_items():
                rawIid = tr.to_raw_iid(i)
                if rawIid not in userRatings:
                    p = algo.predict(rawUid,rawIid)
                    predictedRatings.append((p.iid, p.est))
            
            predictedRatings.sort(key=lambda x: x[1], reverse=True)
            writer.writerow([rawUid]+[iid for (iid, _) in predictedRatings[:n]])
                    
            usersRemaining -= 1
            if usersRemaining % 200 == 0:
                print(usersRemaining)
            #print(ps)
        
        '''
        togo = len(top_n.items())
        for uid, ratings in top_n.items():
            ratings.sort(key=lambda x: x[1], reverse=True)
            #top_n[uid] = user_ratings[:n]
            writer.writerow([uid]+[iid for (iid, _) in ratings[:n]])
            togo -= 1
            print('{} {}'.format(togo, uid))'''
    
    f.close()

def writeCsv(resultsList, baseFilename):
  newList = [['uid','iid01','iid02','iid03','iid04','iid05','iid06','iid07','iid08','iid09','iid10']] + [[uid]+[iid for (iid, _) in ratings] for uid,ratings in resultsList]
  csv.register_dialect('myDialect', delimiter = ',', quoting=csv.QUOTE_NONE, skipinitialspace=True)
  timeString = dt.datetime.fromtimestamp(tm.time()).strftime('_%y%m%d_%H%M%S')

  with open(baseFilename+timeString+'.csv', 'w') as f:
    writer = csv.writer(f, dialect='myDialect')
    for row in newList:
      writer.writerow(row)

  f.close()

print('done')

done


In [30]:
reader = Reader(rating_scale=(1,5), skip_lines=1)
useCols = df_reviews[['gPlusUserId', 'gPlusPlaceId', 'rating']]
reviewsData = Dataset.load_from_df(useCols, reader)
trainset = reviewsData.build_full_trainset()
get_top_n(useCols, trainset, n=20)

reader = Reader(rating_scale=(0,5), skip_lines=1)
useCols = df_reviews[['gPlusUserId', 'gPlusPlaceId', 'timeRating']]
reviewsData = Dataset.load_from_df(useCols, reader)
trainset = reviewsData.build_full_trainset()
get_top_n(useCols, trainset, n=20)

reader = Reader(rating_scale=(-4,5), skip_lines=1)
useCols = df_reviews[['gPlusUserId', 'gPlusPlaceId', 'meanRating']]
reviewsData = Dataset.load_from_df(useCols, reader)
trainset = reviewsData.build_full_trainset()
get_top_n(useCols, trainset, n=20)

reader = Reader(rating_scale=(-4,5), skip_lines=1)
useCols = df_reviews[['gPlusUserId', 'gPlusPlaceId', 'meanTimeRating']]
reviewsData = Dataset.load_from_df(useCols, reader)
trainset = reviewsData.build_full_trainset()
get_top_n(useCols, trainset, n=20)

print('done')

fitting model...
filling csv...
26200
26000
25800
25600
25400
25200
25000
24800
24600
24400
24200
24000
23800
23600
23400
23200
23000
22800
22600
22400
22200
22000
21800
21600
21400
21200
21000
20800
20600
20400
20200
20000
19800
19600
19400
19200
19000
18800
18600
18400
18200
18000
17800
17600
17400
17200
17000
16800
16600
16400
16200
16000
15800
15600
15400
15200
15000
14800
14600
14400
14200
14000
13800
13600
13400
13200
13000
12800
12600
12400
12200
12000
11800
11600
11400
11200
11000
10800
10600
10400
10200
10000
9800
9600
9400
9200
9000
8800
8600
8400
8200
8000
7800
7600
7400
7200
7000
6800
6600
6400
6200
6000
5800
5600
5400
5200
5000
4800
4600
4400
4200
4000
3800
3600
3400
3200
3000
2800
2600
2400
2200
2000
1800
1600
1400
1200
1000
800
600
400
200
0
fitting model...
filling csv...
26200
26000
25800
25600
25400
25200
25000
24800
24600
24400
24200
24000
23800
23600
23400
23200
23000
22800
22600
22400
22200
22000
21800
21600
21400
21200
21000
20800
20600
20400
20200
20000
19800
196

In [22]:
print('done')

done
