In [1]:
# Import libraries
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

import optuna


from utilities import data_basic_utility as databasic

## File Details

Basic run with SVD without cross validation. This can be the basis for parameter tuning and other stuff later


# Assignment 3 -- Recommendation Systems

* The final challenge is much like Assignment 2 -- but scoped appropriately for the time and your current abilities.
* It is ratings prediction, just like the movielens recommendations we have seen and many other similar problems.
* The features created are based on Beer Reviews from experts on a website.
* Each beer has been scored between 0 and 5 (on a real scale, so 2.75 or 3.5 is OK).
* The official measure is Mean Average Error (MAE) which is pretty intuitive to work with. Everything supports is and it is easy to interpret.
* A set of features have been created based on the reviewer, the written review, and information about the Beer being reviewed.
* Not all features have to be used, and you can easily create new features using the data if you like.
* The features included are:

![title](Images/A3Features.png)

* Sizes of the files are:
|Size | File|
|---|---|
| 1.9G | features.tsv |
| 88B  | header-features.tsv|
| 48B  | header.tsv |
| 15M  | test.tsv |
| 50M  | train.tsv |
| 16M |  val.tsv |


In [2]:
filePrefix = "A3_075_surprise_knnmeans_tuning"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
seed = databasic.get_random_seed()

In [3]:
# RowID  BeerID  ReviewerID  BeerName  BeerType  Label
#df_train = pd.read_csv(baseDataDir + 'train_wk12.tsv',sep='\t',
df_train = pd.read_csv(baseDataDir + 'train.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_train.head(10)

#df_vali = pd.read_csv(baseDataDir + 'val_wk12.tsv',sep='\t',
df_vali = pd.read_csv(baseDataDir + 'val.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_vali.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,22,12300,2634,Rauch Ür Bock,Rauchbier,4.0
1,27,12300,5634,Rauch Ür Bock,Rauchbier,4.5
2,28,12300,3544,Rauch Ür Bock,Rauchbier,4.5
3,40,12300,6521,Rauch Ür Bock,Rauchbier,4.0
4,43,12300,10177,Rauch Ür Bock,Rauchbier,4.5
5,48,12300,2907,Rauch Ür Bock,Rauchbier,3.5
6,49,12300,1532,Rauch Ür Bock,Rauchbier,4.0
7,50,12300,3452,Rauch Ür Bock,Rauchbier,3.5
8,59,12300,6861,Rauch Ür Bock,Rauchbier,4.0
9,64,6699,6401,Caldera Pale Ale,American Pale Ale (APA),4.5


Column List: 
RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag



In [4]:
# # df_features = pd.read_csv(baseDataDir + 'features-top500.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
# df_features = pd.read_csv(baseDataDir + 'features_200k.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
#                                                                  'DayofMonth','Year','TimeOfDay','Gender',
#                                                                  'Birthday','Text','Lemmatized','POS_Tag'])

# df_features.head(10)

In [5]:
idCols = ['RowID','BeerID','ReviewerID']

# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)
dfValiIds = df_vali[idCols]
dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dfTrainFeatures.head()

Unnamed: 0,BeerID,ReviewerID,rating
0,12300,10635,4.0
1,12300,6547,4.5
2,12300,9789,4.5
3,12300,7372,5.0
4,12300,1302,4.5


In [6]:

reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID',
                                    'rating']],reader)

dsetValiFeatures = Dataset.load_from_df(dfValiFeatures[['BeerID','ReviewerID',
                                     'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

print(type(dsetTrainFeatures))
print(type(trainsetTrainFeatures))
trainsetTrainFeatures

<class 'surprise.dataset.DatasetAutoFolds'>
<class 'surprise.trainset.Trainset'>


<surprise.trainset.Trainset at 0x280b02c9e50>

In [7]:

NA,valset = train_test_split(dsetValiFeatures, test_size=1.0)

# Basic 2:
# param_grid = {'k': [50, 60,  80, 100, 120] } 


# Basic 3:
param_grid = {'k': [130, 160, 200, 250] } 
              


# Full params - takes 4h10m
# param_grid = {'n_factors ': [200, 150, 100, 60, 30], 'n_epochs': [5, 10, 20, 30], 
#               'biased': [True, False], 'lr_all': [0.2, 0.1, 0.01, 0.002, 0.005, 0.001],
#               'reg_all': [0.4, 0.1, 0.05, 0.02, 0.01, 0.005]}

# Middle? 
# param_grid = {'k': [10, 20, 40, 60], 'min_k': [1, 2],
#               'sim_options': {'name': ['msd', 'cosine'],
#                 'min_support': [1, 5, 10],
#                 'user_based': [False]}
#               }

# grid_search = GridSearchCV(SVD, param_grid, measures=['MAE', 'RMSE'], cv=5)
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['MAE'], cv=5)

grid_search.fit(dsetTrainFeatures)



# model = algorithm.fit(trainsetTrainFeatures)
# predictions = algorithm.test(valset)


# # Score our predictions with MAE
# # It is around 0.77, which means the a random guess based on the distribution of the data
# # is on average within 0.77 (plus or minus) the true label.
# # Not bad! You can beat it though, I'm sure :).
# # Smaller MAE is the better. Good luck!
# mae = accuracy.mae(predictions,verbose=True)

# print("Average MAE: " + str(mae))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [8]:

print("Best MAE: " + str(grid_search.best_score['mae']))
print("Best MAE Params: " + str(grid_search.best_params['mae']))

Best MAE: 0.44410480365155214
Best MAE Params: {'k': 160}


## Results 

### On Full Data:
* Basic 2: between 40 and 120
* Best MAE: 0.44415773228812105
* Best MAE Params: {'k': 120}

* Basic 3: between 80 and 160
* Best MAE: 0.44393454776524555
* Best MAE Params: {'k': 160}

* Basic 3: between 130 and 250
* Best MAE: 0.44410480365155214
* Best MAE Params: {'k': 160}

### Medium
* Ran once, took 30 min
* Best MAE: 0.46647853560442815
* Best MAE Params: {'k': 60, 'min_k': 2, 'sim_options': {'name': 'msd', 'min_support': 1, 'user_based': False}}

### Simple, just K
* Ran once, took 4 min
* Best MAE: 0.4475621419083778
* Best MAE Params: {'k': 80}

In [9]:

# print("Best RMSE: " + str(grid_search.best_score['rmse']))
# print("Best RMSE Params: " + str(grid_search.best_params['rmse']))