# Implementation of many-assessment recommenders

- Author: Marios Kokkodis 
- email: marios.kokkodis@gmail.com 

> Python note: some of the packages require Python 3.6 (PY36).

In [1]:
import sys
if '../python/' not in sys.path: sys.path.insert(1, '../python/')
from joblib import dump, load
from sklearn.metrics import roc_curve, roc_auc_score
from surprise.model_selection import PredefinedKFold
import os
from surprise import SVD
from surprise import Dataset
from surprise import Reader
import pandas as pd
import numpy as np
from spotlight.interactions import Interactions
from spotlight.sequence.implicit import ImplicitSequenceModel




## Create the necessary files for the surprise library

In [2]:
# Load the annotated interactions and derive the two rating encodings that are
# written to the surprise input files.
d = pd.read_csv( "../../data/restaurant_ncv.csv")
d['binaryOutcome'] = np.where(d['Y_it']==0,0,1)
d['trinaryOutcome'] = d['Y_it']/2 #to get auc score
reader = Reader(rating_scale=(0,1.0))

# Create the output folder up front so the cell also runs on a fresh checkout.
surprise_dir = "../../data/surprise/"
os.makedirs(surprise_dir, exist_ok=True)

for fold in range(10):
    # A row belongs to `<split>_<fold>` when that tag is followed by a comma or
    # ends the annotation string, so `train_1` does not match `train_10`.
    # `na=False` treats a missing annotation as "not in this split".
    fold_rows = {}
    for split in ('train', 'validation', 'test'):
        tag = split + '_' + str(fold)
        in_split = d['set_annotation'].str.contains(pat=tag + ',|' + tag + "$",
                                                    regex=True, na=False)
        fold_rows[split] = d[in_split].copy()
    train = fold_rows['train']
    validation = fold_rows['validation']
    test = fold_rows['test']
    trainAndValidation = pd.concat([train, validation])

    #### Recsys explicit file creation
    # The "explicit" variant trains only on rows with Y_it > 0; its test file
    # is the same unfiltered test split that the default variant uses.
    explicit_train = trainAndValidation[trainAndValidation['Y_it'] > 0]

    # `variant` rather than `type`, so the builtin `type` is not shadowed.
    for variant, train_rows in (('', trainAndValidation), ('explicit', explicit_train)):
        for encoding_name, outcome_col in (('binary', 'binaryOutcome'),
                                           ('trinary', 'trinaryOutcome')):
            # Column order follows the surprise line format: user, item, rating, timestamp.
            focalCols = ['user', 'itemNo', outcome_col, 'application']
            for split_name, rows in (('train', train_rows), ('test', test)):
                out_path = (surprise_dir + encoding_name + "_" + split_name + "_"
                            + variant + str(fold) + ".csv")
                rows[focalCols].to_csv(out_path, sep=',', encoding='utf-8',
                                       index=False, header=False)

# create sequential user
d = pd.read_csv( "../../data/restaurant_ncv.csv")

# pd.factorize numbers the distinct users 0, 1, 2, ... in order of first
# appearance, which is the same numbering a row-by-row dictionary lookup would
# produce, without the cost of a Python-level loop over every row.
# NOTE(review): factorize codes a missing `user` as -1 — confirm the column has
# no missing values.
d['userN'] = pd.factorize(d['user'])[0]

# The input file is rewritten in place with the extra `userN` column.
d.to_csv( "../../data/restaurant_ncv.csv",index=False)#needs numeric id for cnn



## Run the SVD models

In [3]:

np.random.seed(1234)

# Read the fold files from the same relative folder the file-creation cell
# writes to ("../../data/surprise/") instead of a machine-specific absolute path.
curpath = '../../data/'
# The reader does not depend on the variant or the fold, so build it once.
reader = Reader(rating_scale=(0,1.0),line_format='user item rating timestamp', sep=',')
predictions_dir = "../../data/raw_predictions/"
os.makedirs(predictions_dir, exist_ok=True)

# `variant` rather than `type`, so the builtin `type` is not shadowed.
for variant in ['','explicit']:
    print('Type:',variant)
    for fold in range(10):
        # Rows of `d` annotated with `test_<fold>`; they supply the metadata
        # columns that are attached to the predictions below.
        tag = 'test_' + str(fold)
        original_test = d[d['set_annotation'].str.contains(pat=tag + ',|' + tag + "$",
                                                           regex=True, na=False)].copy()
        for binTrin in ['binary','trinary']:
            test_file = curpath + "surprise/" + binTrin + "_test_" + variant + str(fold) + ".csv"
            train_file = curpath + "surprise/" + binTrin + "_train_" + variant + str(fold) + ".csv"
            data = Dataset.load_from_folds([(train_file, test_file)],reader=reader)
            pkf = PredefinedKFold()
            for trainset, testset in pkf.split(data):
                algo = SVD()
                algo.fit(trainset)
                pvals = [p.est for p in algo.test(testset)]

                preds = pd.DataFrame(pvals)
                preds['label_0'] = [1-x for x in pvals]
                preds['label_1'] = pvals
                if 'trinary' in binTrin:
                    preds['label_2'] = pvals
                # Assumes the predictions come back in the row order of
                # `original_test` — TODO confirm. pandas raises if the lengths differ.
                preds['application'] = list(original_test['application'])
                preds['task_id'] = list(original_test['task_id'])
                preds['employer_total_tasks_so_far'] = list(original_test['employer_total_tasks_so_far'])
                preds['truth'] = list(original_test['Y_it'])
                preds['hire_positive_truth'] = np.where(preds['truth']==2,1,0)
                preds['hire_negative_truth'] = np.where(preds['truth'] == 1, 1, 0)
                preds.to_csv(predictions_dir + "svd" + binTrin + variant + str(fold) + ".csv",index=False)





Type: 
Type: explicit


##  CNN

- We use the spotlight library: https://github.com/maciejkula/spotlight

- This implementation relies on the following article: https://towardsdatascience.com/introduction-to-recommender-system-part-2-adoption-of-neural-network-831972c4cbf7


### Step 1: CNN Hyperparameter tuning, search in validation sets


In [4]:

def getSeqRecPreds(test,model,userVar,itemVar,ratingVar):
    """
    Score every row of `test` with a sequence model, replaying each user's history.

    Rows are visited in the order they appear in `test`. For each row the model
    scores the user's sequence so far (every user starts with `[0]`) and the
    score at position `item - 1` is recorded. When the row's rating is positive
    and that position exists in the score vector, the position is appended to
    the user's sequence before the next row of that user is scored.

    Parameters
    ----------
    test : pandas.DataFrame
        Rows to score; must contain the columns named by `userVar`, `itemVar`
        and `ratingVar`.
    model : object
        Anything with a `predict(sequence)` method returning an indexable
        score vector.
    userVar, itemVar, ratingVar : str
        Column names of the user id, the item id and the rating.

    Returns
    -------
    preds : list
        One score per row, min-max scaled to [0, 1]. Rows whose position falls
        beyond the score vector receive the median of the valid scores before
        scaling. If all scores are equal, every entry is 0.0.
    actuals : list
        The `ratingVar` value of every row, in the same order.
    """
    contrsSeq = {}
    preds = []
    actuals = []
    # Iterating the three columns directly avoids building a Series per row.
    for user, item, rating in zip(test[userVar], test[itemVar], test[ratingVar]):
        seq = contrsSeq.setdefault(user, [0])
        scores = model.predict(seq)
        cur_item = int(item - 1)
        # NOTE(review): an item id of 0 gives cur_item == -1, which indexes the
        # last score — confirm item ids start at 1.
        in_range = cur_item < len(scores)
        preds.append(scores[cur_item] if in_range else np.nan)
        actuals.append(rating)
        if rating > 0 and in_range:
            seq.append(cur_item)

    # Nothing to scale (and min()/max() would fail on an empty list).
    if not preds:
        return preds, actuals

    # Compute the median once instead of once per missing entry.
    if any(np.isnan(p) for p in preds):
        fill = np.nanmedian(preds)
        preds = [fill if np.isnan(p) else p for p in preds]

    # Compute the bounds once instead of once per element.
    lo = min(preds)
    hi = max(preds)
    span = hi - lo
    if span == 0:
        # Constant scores: avoid 0/0 and map everything to the lower bound.
        return [0.0] * len(preds), actuals
    return [(p - lo) / span for p in preds], actuals

In [5]:
d = pd.read_csv( "../../data/restaurant_ncv.csv")
d['binaryOutcome'] = np.where(d['Y_it']==0,0,1)
d = d.sort_values(['user',  'choicesetId', 'daysTrend', 'application'])

# The train/validation rows of a fold do not depend on the hyper-parameters,
# so select them once per fold instead of once per grid point.
fold_frames = {}
for fold in range(10):
    selected = {}
    for split in ('train', 'validation'):
        tag = split + '_' + str(fold)
        in_split = d['set_annotation'].str.contains(pat=tag + ',|' + tag + "$",
                                                    regex=True, na=False)
        selected[split] = d[in_split].copy()
    fold_frames[fold] = (selected['train'], selected['validation'])

# fold -> [best setting key, best validation AUC]
fold_settings = {}
for n_iter in [5,10,50]:
    for cur_loss in ['pointwise','bpr','hinge','adaptive_hinge']:
        for batch_size in [32,64,128]:
            # Key format "n_iter>>loss>>batch_size"; it is split on ">>" when
            # the stored settings are read back.
            k = ">>".join([str(i) for i in [n_iter,cur_loss,batch_size]])
            for fold in range(10):
                np.random.seed(1234)
                train, validation = fold_frames[fold]
                implicit_interactions_train = Interactions(train['userN'].values, train['item_sequential'].values,
                                                           timestamps=train['application'].values)
                implicit_interactions_train = implicit_interactions_train.to_sequence()
                # spotlight expects a numpy RandomState here. An int such as 1234
                # crashes, and 0 is falsy so the model silently falls back to an
                # unseeded RandomState; pass a seeded instance instead.
                # NOTE(review): torch-level randomness (e.g. weight
                # initialisation) is not seeded here — confirm whether it
                # should be.
                model = ImplicitSequenceModel(n_iter=n_iter,
                                              representation='cnn',
                                              loss=cur_loss, batch_size=batch_size,
                                              random_state=np.random.RandomState(1234))
                model.fit(implicit_interactions_train)
                pvals,actuals = getSeqRecPreds(validation,model,'userN','item_sequential','binaryOutcome')
                curAuc = roc_auc_score(np.where(validation['Y_it'] == 2, 1, 0),pvals)
                # Strict "<" keeps the first setting encountered on ties.
                if fold not in fold_settings or  fold_settings[fold][-1] < curAuc:
                    fold_settings[fold] = [k,curAuc]



print("Dumping grid search results in: ../../data/surprise/cnn_hyps.dict")
dump(fold_settings,"../../data/surprise/cnn_hyps.dict")



Dumping grid search results in: ../../data/surprise/cnn_hyps.dict


['../../data/surprise/cnn_hyps.dict']

In [6]:
#### Step 2: Run CNN

# Per-fold settings chosen by the grid search: fold -> ["n_iter>>loss>>batch_size", AUC].
# joblib.load unpickles the file, so only load a dict this notebook produced.
params = load("../../data/surprise/cnn_hyps.dict")
d = pd.read_csv( "../../data/restaurant_ncv.csv")
d['binaryOutcome'] = np.where(d['Y_it']==0,0,1)
d = d.sort_values(['user',  'choicesetId', 'daysTrend', 'application'])
predictions_dir = "../../data/raw_predictions/"
os.makedirs(predictions_dir, exist_ok=True)
l=[]  # test AUC of every fold
for fold in range(10):
    np.random.seed(1234)
    # `na=False` treats a missing annotation as "not in this split".
    fold_rows = {}
    for split in ('train', 'validation', 'test'):
        tag = split + '_' + str(fold)
        in_split = d['set_annotation'].str.contains(pat=tag + ',|' + tag + "$",
                                                    regex=True, na=False)
        fold_rows[split] = d[in_split].copy()
    validation = fold_rows['validation']
    test = fold_rows['test']
    # The final model is fitted on train and validation rows together.
    train = pd.concat([fold_rows['train'], validation])

    implicit_interactions_train = Interactions(train['userN'].values, train['item_sequential'].values,
                                               timestamps=train['application'].values)
    implicit_interactions_train = implicit_interactions_train.to_sequence()
    n_iter, cur_loss, batch_size =  params[fold][0].split(">>")
    n_iter = int(n_iter)
    batch_size = int(batch_size)
    # spotlight expects a numpy RandomState here; 0 is falsy and would make the
    # model silently fall back to an unseeded RandomState.
    # NOTE(review): torch-level randomness is not seeded here — confirm whether
    # it should be.
    model = ImplicitSequenceModel(n_iter=n_iter,
                                  representation='cnn',
                                  loss=cur_loss, batch_size=batch_size,
                                  random_state=np.random.RandomState(1234))
    model.fit(implicit_interactions_train)
    pvals,actuals = getSeqRecPreds(test,model,'userN','item_sequential','binaryOutcome')
    preds = pd.DataFrame(pvals)
    preds['label_0'] = [1-x for x in pvals]
    preds['label_1'] = pvals
    preds['application'] = list(test['application'])
    preds['task_id'] = list(test['task_id'])
    preds['employer_total_tasks_so_far'] = list(test['employer_total_tasks_so_far'])
    preds['truth'] = list(test['Y_it'])
    preds['hire_positive_truth'] = np.where(preds['truth']==2,1,0)
    preds['hire_negative_truth'] = np.where(preds['truth'] == 1, 1, 0)
    preds.to_csv(predictions_dir + "cnn" + str(fold) + ".csv",index=False)
    # One AUC per fold: scaled scores against the "Y_it == 2" indicator.
    curAuc = roc_auc_score(preds['hire_positive_truth'],preds['label_1'])
    l.append(curAuc)
print("Results stored in ../../data/raw_predictions/")

Results stored in ../../data/raw_predictions/
