# Model Pipeline 
This notebook combines code from the cross_val, sample_train_test, and evaluate pipelines.

In [33]:
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

def load_data(aug_tt, item_tt, user_tt):
    """
    Read the three parquet tables used by the content-based pipeline.

    Parameters
    ----------
    aug_tt       : str
                   File name of the augmented transaction table: each row
                   holds a user's features, an item's features, and the
                   user's rating for that item.

    item_tt      : str
                   File name of the item feature table. Must contain a
                   'movieId' column.

    user_tt      : str
                   File name of the user feature table. Must contain a
                   'userId' column.

    Returns
    -------
    df            : pandas DataFrame
                    The augmented transaction table with every row that
                    contains a missing value removed.

    item_df       : pandas DataFrame
                    The item features without the 'movieId' column.

    user_df       : pandas DataFrame
                    The user features without the 'userId' column.

    item_ids      : array
                    Unique 'movieId' values of the item feature table, in
                    order of first appearance.

    user_ids      : array
                    Unique 'userId' values found in the cleaned augmented
                    transaction table (not in the user feature table).
    """
    # Augmented transactions: keep complete rows only.
    transactions = pd.read_parquet(aug_tt)
    transactions = transactions.dropna()

    # Item table: remember the ids, then keep the pure feature columns.
    raw_items = pd.read_parquet(item_tt)
    item_ids = raw_items['movieId'].unique()
    item_features = raw_items.drop(columns=['movieId'])

    # User table: only the feature columns are kept.
    raw_users = pd.read_parquet(user_tt)
    user_features = raw_users.drop(columns=['userId'])

    user_ids = transactions['userId'].unique()
    return transactions, item_features, user_features, item_ids, user_ids


def fit_ml_cb(train_df, model, target_col='rating', drop_cols=['userId', 'movieId', 'timestamp']):
    """
    Fit a copy of a regressor on the training transaction table.

    The rating column is used as the target and every remaining column,
    except those listed in drop_cols, is used as a feature.

    Parameters
    ----------
    train_df     : pandas DataFrame
                   The training set as a transaction table. Each row
                   corresponds to a user's features and an item's features
                   along with the user's rating for that item.

    model        : an sklearn regressor object
                   An object with a fit and predict method that outputs a
                   float. It is cloned before fitting, so the object passed
                   in is left unfitted.

    target_col   : str
                   The column corresponding to the rating.

    drop_cols    : list
                   Non-feature columns to be dropped from train_df. The
                   default list is only read, never modified.

    Returns
    -------
    rs_model      : an sklearn model object
                    A fitted clone of the model input, used to predict the
                    rating of a user for an item given the user's features
                    and the item's features.
    """
    # Rows without a rating cannot supervise the fit. Drop them from the
    # features and the target together so both keep the same length.
    labelled = train_df.dropna(subset=[target_col])
    target = labelled[target_col].values.ravel()
    features = labelled.drop(columns=[target_col] + list(drop_cols))

    # Fit the clone (not `model`) so repeated calls never share fitted state.
    rs_model = clone(model)
    rs_model.fit(features, target)
    return rs_model


def reco_ml_cb(user_df, item_df, item_ids, model_fitted):
    """
    Completes the entire utility matrix based on the model passed

    For every user, the user's feature row is paired with every item's
    feature row and the fitted model predicts one rating per pair.

    Parameters
    ----------
    user_df      : pandas DataFrame
                   One row of features per user. The row labels become the
                   row labels of the returned matrix.

    item_df      : pandas DataFrame
                   One row of features per item, in the same order as
                   item_ids. Rows are matched to item_ids by position; the
                   index labels of item_df are ignored.

    item_ids     : list or array
                   Item identifiers; they become the column labels of the
                   returned matrix.

    model_fitted : an sklearn regressor object
                   A fitted object whose predict method accepts a DataFrame
                   with the user feature columns followed by the item
                   feature columns.

    Returns
    -------
    full_matrix  : a pandas DataFrame
                   The completed utility matrix (users x items).
    """
    n_users = len(user_df)
    n_items = len(item_ids)
    # Loop-invariant: give the item features a 0..n-1 index once so the join
    # below pairs them with the repeated user rows by position.
    item_feats = item_df.reset_index(drop=True)

    recos = {}
    for count, (u, u_feats) in enumerate(user_df.iterrows(), start=1):
        print(count, 'out of', n_users, end='\r')
        # Repeat the user's feature vector once per item in a single array
        # operation instead of concatenating n_items one-row DataFrames.
        repeated = pd.DataFrame(np.tile(u_feats.to_numpy(), (n_items, 1)),
                                columns=user_df.columns)
        a_feats = repeated.join(item_feats)
        recos[u] = pd.Series(model_fitted.predict(a_feats), index=item_ids)

    full_matrix = pd.DataFrame.from_dict(recos, orient='index')
    return full_matrix


def reco_ml_cb_tt(df_test, model_fitted, target='rating', drop_cols=['userId', 'movieId', 'timestamp']):
    """
    Predict a rating for every row of the test transaction table.

    Parameters
    ----------
    df_test      : pandas DataFrame
                   The test set as a transaction table. Each row
                   corresponds to a user's features and an item's features
                   along with the user's rating for that item.

    model_fitted : an sklearn regressor object
                   An already fitted object with a predict method.

    target       : str
                   The column corresponding to the rating. It is removed
                   before predicting.

    drop_cols    : list
                   Additional non-feature columns to remove from df_test.

    Returns
    -------
    result        : numpy array
                    The model's predictions, one per row of df_test.
    """
    # Everything that is neither the rating nor an id/timestamp is a feature.
    non_features = [target] + drop_cols
    features = df_test.drop(columns=non_features)
    return model_fitted.predict(features)

In [34]:
def split_train_test(data, train_ratio=0.7, uid='userId', iid='movieId', rid='rating', random_state=None):
    """
    Splits the transaction data into train and test sets, user by user.

    Each user's rows are sampled independently so that roughly train_ratio
    of them go to the train set and the rest to the test set. A user with
    fewer than two rows is placed entirely in the test set.

    Parameters
    ----------
    data         : pandas DataFrame for transaction table containing user,
                   item, and ratings. Row labels must be unique, because
                   they are used to separate train rows from test rows.

    train_ratio  : the desired ratio of training set, while 1-train ratio
                   is automatically set for the test set

    uid          : name of the user id column; rows are grouped by it and
                   it becomes the index of the pivoted test matrix

    iid          : name of the item id column (columns of the pivot)

    rid          : name of the rating column (values of the pivot)

    random_state : None, int, or numpy random state
                   Seed for a reproducible split. With None (default) the
                   sampling is not seeded by this function.

    Returns
    ---------
    df_train_fin : dataframe for the training set

    df_test_fin  : dataframe for the test set

    df_test_um   : df_test_fin pivoted into a utility matrix with
                   index=uid, columns=iid, values=rid

    indx_train   : row labels of df_train_fin

    indx_test    : row labels of df_test_fin
    """
    # One shared generator, so each user gets a different but reproducible
    # draw instead of every group being sampled with the same seed.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    train_parts = []
    test_parts = []

    # Split randomly per user, grouping by the same column the pivot uses.
    for _, user_rows in data.groupby(uid):
        if len(user_rows) < 2:
            # Nothing to split: a single rating is kept for testing only.
            test_parts.append(user_rows)
            continue
        sampled = user_rows.sample(frac=train_ratio, random_state=random_state)
        train_parts.append(sampled)
        test_parts.append(user_rows.drop(sampled.index))

    # pd.concat raises on an empty list; fall back to an empty frame that
    # keeps the input's columns.
    empty = data.iloc[0:0]
    df_train_fin = pd.concat(train_parts) if train_parts else empty
    df_test_fin = pd.concat(test_parts) if test_parts else empty

    # Utility-matrix view of the test set, ready as input for the recsys.
    df_test_um = df_test_fin.pivot(index=uid, columns=iid, values=rid)

    indx_train = df_train_fin.index
    indx_test = df_test_fin.index

    return df_train_fin, df_test_fin, df_test_um, indx_train, indx_test

In [64]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def evaluate(df_test_result, df_test_data):
    """
    Calculates the mse and mae per user of the results of the recommender
    system for a given test set.

    Only the items a user actually rated in the test set contribute to that
    user's errors; items without a test rating are ignored rather than being
    scored against 0.

    Parameters
    ----------

    df_test_result   : utility matrix containing the result of the
                       recommender system (users x items). It must contain
                       every user and every item column of df_test_data.
                       Missing predictions are treated as 0.

    df_test_data     : pivoted test data generated from splitting the
                       transaction table (users x items, NaN = not rated)

    Returns
    ---------

    mse_list         : list of mean squared error for each test user, in
                       the order the users appear in df_test_result

    mae_list         : list of mean absolute error for each test user, in
                       the same order

    Both lists are empty (and 'error' is printed) when some test user is
    missing from df_test_result. A user without any test rating gets NaN.
    """
    mse_list = []
    mae_list = []

    # All test users must be represented in the result matrix.
    test_users = df_test_data.index
    missing_users = test_users.difference(df_test_result.index)
    if len(missing_users) != 0:
        print('error')
        return mse_list, mae_list

    print('proceed')
    for user in df_test_result.index:
        if user not in test_users:
            # Predicted for, but no ground truth available: nothing to score.
            continue

        truth_row = df_test_data.loc[[user]]
        pred_row = df_test_result.loc[[user], truth_row.columns].fillna(0)

        # Score only the items that carry a real rating in the test set.
        rated = truth_row.notna().to_numpy()
        if not rated.any():
            mse_list.append(np.nan)
            mae_list.append(np.nan)
            continue

        y = truth_row.to_numpy()[rated]
        y_pred = pred_row.to_numpy()[rated]
        mse_list.append(mean_squared_error(y, y_pred))
        mae_list.append(mean_absolute_error(y, y_pred))

    return mse_list, mae_list

import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def evaluate_arrays(model_result_arr, df_data, indx_test):
    """
    Compute the overall mse and mae of predicted ratings against the test
    set's true ratings.

    Parameters
    ----------

    model_result_arr   : predicted ratings produced by the recommender
                         system for the test set

    df_data            : the original dataframe before splitting. The
                         ground truth is read from its third column
                         (position 2) at the rows given by indx_test.

    indx_test          : row labels of the test set, as returned by the
                         splitting step

    Returns
    ---------

    mse                : mean squared error using sklearn

    mae                : mean absolute error using sklearn

    Raises
    ---------

    ValueError         : when the number of predictions differs from the
                         number of test rows
    """
    # Pull the true ratings for exactly the test rows, in test order.
    rating_column = df_data.columns[2]
    ground_truth = df_data.loc[pd.Index(indx_test), rating_column].values

    n_predicted = len(model_result_arr)
    n_truth = len(ground_truth)
    if n_predicted != n_truth:
        raise ValueError('the arrays are of different lengths %s in %s' % (n_predicted, n_truth))

    print('proceed')
    squared_error = mean_squared_error(ground_truth, model_result_arr)
    absolute_error = mean_absolute_error(ground_truth, model_result_arr)
    return squared_error, absolute_error

In [65]:
def _fit_and_score(df_train, df_test_um, model):
    """Fit a fresh clone of model, complete the utility matrix, and score it."""
    model_clone = clone(model)
    print('Starting training')
    model_clone_fit = fit_ml_cb(df_train, model_clone)
    print('Finished training')
    print('Starting completing matrix')
    # NOTE: user_df, item_df and item_ids are not parameters; they are read
    # from module level, so load_data must have been run beforehand.
    result = reco_ml_cb(user_df, item_df, item_ids, model_clone_fit)
    print('Finished completing matrix')
    print('Starting computing MAE and MSE')
    # The result is a utility matrix, so it is scored per user.
    mse_i, mae_i = evaluate(result, df_test_um)
    print('Finished computing MAE and MSE')
    return mse_i, mae_i


def cross_val(df, k, model, split_method='random'):
    """
    Performs cross-validation for different train and test sets.

    Parameters
    -----------
    df                    : the data to be split in the form of
                            vanilla/transaction++ table
                            (uid, iid, rating, timestamp)

    k                     : the number of times splitting and learning with
                            the model is desired. Only used by the 'random'
                            method; 'chronological' performs a single split.

    model                 : an unfitted sklearn model

    split_method          : 'random' splitting or 'chronological' splitting
                            of the data


    Returns
    --------
    mse and mae           : two lists with one entry per split; each entry
                            is the list of per-user errors returned by
                            evaluate


    Raises
    --------
    ValueError            : when split_method is neither 'random' nor
                            'chronological'
    """
    if split_method not in ('random', 'chronological'):
        raise ValueError(
            "split_method must be 'random' or 'chronological', got %r" % (split_method,))

    mse = []
    mae = []

    if split_method == 'random':
        for i in range(k):
            print(i)
            print('Starting splitting')
            df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(
                df, 0.7)
            print('Finished splitting')

            mse_i, mae_i = _fit_and_score(df_train, df_test_um, model)
            mse.append(mse_i)
            mae.append(mae_i)
    else:
        print('Starting splitting')
        # split_train_test_chronological is not defined in this notebook; it
        # has to be provided by the splitting pipeline before this is used.
        df_train, df_test, df_test_um, indx_train, indx_test = split_train_test_chronological(
            df, 0.7)
        print('Finished splitting')

        mse_i, mae_i = _fit_and_score(df_train, df_test_um, model)
        mse.append(mse_i)
        mae.append(mae_i)

    return mse, mae

# Model Pipeline

In [61]:
# End-to-end run: declare a model, load the tables, split, fit, and predict.
# Declare the regressor (fixed random_state; n_jobs=-1 is passed to the forest).
rs_model1 = RandomForestRegressor(random_state=202109, n_jobs=-1)
# Load the augmented transaction table plus the item and user feature tables.
df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
                                                     'item_feature.parquet',
                                                     'user_feature.parquet')
# Split the transactions per user with a 0.7 train ratio.
df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7) # df_test_um is the pivoted test set
# Fit the model on the train rows.
model_fit = fit_ml_cb(df_train, rs_model1) # returns the fitted model
# Predict on the test rows.
preds_array = reco_ml_cb_tt(df_test, model_fit) # one prediction per test row

Unnamed: 0,userId,movieId,rating,timestamp,u_1,u_2,u_3,u_4,u_5,u_6,...,i_291,i_292,i_293,i_294,i_295,i_296,i_297,i_298,i_299,i_300
4,1,50,5.0,964982931,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
5,1,70,3.0,964982400,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
6,1,101,5.0,964980868,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7,1,110,4.0,964982176,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9,1,157,5.0,964984100,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100823,610,160836,3.0,1493844794,267,66,56,411,151,119,...,0.0,0.0,0.0,0.0,0.0,0.287034,0.0,0.0,0.0,0.0
100824,610,161582,4.0,1493847759,267,66,56,411,151,119,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
100827,610,163937,3.5,1493848789,267,66,56,411,151,119,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
100831,610,166534,4.0,1493848402,267,66,56,411,151,119,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [68]:
# Score the test predictions against the ratings stored in df at indx_test.
evaluate_arrays(preds_array, df, indx_test) # returns (mse, mae)

proceed


(1.277636794458656, 0.881370718580778)

In [8]:
# Complete the utility matrix: one predicted rating per (user, item) pair.
# reco_ml_cb takes item_ids as its third argument; without it the call fails.
preds_matrix = reco_ml_cb(user_df, item_df, item_ids, model_fit)

254 out of 610

KeyboardInterrupt: 

In [None]:
import unittest


class TestGetRec(unittest.TestCase):
    """Smoke tests for the matrix-style and array-style prediction helpers."""

    @classmethod
    def setUpClass(cls):
        # Load, split, and fit once: both tests need the same inputs, and a
        # model fitted on 100 sampled rows is enough to check output shapes.
        (cls.df, cls.item_df, cls.user_df,
         cls.item_ids, cls.user_ids) = load_data('augmented_transaction_table.parquet',
                                                 'item_feature.parquet',
                                                 'user_feature.parquet')
        split = split_train_test(cls.df, 0.7)
        cls.df_train, cls.df_test = split[0], split[1]
        cls.model_fit = fit_ml_cb(cls.df_train.sample(100), rs_model1)

    def test_matrix_shape(self):
        # The completed utility matrix must be users x items.
        matrix_result = reco_ml_cb(self.user_df, self.item_df, self.item_ids, self.model_fit)
        self.assertEqual(matrix_result.shape[0], len(self.user_ids))
        self.assertEqual(matrix_result.shape[1], len(self.item_ids))

    def test_array_pred(self):
        # There must be exactly one prediction per test row.
        array_result = reco_ml_cb_tt(self.df_test, self.model_fit)
        self.assertEqual(len(array_result), len(self.df_test))
        
# Run the tests in-process: argv=[''] supplies a placeholder program name so
# the host process's own command-line arguments are not parsed, and
# exit=False keeps unittest from calling sys.exit() when the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)

test_array_pred (__main__.TestGetRec) ... 

Unnamed: 0,userId,movieId,rating,timestamp,u_1,u_2,u_3,u_4,u_5,u_6,...,i_291,i_292,i_293,i_294,i_295,i_296,i_297,i_298,i_299,i_300
1,1,3,4.0,964981247,85,29,42,83,47,26,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
2,1,6,4.0,964982224,85,29,42,83,47,26,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
5,1,70,3.0,964982400,85,29,42,83,47,26,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
11,1,216,5.0,964981208,85,29,42,83,47,26,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.448884,0.0,0.000000
18,1,333,5.0,964981179,85,29,42,83,47,26,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100824,610,161582,4.0,1493847759,267,66,56,411,151,119,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
100826,610,162350,3.5,1493849971,267,66,56,411,151,119,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
100829,610,164179,5.0,1493845631,267,66,56,411,151,119,...,0.0,0.0,0.000000,0.0,0.0,0.296870,0.0,0.000000,0.0,0.000000
100834,610,168252,5.0,1493846352,267,66,56,411,151,119,...,0.0,0.0,0.000000,0.0,0.0,0.291784,0.0,0.000000,0.0,0.282198


ok
test_matrix_shape (__main__.TestGetRec) ... 

Unnamed: 0,userId,movieId,rating,timestamp,u_1,u_2,u_3,u_4,u_5,u_6,...,i_291,i_292,i_293,i_294,i_295,i_296,i_297,i_298,i_299,i_300
0,1,1,4.0,964982703,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,1,50,5.0,964982931,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
5,1,70,3.0,964982400,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
6,1,101,5.0,964980868,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
8,1,151,5.0,964984041,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100817,610,158956,3.0,1493848947,267,66,56,411,151,119,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
100819,610,160080,3.0,1493848031,267,66,56,411,151,119,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
100827,610,163937,3.5,1493848789,267,66,56,411,151,119,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
100828,610,163981,3.5,1493850155,267,66,56,411,151,119,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0


42 out of 610