# Model Pipeline 
This notebook combines code from the cross_val, sample_train_test, and evaluate pipelines.

In [22]:
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

# Baseline regressor handed to cross_val further down. The fixed random_state
# keeps the forest reproducible; n_jobs=-1 lets scikit-learn use every core.
rs_model1 = RandomForestRegressor(random_state=202109, n_jobs=-1)

# df = pd.read_parquet('augmented_transaction_table.parquet').dropna()

# item_df = pd.read_parquet('item_feature.parquet')
# item_ids = item_df['movieId'].unique()
# item_df = item_df.drop(columns=['movieId'])
# user_df = pd.read_parquet('user_feature.parquet').drop(columns=['userId'])
# user_ids = df['userId'].unique()


def load_data(aug_tt, item_tt,user_tt):
    """
    Read the three parquet tables used by the pipeline and split the id
    columns away from the feature tables.

    Parameters
    ----------
    aug_tt       : str
                   Path of the parquet file holding the augmented transaction
                   table: one row per (user, item) pair with the user's
                   features, the item's features and the rating.

    item_tt      : str
                   Path of the parquet file with one row of features per
                   item. Must contain a 'movieId' column.

    user_tt      : str
                   Path of the parquet file with one row of features per
                   user. Must contain a 'userId' column.

    Returns
    -------
    df            : pandas DataFrame
                    The augmented transaction table with every row that
                    contains a missing value removed.

    item_df       : pandas DataFrame
                    The item features without the 'movieId' column.

    user_df       : pandas DataFrame
                    The user features without the 'userId' column.

    item_ids      : array
                    Unique 'movieId' values of the item table, in order of
                    first appearance.

    user_ids      : array
                    Unique 'userId' values found in the (NaN-free)
                    augmented transaction table.
    """
    transactions = pd.read_parquet(aug_tt).dropna()

    # Keep the ids before the id column is removed from the feature table.
    items_with_ids = pd.read_parquet(item_tt)
    item_ids = items_with_ids['movieId'].unique()
    item_features = items_with_ids.drop(columns=['movieId'])

    users_with_ids = pd.read_parquet(user_tt)
    user_features = users_with_ids.drop(columns=['userId'])

    # User ids come from the transactions, not from the user feature table.
    user_ids = transactions['userId'].unique()

    return transactions, item_features, user_features, item_ids, user_ids

# Module-level tables shared by the functions below.
(df, item_df, user_df, item_ids, user_ids) = load_data(
    aug_tt='augmented_transaction_table.parquet',
    item_tt='item_feature.parquet',
    user_tt='user_feature.parquet',
)


def fit_ml_cb(train_df, model, target_col='rating', drop_cols=('userId', 'movieId', 'timestamp')):
    """
    Fit a content-based rating regressor on an augmented transaction table.

    A fresh clone of `model` is fitted and returned, so the estimator passed
    in is never mutated and can be reused for other folds.

    Parameters
    ----------
    train_df     : pandas DataFrame
                   The training set as a transaction table. Each row
                   corresponds to a user's features and an item's features
                   along with the user's rating for that item.

    model        : an sklearn regressor object
                   An object with a fit and predict method that outputs a
                   float. Only used as a template; it is cloned before
                   fitting.

    target_col   : str
                   The column corresponding to the rating.

    drop_cols    : list or tuple of str
                   Non-feature columns to be dropped from train_df before
                   fitting (identifiers, timestamps, ...).

    Returns
    -------
    rs_model      : an sklearn model object
                    The fitted clone of the model input, used to predict the
                    rating of a user for an item given the user's features
                    and the item's features.
    """
    rs_model = clone(model)

    # Remove unlabeled rows from the whole frame first so that the feature
    # matrix and the target vector keep the same length and row order.
    labeled = train_df.dropna(subset=[target_col])
    target = labeled[target_col].values.ravel()
    features = labeled.drop(columns=[target_col] + list(drop_cols))

    # Fit the clone, not the template that was passed in.
    rs_model.fit(features, target)
    return rs_model


def reco_ml_cb(user_df, item_df, model_fitted):
    """
    Completes the entire utility matrix based on the model passed

    For every user, the user's feature row is repeated once per item, joined
    side by side with the item features, and the fitted model predicts one
    rating per item.

    Parameters
    ----------
    user_df      : pandas DataFrame
                   One row of features per user. The row labels of this
                   frame become the row labels of the returned matrix.

    item_df      : pandas DataFrame
                   One row of features per item. It is joined to the
                   repeated user features on a 0..n-1 index, so it is
                   expected to carry a default positional index whose order
                   matches the module-level `item_ids`.

    model_fitted : a fitted sklearn regressor object
                   An object with a predict method that accepts the joined
                   user/item feature frame.

    Returns
    -------
    full_matrix  : a pandas DataFrame
                   The completed utility matrix, users as rows and the
                   module-level `item_ids` as columns.

    Notes
    -----
    Relies on the module-level `item_ids` for both the number of items and
    the column labels of the result.
    """
    recos = {}
    n_users = len(user_df)
    n_items = len(item_ids)
    for c, (u, u_feats) in enumerate(user_df.iterrows(), start=1):
        # Progress indicator, overwritten in place.
        print(c, 'out of', n_users, end='\r')
        # One copy of the user's feature row per item.
        u_feats = pd.concat([pd.DataFrame(u_feats).T] *
                            n_items).reset_index(drop=True)
        a_feats = u_feats.join(item_df)
        recos[u] = pd.Series(model_fitted.predict(a_feats), index=item_ids)
    full_matrix = pd.DataFrame.from_dict(recos, orient='index')
    return full_matrix



# def fit_ml_cb_all(c_transactions, model, target_col='rating'):
#     # Unused Function
#     c_models = {}
#     for cluster, table in c_transactions.items():
#         c_models[cluster] = fit_ml_cb(table, model, target_col)
#     return c_models


In [17]:
# idx = np.cumsum(np.in1d(np.arange(len(df.index)), user_ids))
# u_i_history = {}
# for i in user_ids:
#     for x in df.groupby('userId')['movieId'].get_group(i).index:
#         u_i_history[x] = i

In [18]:
import pandas as pd

def split_train_test(data, train_ratio=0.7, random_state=None):
    """
    Splits the transaction data into train and test sets, user by user.

    Each user's rows are sampled independently so that roughly `train_ratio`
    of them go to the training set. Users with fewer than two rows cannot be
    split and are placed entirely in the test set.

    Parameters
    ----------
    data         : pandas DataFrame for the transaction table. Column 0 is
                   treated as the user id, column 1 as the item id and
                   column 2 as the rating. The row labels are assumed to be
                   unique, since test rows are found by dropping the sampled
                   labels.

    train_ratio  : the desired ratio of training set, while 1-train ratio is
                   automatically set for the test set

    random_state : None, int, or numpy random state. None (default) keeps
                   the original behaviour of drawing from numpy's global
                   random state. An int seeds a single generator that is
                   shared across all users, making the split reproducible.


    Returns
    ---------
    df_train_fin : dataframe for the training set (empty when no user has at
                   least two rows)

    df_test_fin  : dataframe for the test set

    df_test_um   : df_test_fin pivoted into a utility matrix with
                   index=column[0], columns=column[1], values=column[2]

    indx_train   : row labels of df_train_fin

    indx_test    : row labels of df_test_fin
    """
    # One shared generator so different users do not get identical draws.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    train_parts = []
    test_parts = []

    # 1. split randomly per user (assuming column[0] is the userId)
    for _, user_rows in data.groupby(data.columns[0]):
        if len(user_rows) < 2:
            test_parts.append(user_rows)
            continue
        user_train = user_rows.sample(frac=train_ratio, random_state=rng)
        train_parts.append(user_train)
        test_parts.append(user_rows.drop(user_train.index))

    # 2. merge the per-user pieces; pd.concat refuses an empty list, so fall
    #    back to an empty frame with the same columns.
    empty = data.iloc[0:0]
    df_train_fin = pd.concat(train_parts) if train_parts else empty
    df_test_fin = pd.concat(test_parts) if test_parts else empty

    # 3. pivot the test set into a utility matrix ready as input for recsys
    df_test_um = df_test_fin.pivot(index=df_test_fin.columns[0],
                                   columns=df_test_fin.columns[1],
                                   values=df_test_fin.columns[2])

    # 4. get indices of train and test sets
    indx_train = df_train_fin.index
    indx_test = df_test_fin.index

    return df_train_fin, df_test_fin, df_test_um, indx_train, indx_test

In [19]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def evaluate(df_test_result, df_test_data):
    """
    Calculates the mse and mae per user of the results of the recommender system for a given test set.

    Only the items a user actually rated in the test set are scored. Unrated
    cells of the test matrix are NaN and are left out, rather than being
    treated as a rating of 0.

    Parameters
    ----------

    df_test_result   : utility matrix containing the result of the recommender systems.
                       Its row labels are shifted by +1 to obtain the user ids of
                       df_test_data (assumes 0-based result rows against 1-based
                       user ids - confirm against the caller).

    df_test_data     : pivoted test data generated from splitting the transaction table and tested on the recommender systems

    Returns
    ---------

    mse_list         : list of mean squared error for each row of df_test_result;
                       NaN for a user with no rated test items

    mae_list         : list of mean absolute error for each row of df_test_result;
                       NaN for a user with no rated test items

    Both lists are empty (and 'error' is printed) when some user of
    df_test_data has no row in df_test_result.
    """
    mse_list = []
    mae_list = []

    # test indices first, all user ids should be represented in the result matrix
    idx_orig_data = df_test_data.index
    idx_result = df_test_result.index + 1
    missing_users = idx_orig_data.difference(idx_result)

    if len(missing_users) > 0:
        print('error')
        return mse_list, mae_list

    print('proceed')
    test_users = set(idx_orig_data)

    for i in df_test_result.index:
        user_id = i + 1

        # Ground truth restricted to the items this user rated in the test set.
        if user_id in test_users:
            y = df_test_data.loc[user_id].dropna()
        else:
            y = None

        if y is None or y.empty:
            # Nothing to score; keep the lists aligned with the result rows.
            mse_list.append(float('nan'))
            mae_list.append(float('nan'))
            continue

        y_pred = df_test_result.loc[i, y.index].fillna(0)
        mse_list.append(mean_squared_error(y, y_pred))
        mae_list.append(mean_absolute_error(y, y_pred))

    return mse_list, mae_list

In [20]:
def _fit_and_score(df_train, df_test_um, model):
    """Fit a fresh clone of `model` on one training split and score it against the pivoted test matrix."""
    # 2. train with model
    model_clone = clone(model)
    print('Starting training')
    model_clone_fit = fit_ml_cb(df_train, model_clone)
    print('Finished training')
    print('Starting completing matrix')
    # reco_ml_cb takes (user_df, item_df, model_fitted); user_df and item_df
    # are the module-level feature tables.
    result = reco_ml_cb(user_df, item_df, model_clone_fit)
    print('Finished completing matrix')
    print('Starting computing MAE and MSE')
    # 3. evaluate results (result is in the form of utility matrix)
    mse_i, mae_i = evaluate(result, df_test_um)
    print('Finished computing MAE and MSE')
    return mse_i, mae_i


def cross_val(df, k, model, split_method='random'):
    """
    Performs cross-validation for different train and test sets.

    Parameters
    -----------
    df                    : the data to be split in the form of vanilla/transaction++ table (uid, iid, rating, timestamp)

    k                     : the number of times splitting and learning with the model is desired.
                            Only used when split_method is 'random'; the chronological
                            split is run once.

    model                 : an unfitted sklearn model; it is cloned for every round

    split_method          : 'random' splitting or 'chronological' splitting of the data


    Returns
    --------
    mse and mae           : two lists with one entry per round, each entry being the
                            per-user list of errors returned by evaluate

    Raises
    ------
    ValueError            : if split_method is neither 'random' nor 'chronological'
    """
    if split_method not in ('random', 'chronological'):
        raise ValueError(
            "split_method must be 'random' or 'chronological', got %r"
            % (split_method,))

    mse = []
    mae = []

    if split_method == 'random':
        for i in range(k):
            print(i)
            # 1. split
            print('Starting splitting')
            df_train, _, df_test_um, _, _ = split_train_test(df, 0.7)
            print('Finished splitting')

            mse_i, mae_i = _fit_and_score(df_train, df_test_um, model)
            mse.append(mse_i)
            mae.append(mae_i)
    else:
        # 1. split
        # NOTE: split_train_test_chronological is not defined in this part of
        # the notebook; it must be available before this branch is used.
        print('Starting splitting')
        df_train, _, df_test_um, _, _ = split_train_test_chronological(df, 0.7)
        print('Finished splitting')

        mse_i, mae_i = _fit_and_score(df_train, df_test_um, model)
        mse.append(mse_i)
        mae.append(mae_i)

    return mse, mae

In [21]:
# Five rounds with the baseline forest; split_method is left at its default
# of 'random'.
mse, mae = cross_val(df,5,rs_model1)

0
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
ID: 0
ID: 1
ID: 2
ID: 3
ID: 4
ID: 5
ID: 6
ID: 7
ID: 8
ID: 9
ID: 10
ID: 11
ID: 12
ID: 13
ID: 14
ID: 15
ID: 16
ID: 17
ID: 18
ID: 19
ID: 20
ID: 21
ID: 22
ID: 23
ID: 24
ID: 25
ID: 26
ID: 27
ID: 28
ID: 29
ID: 30
ID: 31
ID: 32
ID: 33
ID: 34
ID: 35
ID: 36
ID: 37
ID: 38
ID: 39
ID: 40
ID: 41
ID: 42
ID: 43
ID: 44
ID: 45
ID: 46
ID: 47
ID: 48
ID: 49
ID: 50
ID: 51
ID: 52
ID: 53
ID: 54
ID: 55
ID: 56
ID: 57
ID: 58
ID: 59
ID: 60
ID: 61
ID: 62
ID: 63
ID: 64
ID: 65
ID: 66
ID: 67
ID: 68
ID: 69
ID: 70
ID: 71
ID: 72
ID: 73
ID: 74
ID: 75
ID: 76
ID: 77
ID: 78
ID: 79
ID: 80
ID: 81
ID: 82
ID: 83
ID: 84
ID: 85
ID: 86
ID: 87
ID: 88
ID: 89
ID: 90
ID: 91
ID: 92
ID: 93
ID: 94
ID: 95
ID: 96
ID: 97
ID: 98
ID: 99
ID: 100
ID: 101
ID: 102
ID: 103
ID: 104
ID: 105
ID: 106
ID: 107
ID: 108
ID: 109
ID: 110
ID: 111
ID: 112
ID: 113
ID: 114
ID: 115
ID: 116
ID: 1

KeyboardInterrupt: 

In [None]:
# Single per-user split with train_ratio=0.7, kept at module level.
df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7)

In [None]:
len(df['movieId'].unique())

In [269]:
df_test_um

movieId,1,2,3,4,5,6,7,8,9,10,...,173619,173873,173941,174053,174055,174681,174727,175197,175569,175707
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,4.0,...,,,,,,,,,,
