# Combined Pipeline

In [1]:
import copy
import pickle
import sys
from collections import Counter

import jdc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import (KMeans, SpectralClustering,
                             AgglomerativeClustering, DBSCAN, OPTICS,
                             cluster_optics_dbscan, Birch)
from sklearn.metrics import mean_squared_error

## Preprocessing

### Item-Feature Table

In [2]:
def create_item_feature_table(item_list, encode_genre=False):
    """
    Creates the item feature table from the movies.csv file.
    The input is not modified; its row index is reused as the item id.
    Input:

    item_list (pd.DataFrame)      : MovieLens movies.csv DataFrame containing movieId, title, and genres
                                    (genres is a '|'-separated string)
    encode_genre (bool)           : Include 1-hot encoding of movie into its indicated genres (default: False)


    Output
    item_feature_table : DataFrame indexed by item_id containing movieId, title, genres (list of str),
                         year (4-digit string, NaN when the title does not end with a year) and,
                         optionally, one 0/1 column per genre in alphabetical order
    """
    item_feature_table = item_list.copy()

    # Anchor on the closing parenthesis at the end of the title instead of
    # slicing fixed positions, so trailing whitespace or a missing year does
    # not yield a garbage value. "(2006-2007)" still resolves to "2007".
    item_feature_table['year'] = item_feature_table['title'].str.extract(
        r'(\d{4})\)\s*$', expand=False)
    item_feature_table['genres'] = item_list['genres'].str.split('|')

    if encode_genre:
        # Vectorised 1-hot encoding; columns come back sorted alphabetically.
        genre_flags = item_list['genres'].str.get_dummies(sep='|')
        item_feature_table = pd.concat([item_feature_table, genre_flags],
                                       axis=1)

    # Name the index last so the concat above cannot drop the name.
    item_feature_table.index = item_feature_table.index.rename('item_id')
    return item_feature_table
    
    

### Transaction Table

In [3]:
def create_transaction_table(transaction_list, item_list, include_timestamp=False):
    """
    Creates the transaction (user-item-rating) table from the ratings.csv file
    Ratings are normalized for each user (min-max scaling) and rescaled to 0-5.
    A user whose ratings are all identical has a zero range, so their
    normalized ratings are NaN.
    Input:

    transaction_list (pd.DataFrame)     : MovieLens ratings.csv DataFrame containing userId, movieId, rating, and timestamp
    item_list (pd.DataFrame)            : MovieLens movies.csv DataFrame containing movieId, title, and genres
    include_timestamp (bool)            : Include timestamp in output

    Output
    df_table : DataFrame indexed by user_id (dense, zero-based) containing rating,
               timestamp (optional), and item_id (row label of the movie in item_list)

    Raises
    IndexError : if a rated movieId does not appear in item_list
    """
    df_table = transaction_list.rename(columns={'userId': 'user_id'})

    # create new index based on zero (users numbered in sorted order of userId)
    df_table['user_id'] = df_table.groupby('user_id').ngroup()
    df_table = df_table.set_index('user_id')

    # replace movieId with item_id: one lookup table instead of scanning the
    # item table once per rating. The first occurrence wins when a movieId is
    # duplicated in item_list.
    movie_to_item = pd.Series(item_list.index,
                              index=item_list['movieId'].to_numpy())
    movie_to_item = movie_to_item[~movie_to_item.index.duplicated(keep='first')]
    item_ids = df_table['movieId'].map(movie_to_item)

    unmatched = item_ids.isna().to_numpy()
    if unmatched.any():
        unknown = sorted(set(df_table['movieId'].to_numpy()[unmatched].tolist()))
        raise IndexError(
            f'movieId values not found in item_list: {unknown}')

    df_table['item_id'] = item_ids.to_numpy().astype(int)
    df_table = df_table.drop(columns='movieId')

    # normalize ratings per each user
    per_user = df_table.groupby(level='user_id')['rating']
    lowest = per_user.transform('min')
    highest = per_user.transform('max')
    norm_ratings = (df_table['rating'] - lowest) / (highest - lowest)
    df_table['rating'] = norm_ratings * 5.0

    # option to include/remove timestamp column
    if not include_timestamp:
        df_table = df_table.drop(columns='timestamp')

    return df_table
    
    

### User Feature Table

In [4]:
def create_user_feature_table(transaction_list, item_list, rating=0, by_genre=True):
    """
    Create a user-feature table from the transaction_list and item_list
    Can return a 1-hot encoding of the items associated with each user, or the
    number of rated items per genre

    Possible genres include: 'IMAX', 'Adventure', 'Mystery', 'Animation', 'Documentary', 'Comedy',
       'Western', 'War', 'Film-Noir', 'Crime', 'Drama', 'Thriller', 'Fantasy',
       'Action', 'Sci-Fi', 'Children', 'Romance', 'Horror', 'Musical',
       '(no genres listed)'

    Input:

    transaction_list (pd.DataFrame)  : MovieLens ratings.csv DataFrame containing userId, movieId, rating, and timestamp
    item_list (pd.DataFrame)         : MovieLens movies.csv DataFrame containing movieId, title, and genres
    rating (float)                   : Keep only transactions whose normalized (0-5) rating is greater than or
                                       equal to this value; values above 5.0 are clamped to 5.0 (default = 0, no filter)
    by_genre (bool)                  : use genre/category counts as user features, else use 1-hot encoding of items (default = True)

    Output
    user_feature : DataFrame indexed by user_id with one column per genre (counts) or per item_id (0/1).
                   Users with no transaction left after the rating filter are not included.
    """

    transaction_table = create_transaction_table(transaction_list, item_list)
    # for user-defined rating, filter dataframe based on input-rating
    if rating > 0:
        rating = min(rating, 5.0)
        transaction_table = transaction_table[transaction_table['rating'] >= rating]

    item_feature = create_item_feature_table(item_list)

    if by_genre:
        category_list = sorted(set().union(*item_feature['genres']))
    else:
        # item 1-hot encoding
        category_list = sorted(item_feature.index)

    user_item_list = transaction_table.groupby('user_id')['item_id'].apply(list)
    user_feature = pd.DataFrame(
        0,
        index=pd.Index(sorted(user_item_list.index), name='user_id'),
        columns=category_list)

    for user, items in user_item_list.items():
        if by_genre:
            # item_id values are row *labels* of item_feature, so look them up
            # with .loc; .iloc is only equivalent for a default 0..n-1 index.
            genre_counts = Counter(
                genre
                for genres in item_feature.loc[items, 'genres']
                for genre in genres)
            if genre_counts:
                # Materialise the dict views: .loc needs real list-likes.
                user_feature.loc[user, list(genre_counts.keys())] = list(
                    genre_counts.values())
        else:
            user_feature.loc[user, items] = 1

    return user_feature
    
    

## Clustering

### User Clustering

In [5]:
def u_cluster(fname, model_fname, random_state=None, drop_cols=[], 
              u_clusters=20, u_method='kmeans', **kwargs):
    """
    u_cluster

    Perform user-wise hard clustering on the rows of the utility matrix and
    pickle the fitted clusterer to `model_fname`.


    fname        : pandas DataFrame or string
                   The initial utility matrix with each row corresponding
                   to a user, or the path of a CSV file holding it. A
                   DataFrame argument is copied, never modified.

    model_fname  : string
                   Path of the file the fitted model is pickled to

    random_state : int
                   The state to be used by the clustering algorithm to ensure
                   the consistency of results across runs (used by 'kmeans'
                   and 'spectral' only)

    drop_cols    : list or None
                   Columns to be dropped in fname before clustering

    u_clusters   : int
                   Number of clusters to be used for hard clustering of users
                   (ignored by 'dbscan' and 'optics')

    u_method     : string
                   One of 'kmeans', 'spectral', 'ward', 'single', 'complete',
                   'average', 'dbscan', 'optics', 'birch'

    **kwargs     : forwarded to the estimator constructor for every method
                   except 'kmeans' and 'spectral', which do not receive them

    Returns
    -------
    model  : fitted sklearn clusterer
    result : dict mapping each row label to its cluster id
    df     : pandas DataFrame, the clustered matrix with an added
             'u_cluster' column

    Raises
    ------
    ValueError : if u_method is not one of the supported names
    """
    # Constructors are wrapped in lambdas so only the selected estimator is
    # ever instantiated.
    builders = {
        'kmeans': lambda: KMeans(n_clusters=u_clusters,
                                 random_state=random_state),
        'spectral': lambda: SpectralClustering(u_clusters,
                                               random_state=random_state),
        'ward': lambda: AgglomerativeClustering(n_clusters=u_clusters,
                                                **kwargs),
        'single': lambda: AgglomerativeClustering(n_clusters=u_clusters,
                                                  linkage='single', **kwargs),
        'complete': lambda: AgglomerativeClustering(n_clusters=u_clusters,
                                                    linkage='complete',
                                                    **kwargs),
        'average': lambda: AgglomerativeClustering(n_clusters=u_clusters,
                                                   linkage='average',
                                                   **kwargs),
        'dbscan': lambda: DBSCAN(**kwargs),
        'optics': lambda: OPTICS(**kwargs),
        'birch': lambda: Birch(n_clusters=u_clusters, **kwargs),
    }
    # Fail before any file is read or written.
    if u_method not in builders:
        raise ValueError(
            f'Unknown u_method {u_method!r}; expected one of '
            f'{sorted(builders)}')

    if isinstance(fname, str):
        df = pd.read_csv(fname)
    else:
        # Work on a copy so the 'u_cluster' column never leaks into the
        # caller's DataFrame.
        df = fname.copy()

    if drop_cols is not None:
        df = df.drop(columns=drop_cols)

    u_clusterer = builders[u_method]()
    df['u_cluster'] = u_clusterer.fit_predict(df)

    result = dict(df['u_cluster'])
    with open(model_fname, 'wb') as f:
        pickle.dump(u_clusterer, f)
    return u_clusterer, result, df

### Item Clustering

In [6]:
def i_cluster(fname, model_fname, random_state=None, drop_cols=[],
              i_clusters=20, i_method='kmeans', **kwargs):
    """
    i_cluster

    Perform item-wise hard clustering on the columns of the utility matrix
    (the matrix is transposed so that each item becomes a sample) and pickle
    the fitted clusterer to `model_fname`.


    fname        : pandas DataFrame or string
                   The initial utility matrix with each row corresponding
                   to a user and each column to an item, or the path of a
                   CSV file holding it

    model_fname  : string
                   Path of the file the fitted model is pickled to

    random_state : int
                   The state to be used by the clustering algorithm to ensure
                   the consistency of results across runs (used by 'kmeans'
                   and 'spectral' only)

    drop_cols    : list or None
                   Columns to be dropped in fname before clustering

    i_clusters   : int
                   Number of clusters to be used for hard clustering of items
                   (ignored by 'dbscan' and 'optics')

    i_method     : string
                   One of 'kmeans', 'spectral', 'ward', 'single', 'complete',
                   'average', 'dbscan', 'optics', 'birch'

    **kwargs     : forwarded to the estimator constructor for every method
                   except 'kmeans' and 'spectral', which do not receive them

    Returns
    -------
    model         : fitted sklearn clusterer

    result        : dict mapping each item label to its cluster id

    df_items      : pandas DataFrame, the transposed matrix (items as rows)
                    with an added 'i_cluster' column

    Raises
    ------
    ValueError : if i_method is not one of the supported names
    """
    # Constructors are wrapped in lambdas so only the selected estimator is
    # ever instantiated.
    builders = {
        'kmeans': lambda: KMeans(n_clusters=i_clusters,
                                 random_state=random_state),
        'spectral': lambda: SpectralClustering(i_clusters,
                                               random_state=random_state),
        'ward': lambda: AgglomerativeClustering(n_clusters=i_clusters,
                                                **kwargs),
        'single': lambda: AgglomerativeClustering(n_clusters=i_clusters,
                                                  linkage='single', **kwargs),
        'complete': lambda: AgglomerativeClustering(n_clusters=i_clusters,
                                                    linkage='complete',
                                                    **kwargs),
        'average': lambda: AgglomerativeClustering(n_clusters=i_clusters,
                                                   linkage='average',
                                                   **kwargs),
        'dbscan': lambda: DBSCAN(**kwargs),
        'optics': lambda: OPTICS(**kwargs),
        'birch': lambda: Birch(n_clusters=i_clusters, **kwargs),
    }
    # Fail before any file is read or written.
    if i_method not in builders:
        raise ValueError(
            f'Unknown i_method {i_method!r}; expected one of '
            f'{sorted(builders)}')

    if isinstance(fname, str):
        df = pd.read_csv(fname)
    else:
        df = fname

    if drop_cols is not None:
        df = df.drop(columns=drop_cols)

    # Items are the samples to cluster, so they must be the rows.
    df_items = df.T

    i_clusterer = builders[i_method]()
    df_items['i_cluster'] = i_clusterer.fit_predict(df_items)

    result = dict(df_items['i_cluster'])
    with open(model_fname, 'wb') as f:
        pickle.dump(i_clusterer, f)
    return i_clusterer, result, df_items

### Aggregating

In [7]:
def cluster_assignment(dictionary, data_name='user_id'):
    """
    Turn an {id: cluster} mapping into a one-column DataFrame indexed by id.


    dictionary     : dictionary
                     id -> cluster label, as returned by the clustering
                     functions


    data_name      : string
                     name given to the index; 'user_id' yields a 'ucluster'
                     column, any other value yields an 'icluster' column

    Returns
    -------
    result        : dataframe of cluster assignments

    """
    cluster_name = 'ucluster' if data_name == 'user_id' else 'icluster'
    pairs = list(dictionary.items())
    table = pd.DataFrame(pairs, columns=[data_name, cluster_name])
    return table.set_index(data_name)


In [70]:
def utility_matrix_agg(df_u, df_i, u_agg='sum', i_agg='sum'):
    """
    Aggregates the results of the clustering with respect to item clusters and user clusters.


    df_u    : dataframe with one row per user and one column per item, plus a 'u_cluster' column

    df_i    : dataframe with one row per item and an 'i_cluster' column (other columns are not read)

    Item clusters are matched to the item columns of df_u by label when every
    item column of df_u appears in the (unique) index of df_i; otherwise the
    previous behaviour is kept and df_i rows are matched by position against
    integer item labels 0..n-1.

    ------
    Methods : 'sum', 'mean', or a callable
    u_agg   : applied first, within each user cluster: collapses the items of
              every item cluster, separately for each user (groupby on
              'i_cluster'; a callable is passed to groupby.apply)

    i_agg   : applied second: collapses the users of the user cluster
              (row-wise; a callable is passed to DataFrame.apply(axis=1))

    -----
    Returns : utility matrix with user clusters as rows (index named 'u_cluster') and
              item clusters as columns; combinations with no data are 0

    Raises  : ValueError if u_agg or i_agg is a string other than 'sum' or 'mean'
    """
    for arg_name, agg in (('u_agg', u_agg), ('i_agg', i_agg)):
        if isinstance(agg, str) and agg not in ('sum', 'mean'):
            raise ValueError(
                f"{arg_name} must be 'sum', 'mean' or a callable, got {agg!r}")

    i_series = df_i['i_cluster']
    item_labels = df_u.columns.drop('u_cluster')
    if i_series.index.is_unique and item_labels.isin(i_series.index).all():
        # Labels line up: join on them so string labels or reordered rows
        # still get the right cluster.
        item_clusters = i_series
    else:
        # Legacy positional join against integer item labels.
        item_clusters = i_series.reset_index(drop=True)

    u_feats = {}
    # Group once; iteration yields the user clusters in sorted order.
    for u_id, members in df_u.groupby('u_cluster'):
        sub_df = members.drop(columns=['u_cluster']).T
        sub_df = sub_df.merge(item_clusters, left_index=True,
                              right_index=True)
        by_item_cluster = sub_df.groupby('i_cluster')

        if isinstance(u_agg, str):
            df_grp = getattr(by_item_cluster, u_agg)()
        else:
            df_grp = by_item_cluster.apply(u_agg)

        if isinstance(i_agg, str):
            df_grp = getattr(df_grp, i_agg)(axis=1)
        else:
            df_grp = df_grp.apply(i_agg, axis=1)

        u_feats[u_id] = df_grp

    if u_feats:
        # One outer-joined concat instead of merging column by column.
        utility_matrix = pd.concat(u_feats, axis=1).sort_index().fillna(0).T
    else:
        utility_matrix = pd.DataFrame()
    utility_matrix.index = utility_matrix.index.rename('u_cluster')
    return utility_matrix

## Modeling

### Preprocess Utility Matrix

In [9]:
def mean_center_utilmat(U, axis=1, fillna=True, fill_val=None):
    """Gets the mean-centered utility matrix

    Parameters:
        U (DataFrame) : utilily matrix (rows are users, columns are items) 
        axis (int/str) : The axis along mean is evaluated, 
            {0/'index', 1/'columns'}, default 1 (each row is centered on
            its own mean; missing values are skipped when averaging)
        fillna (bool) : Indicates whether missing/null values are to be filled
        fill_val (None/float) : Value to be used to fill null values when 
            fillna==True, default None (nulls are filled with 0)

    Returns:
        U (DataFrame): mean-centered utility matrix

    Raises:
        ValueError : if axis is not one of 0, 1, 'index', 'columns'
    """
    axis_numbers = {0: 0, 'index': 0, 1: 1, 'columns': 1}
    if axis not in axis_numbers:
        raise ValueError(
            f"axis must be 0, 1, 'index' or 'columns', got {axis!r}")
    axis_num = axis_numbers[axis]

    # Means taken along `axis_num` are labelled by the *other* axis, so the
    # subtraction has to align on that other axis.
    mean_centered = U.sub(U.mean(axis=axis_num), axis=1 - axis_num)
    if not fillna:
        return mean_centered
    return mean_centered.fillna(0 if fill_val is None else fill_val)


def split_utilmat_label_features(U, label_index, axis=1):
    """Splits utility matrix into label (column/row where ratings are predicted) 
    and features (columns/rows to be used as input in the model)

    Parameters:
        U (DataFrame) : utilily matrix (rows are users, columns are items) 
        label_index : column name or index corresponding to  item ratings (column)
            or user ratings (row) to be predicted
        axis (int/str) : The axis along the utility matrix is split, 
            {0/'index', 1/'columns'}, default 1

    Returns:
        label_df (DataFrame) : contains the column/row to be predicted
            (empty when label_index is not present)
        feature_df (DataFrame) : contains the features   

    Raises:
        ValueError : if axis is not one of 0, 1, 'index', 'columns'
    """
    # Boolean masks (rather than label lists) keep each column/row exactly
    # once even when labels are duplicated.
    if axis in (1, 'columns'):
        is_label = U.columns == label_index
        return U.loc[:, is_label], U.loc[:, ~is_label]
    if axis in (0, 'index'):
        is_label = U.index == label_index
        return U.loc[is_label, :], U.loc[~is_label, :]
    raise ValueError(f"axis must be 0, 1, 'index' or 'columns', got {axis!r}")


def known_missing_split_1d(label_data, feature_data, split_axis=1,
                           missing_val_filled=False, fill_val=None):
    """Returns labels of the dataset corresponding to known and missing ratings
    in the label data (row or column to be predicted)

    Parameters:
        label_data (DataFrame) : contains the column/row to be predicted
        feature_data (DataFrame) : contains the features  
        split_axis (int) : The axis along the utility matrix is split, 
            {0/'index', 1/'columns'}, default 1. Any other value is treated
            as 1.
        missing_val_filled (bool) : Indicates whether missing/null values 
            in the label/feature data were filled
        fill_val (None/float) : Value used to fill the null values when 
            missing_val_filled==True, default None (0 is used)

    Returns:
        known : labels corresponding to known ratings - row labels of
            feature_data for split_axis=1, column labels for split_axis=0
        missing : labels corresponding to missing/unknown ratings, taken
            from the same axis
    """
    if missing_val_filled:
        sentinel = 0 if fill_val is None else fill_val
        missing_vals = (label_data == sentinel).values.flatten()
    else:
        missing_vals = label_data.isnull().values.flatten()

    # With split_axis=0 the label is a row, so its entries line up with the
    # columns of feature_data; the row index would be identical for the known
    # and the missing group and carry no information.
    if split_axis == 0:
        labels = feature_data.columns
    else:
        labels = feature_data.index

    return labels[~missing_vals], labels[missing_vals]


def known_missing_split_U(U, split_axis=1, missing_val_filled=False,
                          fill_val=None):
    """Returns index of the dataset corresponding to known and missing ratings
    for the whole utility matrix

    Parameters:
        U (DataFrame) : utilily matrix (rows are users, columns are items) 
        split_axis (int) : The axis along the utility matrix is split, 
            {0/'index', 1/'columns'}, default 1. With 1 the result is keyed
            by column and holds row labels; with 0 it is keyed by row and
            holds column labels. Any other value prints a notice and is
            treated as 1.
        missing_val_filled (bool) : Indicates whether missing/null values 
            in the label/feature data were filled
        fill_val (None/float) : Value used to fill the null values when 
            missing_val_filled==True, default None (0 is used)

    Returns:
        known_idx (dict): keys are the column name/index to be predicted, 
            values are index of the utility matrix that contains known values
        missing_idx (dict): keys are the column name/index to be predicted, 
            values are index of the utility matrix that contains missing values
    """
    if missing_val_filled:
        sentinel = 0 if fill_val is None else fill_val
        is_missing = (U == sentinel).to_numpy()
    else:
        is_missing = U.isnull().to_numpy()

    if split_axis == 0:
        keys, labels = U.index, U.columns
    else:
        if split_axis != 1:
            print('Invalid axis. Result for axis=1 is returned.')
        keys, labels = U.columns, U.index
        # One mask row per column of U.
        is_missing = is_missing.T

    known_idx = {key: labels[~mask] for key, mask in zip(keys, is_missing)}
    missing_idx = {key: labels[mask] for key, mask in zip(keys, is_missing)}
    return known_idx, missing_idx

In [10]:
def nan_mask(U, p=0.2):
    """Hide a random fraction of the entries of an array behind NaN.

    Parameters:
        U (array) : values to mask; it is not modified
        p (float) : fraction of entries to hide; int(U.size * p) positions
            are drawn without replacement from numpy's global random state,
            default 0.2

    Returns:
        array of the same shape as U with the drawn positions set to NaN
    """
    n_hidden = int(U.size*p)
    hidden_positions = np.random.choice(U.size, size=n_hidden, replace=False)
    keep = np.ones(np.shape(U))
    keep.flat[hidden_positions] = np.nan
    return U*keep

In [11]:
def gen_missing_ratings(U_df, p=0.2, n_masks=10):
    """Build several randomly masked copies of a utility matrix.

    Parameters:
        U_df (DataFrame) : utility matrix to mask
        p (float) : fraction of entries hidden in each copy, passed to
            nan_mask, default 0.2
        n_masks (int) : number of masked copies to generate, default 10

    Returns:
        list of n_masks DataFrames sharing the index and columns of U_df,
        each with its own independently drawn NaN positions
    """
    ratings = U_df.values
    row_labels, col_labels = U_df.index, U_df.columns
    return [
        pd.DataFrame(nan_mask(ratings, p=p),
                     columns=col_labels,
                     index=row_labels)
        for _ in range(n_masks)
    ]

### Non-clustered Data

In [12]:
def initialize_models_itemwise(model, U, suffix='model'):
    """Initializes classifier/regressor per item to be predicted

    Every item gets its own deep copy of `model`, so fitting the model of one
    item never overwrites the model of another.

    Parameters:
        model : model object to use to fit the data (used as a template and
            left untouched)
        U (DataFrame) : utilily matrix (rows are users, columns are items) 
        suffix (str) : suffix for keys in output dictionary, default 'model'
            (no separator is inserted between the item label and the suffix)

    Returns:
        models (dict): dictionary of models, keys correspond to columns/items 
        in the utility matrix and values are the model objects
    """
    models = {f'{item}{suffix}': copy.deepcopy(model) for item in U.columns}
    return models


def initialize_models_userwise(model, U, suffix='_model'):
    """Initializes classifier/regressor per user to be predicted

    Every user gets its own deep copy of `model`, so fitting the model of one
    user never overwrites the model of another.

    Parameters:
        model : model object to use to fit the data (used as a template and
            left untouched)
        U (DataFrame) : utilily matrix (rows are users, columns are items) 
        suffix (str) : suffix for keys in output dictionary, default '_model'

    Returns:
        models (dict): dictionary of models, keys correspond to the rows/users 
            in the utility matrix and values are the model objects
    """
    models = {f'{user}{suffix}': copy.deepcopy(model) for user in U.index}
    return models


def _resolve_scaling_factor(scaling_method, rating_min, rating_max):
    """Returns the divisor used to scale a convergence metric."""
    if scaling_method == 'max':
        if rating_max is None:
            raise ValueError('Scaled metric needs maximum possible value '
                             'of rating.')
        scaling_factor = rating_max
    elif scaling_method == 'minmax':
        if (rating_max is None) or (rating_min is None):
            raise ValueError('Scaled metric needs maximum and minimum '
                             'possible values of rating.')
        scaling_factor = (rating_max - rating_min)
    else:
        raise ValueError(
            f"scaling_method must be 'max' or 'minmax', "
            f"got {scaling_method!r}")
    if scaling_factor == 0:
        raise ValueError('Scaling factor is zero; check rating_min and '
                         'rating_max.')
    return scaling_factor


def eval_convergence_criterion(
        pred_curr, pred_prev, stopping_criterion='mse',
        mse_threshold=0.1, stdev_threshold=None,
        scaled=False, scaling_method='max',
        rating_min=None, rating_max=None):
    """
    Evaluates whether the model training has converged

    Parameters:
        pred_curr (array) : array of predicted ratings from current iteration
        pred_prev (array) : array of predicted ratings from previous iteration
        stopping_criterion (str) : metric for evaluating convergence, 
            {mse/'mean squared error', stdev_abs/'standard deviation of 
            absolute difference'}, default 'mse'. Any other value falls back
            to the unscaled MSE compared against mse_threshold.
        mse_threshold (float) : threshold for stopping criterion when 
            'mse'is selected, default 0.1            
        stdev_threshold (float) : threshold for stopping criterion when 
            'stdev_abs'is selected, default None
        scaled (bool) : Indicates whether metric for stopping criterion is 
            to be scaled/normalized (divided by the scaling factor)
        scaling_method (str) : indicates method for scaling when scaled==True, 
            {max/'maximum rating', minmax/'maximum rating - minimum rating'},
            default 'max'
        rating_min (numeric) : minimum value of rating, default None
        rating_max (numeric) : maximum value of rating, default None

    Returns:
        metric (float) : value of metric
        stop_train (bool) : Indicates convergence (stop training when True)

    Raises:
        ValueError : if the threshold of the selected criterion is None, or
            if scaling is requested with an unknown scaling_method, a missing
            rating bound, or a zero scaling factor

    """
    use_stdev = (stopping_criterion == 'stdev_abs')

    # Reject a missing threshold up front; comparing the metric with None
    # would only fail later with an unrelated TypeError.
    threshold = stdev_threshold if use_stdev else mse_threshold
    if threshold is None:
        if use_stdev:
            raise ValueError(
                'Threshold for calculating standard deviation of absolute '
                'error is not defined. Input threshold value.')
        raise ValueError('Threshold for calculating MSE is not defined. '
                         'Input threshold value.')

    if use_stdev:
        metric = np.std(np.abs(np.asarray(pred_curr) - np.asarray(pred_prev)))
    else:
        metric = mean_squared_error(pred_curr, pred_prev)

    # The fallback branch (unrecognised criterion) has never applied scaling.
    if scaled and stopping_criterion in ('mse', 'stdev_abs'):
        metric = metric / _resolve_scaling_factor(
            scaling_method, rating_min, rating_max)

    stop_train = (metric <= threshold)
    return metric, stop_train


def train_model_itemwise(
        U_df, model_object, return_models=True, max_iter=100,
        stopping_criterion='mse', mse_threshold=0.1, stdev_threshold=None,
        scaled=False, scaling_method='max', rating_min=None, rating_max=None):
    """
    Trains model iteratively for the item-wise recommender system: 
    (1) Estimates the missing entries of each column/item by setting it as 
    the target variable and the remaining columns as the feature variables. 
    (2) For the remaining columns, the current set of filled in values are 
    used to create a complete matrix of feature variables. 
    (3) The observed ratings in the target column are used for training. 
    (4) The missing entries are updated based on the prediction of the model 
    on each target column. 

    Entries equal to 0 are treated as missing, i.e. the matrix is expected
    to have had its missing values filled with 0 beforehand.

    Parameters:
        U_df (DataFrame) : utilily matrix (rows are users, columns are items) 
        model_object : model object to use to fit the data
        return_models (bool) : Indicates whether trained models are returned 
            as output, default True
        max_iter (int) : maximum number of iterations for model training and 
            updating of missing values, default 100
        stopping_criterion (str) : metric for evaluating convergence, 
            {mse/'mean squared error', stdev_abs/'standard deviation of 
            absolute difference'}, default 'mse'
        mse_threshold (float) : threshold for stopping criterion when 
            'mse'is selected, default 0.1            
        stdev_threshold (float) : threshold for stopping criterion when 
            'stdev_abs'is selected, default None
        scaled (bool) : Indicates whether metric for stopping criterion is 
            to be scaled/normalized
        scaling_method (str) : indicates method for scaling when scaled==True, 
            {max/'maximum rating', minmax/'maximum rating - minimum rating'},
            default 'max'
        rating_min (numeric) : minimum value of rating, default None
        rating_max (numeric) : maximum value of rating, default None

    Returns:
        U_update (DataFrame) : complete utility matrix
        metric_iter (array-like) : value of convergence metric per iteration
            (empty when the matrix has no missing entry)
        models_item (dict) : dictionary of trained models, returned only if
            return_models=True
    """
    U_update = U_df.copy()

    models_item = initialize_models_itemwise(model_object, U_df, suffix='')

    known_index, missing_index = known_missing_split_U(
        U_df, split_axis=1, missing_val_filled=True)

    n_missing = sum(len(rows) for rows in missing_index.values())

    # Only the previous iteration is needed for the convergence test; the
    # first comparison is made against the all-zero fill values.
    preds_prev = np.zeros(n_missing)
    metric_iter = []

    for _ in range(max_iter):
        preds = []
        for item in U_df.columns:
            known_rows = known_index[item]
            missing_rows = missing_index[item]

            if len(known_rows) == 0:
                # Nothing observed to learn from: keep the current values so
                # the prediction vector keeps its length.
                preds.append(U_update.loc[missing_rows, item].to_numpy())
                continue

            features = U_update.drop(item, axis=1)
            model = models_item[str(item)]
            model.fit(features.loc[known_rows],
                      U_update.loc[known_rows, item])

            if len(missing_rows) > 0:
                pred = model.predict(features.loc[missing_rows])
                U_update.loc[missing_rows, item] = pred
            else:
                pred = np.array([])
            preds.append(pred)

        preds_curr = np.hstack(preds) if preds else np.array([])
        if preds_curr.size == 0:
            # No missing entry anywhere: one fitting pass is all there is.
            break

        # Keep the criterion name and the convergence flag in separate
        # variables so later iterations still use the requested criterion.
        metric, converged = eval_convergence_criterion(
            preds_curr,
            preds_prev,
            stopping_criterion=stopping_criterion,
            mse_threshold=mse_threshold,
            stdev_threshold=stdev_threshold,
            scaled=scaled,
            scaling_method=scaling_method,
            rating_min=rating_min,
            rating_max=rating_max)
        metric_iter.append(metric)
        if converged:
            break
        preds_prev = preds_curr

    if return_models:
        return U_update, metric_iter, models_item
    else:
        return U_update, metric_iter

### Clustered Data

In [13]:
def train_model_itemwise_cluster(Uc_df, n_synth_data=100, p=0.3,
                                 model_object=None):
    """
    Imputes a (clustered) utility matrix by averaging item-wise models
    trained on randomly masked copies of it.

    Each copy hides a fraction p of the entries, is mean-centered per row
    with missing values filled by 0, and is completed with
    train_model_itemwise. The completed, mean-centered copies are averaged
    entry-wise.

    Parameters:
        Uc_df (DataFrame) : utility matrix (rows are user clusters, columns
            are item clusters)
        n_synth_data (int) : number of masked copies to train on, default 100
        p (float) : fraction of entries hidden in each copy, default 0.3
        model_object : model object to use to fit the data; when None the
            notebook-level variable `mlp1` is used, as before, default None

    Returns:
        DataFrame with the index and columns of Uc_df holding the mean of the
        imputed mean-centered matrices

    Raises:
        ValueError : if model_object is None and no `mlp1` is defined
    """
    if model_object is None:
        try:
            # Legacy behaviour: fall back to the model defined in the notebook.
            model_object = mlp1
        except NameError:
            raise ValueError(
                'No model given: pass `model_object` or define a '
                'notebook-level `mlp1`.') from None

    synth_data = gen_missing_ratings(Uc_df, p=p, n_masks=n_synth_data)
    um_output = []
    for U_df in synth_data:
        U_df_mc = mean_center_utilmat(U_df, axis=1, fillna=True, fill_val=0)
        U_imputed, _metrics, _models = train_model_itemwise(
            U_df_mc, model_object, return_models=True)
        um_output.append(U_imputed)
    um_output = pd.concat(um_output)
    # Rows sharing a label come from different masked copies: average them.
    return um_output.groupby(um_output.index).mean()

## Get Recommendations

In [14]:
def get_rec(utility_matrix, utility_matrix_o, user_list, uc_assignment, top_n):

    """Returns the top N recommendations for each user in the user list.

    The inputs are not modified: masking of already-rated entries is done on
    a float copy of `utility_matrix`.

            Parameters:
                    utility_matrix (numpy.ndarray): Matrix of utilities for each user-item pairing (assumes that indices correspond to user_cluster_id and item_cluster_id)
                    utility_matrix_o (numpy.ndarray): Original utility matrix, before imputation; entries that are non-zero here are treated as already consumed/rated and are never ranked above unrated ones
                    user_list (array-like): List of users
                    uc_assignment (array-like): Cluster assignment of each user, looked up as uc_assignment[user_id]
                    top_n (int): Number of item clusters to recommend

            Returns:
                    df_rec (pandas.DataFrame): One row per user with columns 'user_id', 'rank_1', ..., 'rank_<top_n>'; rank columns hold item-cluster indices (stored as floats), best first

            Raises:
                    ValueError: if top_n exceeds the number of columns in utility_matrix

    """

    # Work on a float copy: the caller's matrix stays intact and -inf is
    # representable even when the input has an integer dtype.
    scores = np.array(utility_matrix, dtype=float)

    # Don't recommend items that are already rated
    scores[np.asarray(utility_matrix_o) != 0] = -np.inf

    if top_n > scores.shape[1]:
        raise ValueError(
            "top_n (%d) exceeds the number of item clusters (%d)"
            % (top_n, scores.shape[1]))

    # Column indices per user cluster, sorted from best to worst score
    best_first = np.argsort(scores, axis=1)[:, ::-1]

    # Cluster of every requested user, then that cluster's ranking
    user_clusters = np.asarray(
        [uc_assignment[user] for user in user_list], dtype=np.intp)
    user_rankings = best_first[user_clusters]

    # Create recommendation table
    df_rec = pd.DataFrame()
    df_rec['user_id'] = user_list
    for i in range(top_n):
        # Kept as float to match the dtype this function has always returned
        df_rec['rank_' + str(i + 1)] = user_rankings[:, i].astype(float)

    return df_rec

## Trying the entire pipeline

### Process Inputs

In [15]:
# Load the movies table from the sample data folder and preview the first rows.
item_list = pd.read_csv("sample_data/movies.csv")
item_list.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [16]:
# Load the ratings table from the sample data folder and preview the first rows.
transaction_list = pd.read_csv("sample_data/ratings.csv")
transaction_list.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [17]:
# Build the item feature table with one-hot genre columns, then keep only
# those genre columns by dropping the descriptive ones.
item_feature_table = create_item_feature_table(
    item_list, encode_genre=True
).drop(columns=["title", "genres", "movieId", "year"])
item_feature_table.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
# Build the transaction table from the raw ratings and movies tables
# (timestamps are left out: include_timestamp keeps its default) and preview it.
transaction_table = create_transaction_table(transaction_list, item_list)
transaction_table.head()

Unnamed: 0_level_0,rating,item_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3.75,0
0,3.75,2
0,3.75,5
0,5.0,43
0,5.0,46


In [19]:
# Build the user feature table with by_genre=False; the preview shows one
# column per item id, indexed by user_id.
user_feature = create_user_feature_table(transaction_list, item_list, by_genre=False)
user_feature.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Clustering Model

In [20]:
# Cluster users into 50 groups and items into 10 groups with the "kmeans" option.
# NOTE(review): no random seed is passed here; unless u_cluster / i_cluster fix
# one internally, cluster ids may differ between runs -- confirm.
x_u, y_u, df_u = u_cluster(user_feature,"kmeans", u_clusters=50)
x_i, y_i, df_i = i_cluster(item_feature_table,'kmeans', i_clusters=10)

In [21]:
# Turn the clustering outputs y_u / y_i into assignment tables whose index is
# named 'user_id' and 'item_id' respectively.
uc_assignment = cluster_assignment(y_u, data_name='user_id')
ic_assignment = cluster_assignment(y_i, data_name='item_id')

In [103]:
# Preview the first three item and user cluster assignments.
# NOTE(review): the item assignment preview is indexed by genre names rather
# than numeric item ids -- confirm this orientation is what i_cluster intends.
display(ic_assignment.head(3))
display(uc_assignment.head(3))

Unnamed: 0_level_0,icluster
item_id,Unnamed: 1_level_1
(no genres listed),1
Action,4
Adventure,7


Unnamed: 0_level_0,ucluster
user_id,Unnamed: 1_level_1
0,2
1,20
2,20


In [102]:
# Preview the clustered frames; each carries its cluster label as the last
# column (u_cluster / i_cluster).
display(df_u.head(3))
display(df_i.head(3))

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,9733,9734,9735,9736,9737,9738,9739,9740,9741,u_cluster
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,20
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,20


item_id,0,1,2,3,4,5,6,7,8,9,...,9733,9734,9735,9736,9737,9738,9739,9740,9741,i_cluster
(no genres listed),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Action,0,0,0,0,0,1,0,0,1,1,...,0,0,0,0,1,0,0,1,0,4
Adventure,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,7


In [72]:
# Aggregate the clustered user and item frames into a user-cluster by
# item-cluster utility matrix, using 'sum' for both aggregations.
# NOTE(review): `um` is also referenced as `um.mean_center_utilmat(...)` inside
# train_model_itemwise_cluster; rebinding `um` here would shadow that object
# for any later call of that function -- confirm and consider a distinct name.
um = utility_matrix_agg(df_u, df_i, u_agg='sum', i_agg='sum')
um

i_cluster,0,1,2,3,4,5,6,7,8,9
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4.0,61.0,13.0,1.0,1.0,9.0,10.0,0.0,8.0,3.0
1,1.0,6.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
2,17.0,74.0,18.0,10.0,3.0,7.0,8.0,7.0,17.0,14.0
3,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,3.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
5,0.0,4.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
6,2.0,22.0,3.0,0.0,1.0,2.0,4.0,0.0,1.0,2.0
7,0.0,4.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
8,0.0,5.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
9,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
