# 3 - Train Recommender

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import numpy as np
from keras.layers import *
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from livelossplot import PlotLossesKeras

### Helper functions

In [None]:
def one_hot_encode(df, columns):
    """Replace categorical columns with one-hot indicator columns.

    Args:
        df: dataframe containing the features to be encoded.
        columns: list of column names to be encoded.

    Returns:
        A new dataframe: ``df`` without ``columns``, followed by one 0/1
        column per (column, category) pair, named by the encoder
        (``x0_<category>``, ``x1_<category>``, ...).
    """
    encoder = OneHotEncoder()
    # Fit on the raw values so the generated names keep the positional
    # ``x<i>_`` prefix regardless of the installed scikit-learn version.
    encoded = encoder.fit_transform(df[columns].to_numpy()).toarray()

    # ``get_feature_names`` was replaced by ``get_feature_names_out`` in newer
    # scikit-learn releases; use whichever one this installation provides.
    name_getter = getattr(encoder, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = encoder.get_feature_names

    # Reuse df's index so the column-wise concat lines rows up one-to-one even
    # when df does not carry a default 0..n-1 index.
    ohe_features = pd.DataFrame(encoded, columns=name_getter(), index=df.index)

    combined = pd.concat([df, ohe_features], axis=1)
    return combined.drop(columns=columns)


def label_encode(df, columns):
    """Replace each listed column with integer codes from a LabelEncoder.

    df: dataframe containing features to be encoded (modified in place)
    columns: list of columns to be encoded

    The encoder is refit for every column, so codes are assigned per column.
    Returns the same dataframe object.
    """
    encoder = LabelEncoder()
    # Encode everything first, then write back, so df is only touched once
    # all columns have been encoded successfully.
    encoded = df[columns].apply(lambda series: encoder.fit_transform(series))
    df[columns] = encoded
    return df


def vectorize_text(df, column, vectorizer):
    """Swap a text column for a column of per-row feature vectors.

    df: dataframe containing text to be vectorized (modified in place)
    column: string name of text column
    vectorizer: scikit learn vectorizer - CountVectorizer or TfidfVectorizer

    Missing values are replaced by a single space before
    ``vectorizer.fit_transform`` is called on the list of texts. Each row of
    the dense result is stored in a new ``<column>_features`` column and the
    original text column is removed. Returns the same dataframe object.
    """
    documents = df[column].replace(np.nan, ' ').tolist()
    dense_matrix = vectorizer.fit_transform(documents).toarray()

    feature_column = column + '_features'
    df[feature_column] = [row for row in dense_matrix]  # one vector per row

    df.drop(columns=column, inplace=True)
    return df


def _parse_feature_list(cell):
    """Turn one cell into a frozenset of lower-cased feature names."""
    if isinstance(cell, str):
        # stringified list such as "['red', 'blue']"
        tokens = cell.strip('][').split(', ')
    elif isinstance(cell, (list, tuple, set, frozenset)):
        tokens = [str(token) for token in cell]
    elif pd.isna(cell):
        tokens = []
    else:
        raise TypeError('cannot parse %r as a list of features' % (cell,))
    names = (token.strip('\'').lower() for token in tokens)
    # An empty list ('[]') splits into a single empty token; drop it so it
    # does not become an indicator column with an empty name.
    return frozenset(name for name in names if name)


def vectorize_columns(df, columns):
    """Break list-valued columns out into one-hot-encoded indicator columns.

    For example, a column containing lists like ["red", "green", "blue"] is
    transformed into 3 columns with 0/1 indicators.

    Args:
        df: dataframe containing the columns to be vectorized. It is not
            modified; a copy is returned.
        columns: list of column names whose cells hold a list of features,
            either as a stringified list ("['red', 'blue']") or as an actual
            list/tuple/set. Missing cells are treated as empty lists.

    Returns:
        A new dataframe in which each listed column is replaced by one 0/1
        column per distinct feature. Feature names are stripped of single
        quotes and lower-cased, so case variants share one column, and the
        new columns are added in sorted order.

    Raises:
        TypeError: if a cell is neither a string, a list-like nor missing.
    """
    df = df.copy()
    for column in columns:
        parsed = df[column].apply(_parse_feature_list)
        df = df.drop(columns=[column])
        # Sorted so the column order is reproducible between runs.
        for name in sorted(set().union(*parsed)):
            df[name] = [int(name in features) for features in parsed]
    return df


def create_metadata_df(df, feature_columns, embedding_columns):
    """Assemble one flat, purely numeric-width metadata frame per row of df.

    Args:
        df: dataframe holding both scalar feature columns and vector columns.
        feature_columns: list of column names that contain single feature
            values.
        embedding_columns: list of column names that contain vector
            embeddings (image or text embeddings); every cell of such a
            column is expected to hold a vector of the same length.

    Returns:
        A dataframe with a fresh 0..n-1 index: the feature columns first,
        then each embedding column expanded into one column per vector
        component (labelled 0..k-1 within each embedding block).
    """
    # reset_index so the positional index of the expanded embeddings lines up
    # with the feature rows in the column-wise concat below.
    parts = [df[feature_columns].reset_index(drop=True)]
    for column in embedding_columns:
        stacked = np.vstack(df[column].tolist())
        parts.append(pd.DataFrame(stacked))
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(parts, axis=1)


def create_basic_network(n_items, n_users, n_factors):
    """Build a recommender with only user-item ratings and no user-item features.

    Args:
        n_items: size of the item embedding table.
        n_users: size of the user embedding table.
        n_factors: length of each embedding vector.

    Returns:
        A compiled Keras model taking ``[user_input, item_input]`` (in that
        order) and producing the dot product of the two embeddings, trained
        with mean squared error.
    """
    item_input = Input(shape=[1], name="Item-Input")
    item_embedding = Embedding(n_items, n_factors, name="Item-Embedding")(item_input)
    item_vec = Flatten(name="Flatten-Items")(item_embedding)

    user_input = Input(shape=[1], name="User-Input")
    user_embedding = Embedding(n_users, n_factors, name="User-Embedding")(user_input)
    user_vec = Flatten(name="Flatten-Users")(user_embedding)

    prod = Dot(name="Dot-Product", axes=1)([item_vec, user_vec])

    model = Model([user_input, item_input], prod)
    # The learning rate is passed positionally: the keyword was renamed from
    # `lr` to `learning_rate` across Keras releases, the position was not.
    model.compile(loss='mean_squared_error', optimizer=Adam(0.001))

    return model

## Load and preprocess data

In [None]:
# user-item-ratings
# Placeholder path: point it at the ratings csv before running this cell.
# The csv is loaded as-is into `ratings`.

user_item_ratings_file = 'path to csv with schema: item_id, user_id, rating'

ratings = pd.read_csv(user_item_ratings_file)

In [None]:
# item features
# Placeholder path: point it at the item csv before running this cell.

items_file = 'path to csv with schema: item_id, item_feature1, item_feature2, ..., item_featureN' 

items = pd.read_csv(items_file)
# keep the id plus the feature columns used further down
items = items[['item_id','color','category','item_gender','description']]  # sample columns in our dataset

In [None]:
# user features
# Placeholder path: point it at the user csv before running this cell.

user_file = 'path to csv with schema: user_id, user_feature1, user_feature2, ..., user_featureN' 
# Load first, then narrow to the columns used further down (the selection
# cannot run before `users` exists).
users = pd.read_csv(user_file)
users = users[['user_id','user_gender','colors','user_description']]  # sample columns in our dataset

In [None]:
# item image encoded vectors
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code - only load pickles from a trusted source.
# presumably holds `item_id` plus the `image_features` /
# `image_features_encoded` vector columns used further down - confirm
# against whatever produced the pickle.

images = pd.read_pickle('../data/image_vecs_encoded.pkl')

### Prepare item features

In [None]:
# add image features to item data
# pd.merge defaults to an inner join, so items without a matching row in
# `images` are dropped here.

items = pd.merge(items,images,on='item_id')

In [None]:
# encode item categorical features from strings to ints
item_cat_features = ['color', 'category', 'item_gender']  # TODO: replace with your categorical string features
items = label_encode(items, item_cat_features)

# vectorize item text descriptions
# 'description' is replaced by a 'description_features' column holding one
# TF-IDF vector per item; tf_vectorizer is fitted on the item descriptions.
tf_vectorizer = TfidfVectorizer()
items = vectorize_text(items, 'description', tf_vectorizer) 

### Prepare user features

In [None]:
# encode user categorical features
user_cat_features = ['user_gender']  # TODO: replace with your categorical string features
users = label_encode(users, user_cat_features)

# vectorize user features - split lists into one hot encoded columns
users = vectorize_columns(users, ['colors']) # sample column that contains lists in our dataset, e.g. ['blue', 'purple']

# if there is text associated with the user, vectorize it here (like a user request, profile description, or other)
# A dedicated vectorizer is used for user text: vectorize_text calls
# fit_transform, so passing tf_vectorizer again would refit it and throw away
# the vocabulary it learned from the item descriptions.
user_tf_vectorizer = TfidfVectorizer()
users = vectorize_text(users, 'user_description', user_tf_vectorizer)

### Add all metadata to ratings df

In [None]:
# Attach the prepared item and user columns to every rating row.
# pd.merge defaults to an inner join, so ratings whose item_id / user_id has
# no match in `items` / `users` are dropped.
ratings = pd.merge(ratings, items, on='item_id')
ratings = pd.merge(ratings, users, on='user_id')

## Train model

In [None]:
# Hold out 15% of the rating rows for evaluation; the fixed seed keeps the
# split repeatable between runs.
train, test = train_test_split(ratings, random_state=42, test_size=0.15)

# Distinct id counts over all ratings. The hybrid_recommender_* builders read
# these as globals to size their embedding tables.
# NOTE(review): an Embedding only accepts indices below its input_dim -
# presumably the ids are contiguous 0..n-1; confirm, or re-index them first.
n_items = ratings['item_id'].unique().size
n_users = ratings['user_id'].unique().size

In [None]:
def hybrid_recommender_v1(n_item_features, n_user_features, embedding_size):
    """Build the hybrid recommender: id embeddings plus projected metadata.

    The user and item ids are embedded, each metadata vector is projected to
    ``embedding_size`` with a Dense layer and added to the matching id
    embedding; the two sums are concatenated and passed through
    Dense(128, relu) -> Dropout(0.5) -> Dense(1).

    Reads the module-level ``n_users`` and ``n_items`` to size the embedding
    tables, so those must be defined before calling.

    Args:
        n_item_features: width of the item metadata input.
        n_user_features: width of the user metadata input.
        embedding_size: length of the id embeddings and metadata projections.

    Returns:
        A compiled Keras model (MSE loss, MAE metric) whose inputs are, in
        order: user id, item id, item metadata, user metadata.
    """
    user_id_input = Input(shape=[1], name='user')
    item_id_input = Input(shape=[1], name='item')
    item_meta_input = Input(shape=[n_item_features], name='item_features')
    user_meta_input = Input(shape=[n_user_features], name='user_features')

    user_embedding = Embedding(output_dim=embedding_size, input_dim=n_users, name='user_embedding')(user_id_input)
    item_embedding = Embedding(output_dim=embedding_size, input_dim=n_items, name='item_embedding')(item_id_input)
    item_metadata = Dense(units=embedding_size, name='item_metadata')(item_meta_input)
    user_metadata = Dense(units=embedding_size, name='user_metadata')(user_meta_input)

    # combine each id embedding with its metadata projection
    user_vec = Flatten()(user_embedding)
    item_vec = Flatten()(item_embedding)
    item_vec = Add()([item_vec, item_metadata])
    user_vec = Add()([user_vec, user_metadata])

    input_vec = Concatenate()([user_vec, item_vec])

    x = Dense(128, activation='relu')(input_vec)
    x = Dropout(0.5)(x)
    y = Dense(1)(x)

    model = Model(inputs=[user_id_input, item_id_input, item_meta_input, user_meta_input], outputs=y)
    # The learning rate is passed positionally: the keyword was renamed from
    # `lr` to `learning_rate` across Keras releases, the position was not.
    model.compile(loss='mse', optimizer=Adam(0.001), metrics=['mae'])
    return model

 
def hybrid_recommender_v2(n_item_features, n_user_features, embedding_size):
    """Build the hybrid recommender with a separate image-feature input.

    Same layout as v1, with two differences: Dropout(0.5) is applied to both
    id embeddings, and the item vector additionally sums in an image feature
    vector that is fed directly (no projection), so it must already have
    ``embedding_size`` components.

    Reads the module-level ``n_users`` and ``n_items`` to size the embedding
    tables, so those must be defined before calling.

    Args:
        n_item_features: width of the item metadata input.
        n_user_features: width of the user metadata input.
        embedding_size: length of the id embeddings, the metadata projections
            and the image feature input.

    Returns:
        A compiled Keras model (MSE loss, MAE metric) whose inputs are, in
        order: user id, item id, item metadata, user metadata, item image
        features.
    """
    # users
    user_id_input   = Input(shape=[1], name='user')
    user_meta_input = Input(shape=[n_user_features], name='user_features')

    user_embedding  = Embedding(output_dim=embedding_size, input_dim=n_users, name='user_embedding')(user_id_input)
    user_vec        = Flatten()(user_embedding)
    user_vec        = Dropout(0.5)(user_vec)
    user_metadata   = Dense(units=embedding_size, name='user_metadata')(user_meta_input)

    # items
    item_id_input   = Input(shape=[1], name='item')
    item_meta_input = Input(shape=[n_item_features], name='item_features')
    item_img_input  = Input(shape=[embedding_size], name='item_image_features') # autoencoded image features

    item_embedding  = Embedding(output_dim=embedding_size, input_dim=n_items, name='item_embedding')(item_id_input)
    item_vec        = Flatten()(item_embedding)
    item_vec        = Dropout(0.5)(item_vec)
    item_metadata   = Dense(units=embedding_size, name='item_metadata')(item_meta_input)

    # join features
    item_vec        = Add()([item_vec, item_metadata, item_img_input])
    user_vec        = Add()([user_vec, user_metadata])

    input_vec       = Concatenate()([user_vec, item_vec])

    # The dense head consumes input_vec directly. A stray Dropout that read
    # `x` before it was assigned used to sit here and made the function fail.
    x = Dense(128, activation='relu')(input_vec)
    x = Dropout(0.5)(x)
    y = Dense(1)(x)

    model = Model(inputs=[user_id_input, item_id_input, item_meta_input, user_meta_input, item_img_input], outputs=y)
    # The learning rate is passed positionally: the keyword was renamed from
    # `lr` to `learning_rate` across Keras releases, the position was not.
    model.compile(loss='mse', optimizer=Adam(0.001), metrics=['mae'])
    return model

### No autoencoded features - raw VGG16 embeddings

In [None]:
# metadata cols
# These lists name columns of `ratings` (and therefore of train / test) and
# are reused by the later training and prediction cells.
item_feature_cols = ['color','category','item_gender'] # item feature columns that contain a single value
item_embedding_cols = ['image_features','description_features'] # item feature columns that contain a list of embeddings - applicable to image or text embeddings
user_feature_cols = ['user_gender','rose gold','white','black','gray','gold','red','orange','natural','blue light'] # gender plus additional one-hot-encoded features
user_embedding_cols = ['user_description_features']

# prepare train & test inputs
# each call flattens the listed columns into one wide frame, one row per rating
train_item_metadata = create_metadata_df(train, item_feature_cols, item_embedding_cols)
test_item_metadata = create_metadata_df(test, item_feature_cols, item_embedding_cols)

train_user_metadata = create_metadata_df(train, user_feature_cols, user_embedding_cols)
test_user_metadata = create_metadata_df(test, user_feature_cols, user_embedding_cols)

In [None]:
# architecture v1
# Input widths are read from the prepared metadata frames so they always
# match what is fed to the network (6534 / 2797 for the sample dataset).
n_item_features = train_item_metadata.shape[1]
n_user_features = train_user_metadata.shape[1]
embedding_size = 256

model = hybrid_recommender_v1(n_item_features, n_user_features, embedding_size)

# Validation runs on the held-out test rows. No validation_split is passed:
# Keras ignores it whenever validation_data is supplied.
history = model.fit([train.user_id, train.item_id, train_item_metadata, train_user_metadata]
                    , train.rating
                    , batch_size=32, epochs=50
                    , validation_data=([test.user_id, test.item_id, test_item_metadata, test_user_metadata], test.rating)
                    , callbacks = [PlotLossesKeras()]
                    , shuffle=True)

### Using autoencoded features

In [None]:
# Architecture v1
# Same network as before, but the item metadata uses the autoencoded image
# vectors instead of the raw ones.
item_embedding_cols = ['image_features_encoded','description_features']

train_item_metadata = create_metadata_df(train, item_feature_cols, item_embedding_cols)
test_item_metadata = create_metadata_df(test, item_feature_cols, item_embedding_cols)

train_user_metadata = create_metadata_df(train, user_feature_cols, user_embedding_cols)
test_user_metadata = create_metadata_df(test, user_feature_cols, user_embedding_cols)

# Read the width from the frame just built so it cannot drift from the data
# (2694 for the sample dataset).
n_item_features = train_item_metadata.shape[1]

model = hybrid_recommender_v1(n_item_features, n_user_features, embedding_size)
# Validation runs on the held-out test rows. No validation_split is passed:
# Keras ignores it whenever validation_data is supplied.
history = model.fit([train.user_id, train.item_id, train_item_metadata, train_user_metadata]
                    , train.rating
                    , batch_size=32, epochs=100
                    , validation_data=([test.user_id, test.item_id, test_item_metadata, test_user_metadata], test.rating)
                    , callbacks = [PlotLossesKeras()]
                    , shuffle=True)

In [None]:
# Architecture v2
# The encoded image vectors are fed to the v2 network through their own
# input, so the item metadata only carries the description embedding here.
item_embedding_cols = ['description_features']

train_item_metadata = create_metadata_df(train, item_feature_cols, item_embedding_cols)
test_item_metadata = create_metadata_df(test, item_feature_cols, item_embedding_cols)

train_user_metadata = create_metadata_df(train, user_feature_cols, user_embedding_cols)
test_user_metadata = create_metadata_df(test, user_feature_cols, user_embedding_cols)

# Read the width from the frame just built so it cannot drift from the data
# (2438 for the sample dataset).
n_item_features = train_item_metadata.shape[1]

In [None]:
model = hybrid_recommender_v2(n_item_features, n_user_features, embedding_size)

# keep only the weights of the epoch with the lowest validation loss
best = ModelCheckpoint('../models/recommender.h5',
                        monitor='val_loss',
                        verbose=0,
                        save_best_only=True,
                        mode='auto')

# Validation runs on the held-out test rows. No validation_split is passed:
# Keras ignores it whenever validation_data is supplied.
# NOTE(review): the checkpoint is therefore selected on the test rows, so the
# reported test loss is optimistic - consider a separate validation set.
history = model.fit([train.user_id, train.item_id, train_item_metadata, train_user_metadata, np.vstack(train.image_features_encoded)]
                    , train.rating
                    , batch_size=32, epochs=100
                    , validation_data=([test.user_id, test.item_id, test_item_metadata, test_user_metadata, np.vstack(test.image_features_encoded)], test.rating)
                    , callbacks = [PlotLossesKeras(), best]
                    , shuffle=True)

## Generate Predictions

In [None]:
# load trained model
# same path the ModelCheckpoint callback writes to during v2 training
model = load_model('../models/recommender.h5') # model generated from v2 architecture

In [None]:
# prep model inputs
# every item is scored for one randomly chosen user
item_ids = items.item_id
num_items = len(item_ids)

# draw one user at random and keep that single row as a dataframe
user_idx = users.sample(1).index[0]
user_data = users.loc[[user_idx]]

# repeat the chosen user's id once per candidate item
user_id = user_data.user_id.values[0]
user_ids = np.array([user_id] * num_items)

# item metadata: one row per item; user metadata: the single user row
# repeated so both inputs have num_items rows
item_embedding_cols = ['description_features']
item_metadata = create_metadata_df(items, item_feature_cols, item_embedding_cols)
user_metadata = create_metadata_df(user_data, user_feature_cols, user_embedding_cols)
repeated_index = user_metadata.index.repeat(num_items)
user_metadata = user_metadata.loc[repeated_index]

In [None]:
# get predictions
# Every input must have one row per candidate item, so the image vectors are
# taken from `items` (one row per item), not from `ratings` (one row per
# rating).
preds = model.predict([user_ids, item_ids, item_metadata, user_metadata, np.vstack(items.image_features_encoded)])
# the model emits one value per row; flatten (n, 1) to (n,)
preds = preds[:, 0]

In [None]:
# rank items by predicted rating, highest first, and keep the positions of
# the top N (positions into preds / item_ids, not item ids)
num_recs = 10
rec_ids = np.argsort(-preds)[:num_recs]

In [None]:
# get recommended item ids and ratings from indices above
# rec_ids are positions, so item_ids is indexed positionally with .iloc
# rather than by index label.
recs = [(item_ids.iloc[x], preds[x]) for x in rec_ids] # list of tuples - (item id, predicted rating)