# Intro

This notebook implements an experiment aimed to verify accuracy of Deep Hybrid AutoencodeR Recommendation ENgine (DHARREN), on a dataset used in a [recent publication](https://github.com/MengtingWan/marketBias). 

# Technical prep

In [1]:
import pandas as pd
import io
import requests
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K

from scipy.sparse import lil_matrix, save_npz, load_npz
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


from tqdm.autonotebook import tqdm
from tqdm import trange

  


# Data prep

In [2]:
url="https://raw.githubusercontent.com/MengtingWan/marketBias/master/data/df_electronics.csv"
s=requests.get(url).content

In [3]:
raw_data = pd.read_csv(io.StringIO(s.decode('utf-8')), sep=",")

In [4]:
raw_data.head(3)

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,0,0,5.0,1999-06-13,Female,Portable Audio & Video,,1999,,0
1,0,1,5.0,1999-06-14,Female,Portable Audio & Video,,1999,,0
2,0,2,3.0,1999-06-17,Female,Portable Audio & Video,,1999,,0


In [5]:
raw_data.brand.unique()

array([nan, 'HP', 'Philips', 'Polaroid', 'Panasonic', 'JVC', 'Fujifilm',
       'Nikon', 'Kodak', 'Sony', 'Canon', 'Kensington', 'Pyle', 'Olympus',
       'Toshiba', 'Logitech', 'Etre Jeune', 'Linksys', 'Vivitar',
       'Sennheiser', 'Apple', 'Samsung', 'EldHus', 'Bose', 'Archos',
       'Garmin', 'Jabra', 'Gary Fong', 'ViewSonic', 'Savage', 'Uniden',
       'ebasy', 'Generic', 'JLAB', 'Skullcandy', 'TaoTronics', 'Neewer',
       'Koolertron', 'DURAGADGET', 'iRULU', 'Tiamat', 'DBPOWER', 'Fintie',
       'Plemo', 'EINCAR', 'Cooper Cases', 'LSS', 'Mpow', 'XShields',
       'IRULU', 'Funlux'], dtype=object)

In [6]:
raw_data.model_attr.fillna("missing", inplace=True)
raw_data.user_attr.fillna("missing", inplace=True)
raw_data.brand.fillna("missing", inplace=True)

In [7]:
max_uid = raw_data.user_id.max()
max_uid

1157632

Testing if data consist consecutive id numbers

In [8]:
unique_uid = pd.Series(raw_data.user_id.unique())
expected_ids = pd.Series(range(max_uid + 1))

pd.util.testing.assert_series_equal(unique_uid, expected_ids)

In [9]:
max_item_id = raw_data.item_id.max()
unique_iids = pd.Series(raw_data.item_id.unique())
expected_item_ids = pd.Series(range(max_item_id+1))

pd.util.testing.assert_series_equal(expected_item_ids, unique_iids)

In [10]:
feature_columns = ['user_attr', 'model_attr', 'brand']

In [11]:
encoded_data = pd.get_dummies(raw_data[feature_columns], sparse=True)
encoded_data['user_id'] = raw_data.user_id.values
encoded_data['item_id'] = raw_data.item_id.values

In [12]:
raw_data.sort_values(by=['user_id', 'item_id'], ascending=True, inplace=True)
encoded_data.sort_values(by=['user_id', 'item_id'], ascending=True, inplace=True)

In [13]:
features_matrix = encoded_data.drop(['user_id', 'item_id'], axis=1).to_numpy()

## Recreate ratings matrix if neccessary

In [13]:
recreate_matrix = False

In [14]:
if recreate_matrix:
    user_item_matrix = lil_matrix((max_uid+1, max_item_id+1), dtype=np.int8)

    for row_idx, row in raw_data.iterrows():
        uidx = row['user_id']
        iidx = row['item_id']
        rating = row['rating']    
        user_item_matrix[uidx, iidx] = rating
        if row_idx % 100000 == 0:
            print(f"Processed: {row_idx / float(raw_data.shape[0])}%")
            
    user_item_matrix = user_item_matrix.tocsr()
    print("done")
    save_npz("../data/processed/ratings_sparse_mat.npz", user_item_matrix)

## Load ratings matrix

In [16]:
user_item_matrix = load_npz("../data/processed/ratings_sparse_mat.npz")

In [17]:
print("done")

done


# Modelling phase

## Train test split

In [18]:
raw_data.columns

Index(['item_id', 'user_id', 'rating', 'timestamp', 'model_attr', 'category',
       'brand', 'year', 'user_attr', 'split'],
      dtype='object')

In [19]:
train_ids, test_ids = train_test_split(raw_data, stratify=raw_data.loc[:,["model_attr", "user_attr"]], test_size=0.3)

## Experimental setup

In [27]:
def generator_ratings_features(ratings, features, mask, normalize=False, batch_size=64):
    while True:
        ratings, mask = shuffle(ratings, mask)
        for i in range(ratings.shape[0] // batch_size + 1):
            upper = min((i+1)*batch_size, ratings.shape[0])
            r = ratings[i*batch_size:upper].toarray()
            f = features[i * batch_size : upper]
            m = mask[i*batch_size:upper].toarray()
            if normalize:
                #r = r - mu * m
                r = r * m
            yield [r, f], r
            
def generator_ratings(ratings,mask, normalize=False, batch_size=64):
    while True:
        ratings, mask = shuffle(ratings, mask)
        print("shuffling the data")
        for i in range(ratings.shape[0] // batch_size + 1):
            upper = min((i+1)*batch_size, ratings.shape[0])
            r = ratings[i*batch_size:upper].toarray()
            m = mask[i*batch_size:upper].toarray()
            if normalize:
                #r = r - mu * m
                r = r * m
            yield r, r

In [28]:
def mse_masked(y_true, y_pred):
    mask = tf.cast(tf.not_equal(y_true, 0), dtype='float32')
    #y_true = y_true + mu * mask
    #y_pred = y_pred + mu * mask
    y_true = y_true * mask
    y_pred = y_pred * mask
    diff = y_pred - y_true
    sqdiff = diff * diff * mask
    sse = tf.reduce_sum(tf.reduce_sum(sqdiff))
    n = tf.reduce_sum(tf.reduce_sum(mask))
    return sse / n

def mspe_masked(y_true, y_pred):
    mask = tf.cast(tf.not_equal(y_true, 0), dtype='float32')
    mape = tf.keras.losses.MeanAbsolutePercentageError()
    return mape(y_true * mask, y_pred * mask)

def mae_masked(y_true, y_pred):
    mask = tf.cast(tf.not_equal(y_true, 0), dtype='float32')
    #y_true = y_true + mu * mask
    #y_pred = y_pred + mu * mask
    #y_true = y_true * mask
    #y_pred = y_pred * mask
    mape = tf.keras.losses.MeanAbsoluteError()
    return mape(y_true * mask, y_pred * mask)


## Collaborative filtering

Deep collaborative filtering implementation can be inspired by the following tutorial:

https://medium.com/@jdwittenauer/deep-learning-with-keras-recommender-systems-e7b99cb29929

In [60]:
def build_collaborative_filtering(n_users, n_movies, n_factors):
    user = tf.keras.layers.Input(shape=(1,))
    u = tf.keras.layers.Embedding(n_users, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=tf.keras.regularizers.l2(1e-6))(user)
    u =  tf.keras.layers.Reshape((n_factors,))(u)
    
    movie =  tf.keras.layers.Input(shape=(1,))
    m =  tf.keras.layers.Embedding(n_movies, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=tf.keras.regularizers.l2(1e-6))(movie)
    m =  tf.keras.layers.Reshape((n_factors,))(m)
    
    x =  tf.keras.layers.Dot(axes=1)([u, m])
    model =  tf.keras.models.Model(inputs=[user, movie], outputs=x)
    opt = tf.keras.optimizers.Adam(lr=0.001)
    model.compile(loss=[mse_masked], optimizer=opt)
    return model

In [61]:
nusers = raw_data.user_id.nunique()
nitems = raw_data.item_id.nunique()
factors = 128
cfi = build_collaborative_filtering(nusers, nitems, factors)

In [None]:
train_mask = (user_item_matrix > 0.0) * 1.0
batch_size = 64
steps_per_epoch = user_item_matrix.shape[0] // batch_size

In [None]:
results = deepautorec.fit_generator(generator_ratings_features(user_item_matrix, features_matrix, train_mask, batch_size), epochs=2, steps_per_epoch=500)

## Autoencoders

In [36]:
def buildautorec_single_input(X_shape):
    inp = tf.keras.layers.Input(shape=(X_shape, ))
    drop1 = tf.keras.layers.Dropout(rate=0.2)(inp)
    enc = tf.keras.layers.Dense(X_shape // 8, activation='relu')(drop1)
    drop2 = tf.keras.layers.Dropout(rate=0.2)(enc)
    out = tf.keras.layers.Dense(X_shape, activation='relu')(drop2)
    model = tf.keras.models.Model(inputs=inp, outputs=out)
    optimizer = tf.keras.optimizers.Adam()
    model.compile(optimizer=optimizer, loss=[mse_masked], metrics=[mse_masked, mspe_masked, mae_masked])
    
    print(model.summary())
    return model

In [29]:
def build_deep_autorec_single_input(X_shape):
    inp = tf.keras.layers.Input(shape=(X_shape, ))
    drop1 = tf.keras.layers.Dropout(rate=0.2)(inp)
    enc1 = tf.keras.layers.Dense(X_shape // 4, activation='tanh')(drop1)
    drop2 = tf.keras.layers.Dropout(rate=0.2)(enc1)
    enc2 = tf.keras.layers.Dense(X_shape // 8, activation='tanh')(drop2)
    drop3 = tf.keras.layers.Dropout(rate=0.2)(enc2)
    dec1 = tf.keras.layers.Dense(X_shape // 4, activation='tanh')(drop3)
    drop4 = tf.keras.layers.Dropout(rate=0.2)(dec1)
    out = tf.keras.layers.Dense(X_shape, activation='relu')(drop4)
    model = tf.keras.models.Model(inputs=inp, outputs=out)
    optimizer = tf.keras.optimizers.Adam()
    model.compile(optimizer=optimizer, loss=[mse_masked], metrics=[mse_masked, mspe_masked, mae_masked])
    
    print(model.summary())
    return model

In [46]:
def build_autorec_multi_input(X_shape, F_shape):
    inp1 = tf.keras.layers.Input(shape=(X_shape, ))
    inp2 = tf.keras.layers.Input(shape=(F_shape, ))
    concat = tf.keras.layers.Concatenate()
    combined = concat([inp1, inp2])
    drop1 = tf.keras.layers.Dropout(rate=0.2)(combined)
    enc1 = tf.keras.layers.Dense(X_shape // 4, activation='tanh')(drop1)
    drop2 = tf.keras.layers.Dropout(rate=0.2)(enc1)
    enc2 = tf.keras.layers.Dense(X_shape // 8, activation='tanh')(drop2)
    drop3 = tf.keras.layers.Dropout(rate=0.2)(enc2)
    dec1 = tf.keras.layers.Dense(X_shape // 4, activation='tanh')(drop3)
    drop4 = tf.keras.layers.Dropout(rate=0.2)(dec1)
    out = tf.keras.layers.Dense(X_shape, activation='relu')(drop4)
    
    
    
    model = tf.keras.models.Model(inputs=[inp1, inp2], outputs=out)
    optimizer = tf.keras.optimizers.Adam()
    model.compile(optimizer=optimizer, loss=[mse_masked], metrics=[mse_masked, mspe_masked, mae_masked])
    
    print(model.summary())
    return model

In [47]:
deepautorec = build_autorec_multi_input(user_item_matrix.shape[1], features_matrix.shape[1])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 9560)         0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 57)           0                                            
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 9617)         0           input_8[0][0]                    
                                                                 input_9[0][0]                    
__________________________________________________________________________________________________
dropout_14 (Dropout)            (None, 9617)         0           concatenate_2[0][0]              
__________

In [48]:
train_mask = (user_item_matrix > 0.0) * 1.0

In [49]:
batch_size = 64
steps_per_epoch = user_item_matrix.shape[0] // batch_size

In [50]:
results = deepautorec.fit_generator(generator_ratings_features(user_item_matrix, features_matrix, train_mask, batch_size), epochs=2, steps_per_epoch=500)

Epoch 1/2

KeyboardInterrupt: 

In [34]:
results.history

{'loss': [11.905031005859374, 19.175671463012694],
 'mse_masked': [11.905032, 19.175665],
 'mspe_masked': [0.009350273, 0.0119173955],
 'mae_masked': [0.00034039113, 0.00043758715]}