# Intro

This notebook implements an experiment that evaluates the accuracy of the Deep Hybrid AutoencodeR Recommendation ENgine (DHARREN) on a dataset used in a [recent publication](https://github.com/MengtingWan/marketBias).

# Technical prep

In [1]:
import pandas as pd
import io
import requests
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K

from scipy.sparse import lil_matrix, save_npz, load_npz
from sklearn.preprocessing import OneHotEncoder


from tqdm.autonotebook import tqdm
from tqdm import trange



# Data prep

In [2]:
# Download the electronics ratings CSV from the marketBias repository.
url="https://raw.githubusercontent.com/MengtingWan/marketBias/master/data/df_electronics.csv"
# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() turns an HTTP error into an exception instead of passing
# an error page on to the CSV parser.
response = requests.get(url, timeout=60)
response.raise_for_status()
s = response.content

In [3]:
# Decode the downloaded bytes as UTF-8 and parse them as comma-separated values.
csv_text = s.decode('utf-8')
raw_data = pd.read_csv(io.StringIO(csv_text), sep=",")

In [4]:
# Preview the first three rows of the loaded data.
raw_data.head(3)

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,0,0,5.0,1999-06-13,Female,Portable Audio & Video,,1999,,0
1,0,1,5.0,1999-06-14,Female,Portable Audio & Video,,1999,,0
2,0,2,3.0,1999-06-17,Female,Portable Audio & Video,,1999,,0


In [5]:
# Inspect the distinct brand values; missing brands show up as NaN.
raw_data.brand.unique()

array([nan, 'HP', 'Philips', 'Polaroid', 'Panasonic', 'JVC', 'Fujifilm',
       'Nikon', 'Kodak', 'Sony', 'Canon', 'Kensington', 'Pyle', 'Olympus',
       'Toshiba', 'Logitech', 'Etre Jeune', 'Linksys', 'Vivitar',
       'Sennheiser', 'Apple', 'Samsung', 'EldHus', 'Bose', 'Archos',
       'Garmin', 'Jabra', 'Gary Fong', 'ViewSonic', 'Savage', 'Uniden',
       'ebasy', 'Generic', 'JLAB', 'Skullcandy', 'TaoTronics', 'Neewer',
       'Koolertron', 'DURAGADGET', 'iRULU', 'Tiamat', 'DBPOWER', 'Fintie',
       'Plemo', 'EINCAR', 'Cooper Cases', 'LSS', 'Mpow', 'XShields',
       'IRULU', 'Funlux'], dtype=object)

In [6]:
# Replace missing categorical values with an explicit "missing" label.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on an attribute-accessed column: that chained in-place
# form is not guaranteed to write through to raw_data (it is a no-op under
# pandas Copy-on-Write).
fill_columns = ['model_attr', 'user_attr', 'brand']
raw_data[fill_columns] = raw_data[fill_columns].fillna("missing")

In [7]:
# Largest user id in the data; used below to build the expected id range
# and to size the ratings matrix.
max_uid = raw_data.user_id.max()
max_uid

1157632

Test whether the data contain consecutive ID numbers

In [8]:
# Check that the user ids, in order of first appearance, are exactly 0..max_uid.
unique_uid = pd.Series(raw_data.user_id.unique())
expected_ids = pd.Series(range(max_uid + 1))

# pd.util.testing was deprecated and later removed from pandas;
# pd.testing is the public location of the assertion helpers.
pd.testing.assert_series_equal(unique_uid, expected_ids)

In [9]:
# Check that the item ids, in order of first appearance, are exactly 0..max_item_id.
max_item_id = raw_data.item_id.max()
unique_iids = pd.Series(raw_data.item_id.unique())
expected_item_ids = pd.Series(range(max_item_id+1))

# pd.util.testing was deprecated and later removed from pandas;
# pd.testing is the public location of the assertion helpers.
pd.testing.assert_series_equal(expected_item_ids, unique_iids)

In [10]:
# Categorical columns that are one-hot encoded in the next step.
feature_columns = ['user_attr', 'model_attr', 'brand']

In [11]:
# One-hot encode the categorical features into sparse columns, then attach
# the user and item ids so each encoded row can be traced back to its rating.
encoded_data = pd.get_dummies(raw_data[feature_columns], sparse=True)
for id_column in ('user_id', 'item_id'):
    encoded_data[id_column] = raw_data[id_column].values

In [12]:
# Put both frames into the same (user_id, item_id) order, sorting each in place.
sort_keys = ['user_id', 'item_id']
for frame in (raw_data, encoded_data):
    frame.sort_values(by=sort_keys, ascending=True, inplace=True)

## Recreate ratings matrix if necessary

In [13]:
# Set to True to rebuild the sparse ratings matrix and overwrite the saved
# .npz file; when False the rebuild is skipped and the saved file is loaded.
recreate_matrix = False

In [14]:
if recreate_matrix:
    # Fill the user x item ratings matrix in LIL format, which supports cheap
    # element-wise writes, then convert to CSR before saving.
    # NOTE(review): ratings are stored as int8, so any fractional rating would
    # lose its fractional part -- confirm ratings are whole numbers.
    user_item_matrix = lil_matrix((max_uid+1, max_item_id+1), dtype=np.int8)

    n_rows = raw_data.shape[0]
    # itertuples over only the needed columns avoids building a Series per row.
    triples = raw_data[['user_id', 'item_id', 'rating']].itertuples(index=False, name=None)
    for processed, (uidx, iidx, rating) in enumerate(triples):
        user_item_matrix[uidx, iidx] = rating
        # Progress is based on rows processed (not the index label, which is
        # no longer sequential after sorting) and scaled to a real percentage.
        if processed % 100000 == 0:
            print(f"Processed: {100.0 * processed / n_rows:.1f}%")

    user_item_matrix = user_item_matrix.tocsr()
    print("done")
    save_npz("../data/processed/ratings_sparse_mat.npz", user_item_matrix)

## Load ratings matrix

In [15]:
# Load the sparse ratings matrix from the .npz file written by the rebuild step.
user_item_matrix = load_npz("../data/processed/ratings_sparse_mat.npz")

In [17]:
# Print a completion marker.
print("done")

done


# Model prep

## Deep autorec single input

In [18]:
def custom_loss(y_true, y_pred):
    """Masked mean squared error over observed (non-zero) ratings.

    Entries where ``y_true`` equals 0 are treated as unobserved: they add
    nothing to the squared-error sum or to the normalising count.

    Args:
        y_true: Tensor of target ratings; 0 marks a missing rating.
        y_pred: Tensor of predictions with the same shape as ``y_true``.

    Returns:
        Scalar tensor: the sum of squared errors on observed entries divided
        by the number of observed entries, or 0 when no entry is observed.
    """
    mask = tf.cast(tf.not_equal(y_true, 0), dtype='float32')
    # The earlier version added the same ``mu * mask`` offset to both tensors.
    # The offset cancels in the difference, and ``mu`` is not defined in the
    # visible notebook, so the shift is dropped to remove the hidden global.
    diff = y_pred - y_true
    sqdiff = diff * diff * mask
    sse = tf.reduce_sum(sqdiff)
    n = tf.reduce_sum(mask)
    # divide_no_nan returns 0 instead of NaN when a batch has no observed ratings.
    return tf.math.divide_no_nan(sse, n)

def custom_perc_loss(y_true, y_pred):
    """Mean absolute percentage error with unobserved entries zeroed out.

    Positions where ``y_true`` is 0 are multiplied by 0 in both tensors
    before the Keras ``MeanAbsolutePercentageError`` loss is applied.

    Args:
        y_true: Tensor of target ratings; 0 marks a missing rating.
        y_pred: Tensor of predictions with the same shape as ``y_true``.

    Returns:
        The loss value produced by ``MeanAbsolutePercentageError`` on the
        masked tensors.
    """
    observed = tf.cast(tf.not_equal(y_true, 0), dtype='float32')
    masked_true = y_true * observed
    masked_pred = y_pred * observed
    loss_fn = tf.keras.losses.MeanAbsolutePercentageError()
    return loss_fn(masked_true, masked_pred)

In [None]:
# Minimal functional-API autoencoder: one tanh bottleneck layer followed by a
# ReLU output layer as wide as the input (one unit per ratings-matrix column).
X_shape = user_item_matrix.shape[1]
inp = tf.keras.layers.Input(shape=(X_shape, ))
#drop1 = tf.keras.layers.Dropout(rate=0.2)(inp)
enc1 = tf.keras.layers.Dense(X_shape // 4, activation='tanh')(inp)
#drop2 = tf.keras.layers.Dropout(rate=0.2)(enc1)
# Dense lives in tf.keras.layers; tf.keras.Dense does not exist.
out = tf.keras.layers.Dense(X_shape, activation='relu')(enc1)
model = tf.keras.models.Model(inputs=inp, outputs=out)
optimizer = tf.keras.optimizers.Adam()
#model.compile(optimizer=optimizer, loss='mse', metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.MeanSquaredError()])

In [20]:
def build_autorec_single_input(X_shape, lr=0.001):
    """Build and compile the single-input deep autoencoder recommender.

    The network is a ``Sequential`` stack:
    Dense(X_shape // 4, tanh) -> Dropout(0.2) -> Dense(X_shape // 8, tanh) ->
    Dropout(0.2) -> Dense(X_shape // 4, tanh) -> Dropout(0.2) ->
    Dense(X_shape, relu).
    It is compiled with the Adam optimizer and ``custom_loss``, and tracks
    mean absolute error and mean squared error as metrics.

    Args:
        X_shape: Width of the input and output layers.
        lr: Learning rate for the Adam optimizer. Defaults to 0.001.

    Returns:
        The compiled Keras model.
    """
    enc1 = tf.keras.layers.Dense(X_shape // 4, activation='tanh', input_shape=(X_shape, ))
    drop1 = tf.keras.layers.Dropout(rate=0.2)
    enc2 = tf.keras.layers.Dense(X_shape // 8, activation='tanh')
    drop2 = tf.keras.layers.Dropout(rate=0.2)
    dec1 = tf.keras.layers.Dense(X_shape // 4, activation='tanh')
    drop3 = tf.keras.layers.Dropout(rate=0.2)
    out = tf.keras.layers.Dense(X_shape, activation='relu')

    model = tf.keras.models.Sequential([enc1, drop1, enc2, drop2, dec1, drop3, out])
    # The learning rate is a parameter: this is a plain function, so the
    # previous ``self.lr`` reference raised a NameError.
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss=[custom_loss], metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.MeanSquaredError()])
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [None]:
# Build the autoencoder, passing the number of columns of the ratings matrix as its width.
deepautorec = build_autorec_single_input(user_item_matrix.shape[1])