# Tutorial DeepCTR-torch

MAN 3160 - Sistemas Recomendadores

## Importar Librerías

In [None]:
# Instalamos librerías para descarcar y descomprimir archivos.

!pip install deepctr-torch
!pip install tensorflow

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM

  from .autonotebook import tqdm as notebook_tqdm


## Entrenamiento de agente

In [2]:
dir_train = 'ml-100k'

# Generamos los títulos de las columnas del archivo items.

columns = ['itemid', 'title', 'release_date', 'video_release_date', \
           'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', \
           'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', \
           'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', \
           'Thriller', 'War', 'Western']

In [3]:
# Primero creamos el dataframe con los datos
df = pd.read_csv(f'{dir_train}/u.data',
                         sep='\t',
                         names=['userid', 'itemid', 'rating', 'timestamp'],
                         header=None)

In [4]:
# Cargamos el dataset con los items
df_items = pd.read_csv(f'{dir_train}/u.item',
                        sep='|',
                        index_col=0,
                        names = columns,
                        header=None,
                        encoding='latin-1')

In [5]:
columns_user = ['userid', 'age', 'gender', 'occupation', 'zip_code']

In [6]:
df_users = pd.read_csv(f'{dir_train}/u.user',
                        sep='|',
                        index_col=0,
                        names = columns_user,
                        header=None,
                        encoding='latin-1')

In [7]:
df

Unnamed: 0,userid,itemid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [8]:
df_users = df_users.reset_index()

In [9]:
df_items = df_items.reset_index()

In [10]:
df_total = pd.merge(df, df_users, on='userid')

In [11]:
df_total

Unnamed: 0,userid,itemid,rating,timestamp,age,gender,occupation,zip_code
0,196,242,3,881250949,49,M,writer,55105
1,196,393,4,881251863,49,M,writer,55105
2,196,381,4,881251728,49,M,writer,55105
3,196,251,3,881251274,49,M,writer,55105
4,196,655,5,881251793,49,M,writer,55105
...,...,...,...,...,...,...,...,...
99995,941,919,5,875048887,20,M,student,97229
99996,941,273,3,875049038,20,M,student,97229
99997,941,1,5,875049144,20,M,student,97229
99998,941,294,4,875048532,20,M,student,97229


In [12]:
df_total = pd.merge(df_total, df_items, on='itemid')

In [13]:
df_total

Unnamed: 0,userid,itemid,rating,timestamp,age,gender,occupation,zip_code,title,release_date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,49,M,writer,55105,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
1,305,242,5,886307828,23,M,programmer,94086,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
2,6,242,4,883268170,42,M,executive,98101,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
3,234,242,4,891033261,60,M,retired,94702,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
4,63,242,3,875747190,31,M,marketing,75240,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,863,1679,3,889289491,17,M,student,60089,B. Monkey (1998),06-Feb-1998,...,0,0,0,0,0,1,0,1,0,0
99996,863,1678,1,889289570,17,M,student,60089,Mat' i syn (1997),06-Feb-1998,...,0,0,0,0,0,0,0,0,0,0
99997,863,1680,2,889289570,17,M,student,60089,Sliding Doors (1998),01-Jan-1998,...,0,0,0,0,0,1,0,0,0,0
99998,896,1681,3,887160722,28,M,writer,91505,You So Crazy (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df_total['datetime'] = pd.to_datetime(df_total['timestamp'], unit='s')  # Convert the column to datetime format if it's not

# Create new columns
df_total['Year'] = df_total['datetime'].dt.year
df_total['Month'] = df_total['datetime'].dt.month
df_total['Weekday'] = df_total['datetime'].dt.weekday  # Monday=0, Sunday=6
df_total['Time'] = df_total['datetime'].dt.time
df_total['Hour'] = df_total['datetime'].dt.hour
print(df_total)

       userid  itemid  rating  timestamp  age gender  occupation zip_code  \
0         196     242       3  881250949   49      M      writer    55105   
1         305     242       5  886307828   23      M  programmer    94086   
2           6     242       4  883268170   42      M   executive    98101   
3         234     242       4  891033261   60      M     retired    94702   
4          63     242       3  875747190   31      M   marketing    75240   
...       ...     ...     ...        ...  ...    ...         ...      ...   
99995     863    1679       3  889289491   17      M     student    60089   
99996     863    1678       1  889289570   17      M     student    60089   
99997     863    1680       2  889289570   17      M     student    60089   
99998     896    1681       3  887160722   28      M      writer    91505   
99999     916    1682       3  880845755   27      M    engineer    N2L5N   

                                           title release_date  ...  Sci-Fi 

In [15]:
def definir_bloque_horario(hour):
    if 6 <= hour < 12:
        return 'Mañana'
    elif 12 <= hour < 19:
        return 'Tarde'
    else:
        return 'Noche'

In [16]:
df_total['Horario'] = df_total['Hour'].apply(definir_bloque_horario)

In [17]:
df_total[['userid', 'itemid', 'rating', 'Horario']]

Unnamed: 0,userid,itemid,rating,Horario
0,196,242,3,Tarde
1,305,242,5,Noche
2,6,242,4,Noche
3,234,242,4,Noche
4,63,242,3,Noche
...,...,...,...,...
99995,863,1679,3,Tarde
99996,863,1678,1,Tarde
99997,863,1680,2,Tarde
99998,896,1681,3,Noche


In [19]:
sparse_features = ["userid", "itemid", "gender", "age", "occupation", "Horario"]
target = ['rating']

In [20]:
for feat in sparse_features:
    lbe = LabelEncoder()
    df_total[feat] = lbe.fit_transform(df_total[feat])

In [21]:
df_total

Unnamed: 0,userid,itemid,rating,timestamp,age,gender,occupation,zip_code,title,release_date,...,Thriller,War,Western,datetime,Year,Month,Weekday,Time,Hour,Horario
0,195,241,3,881250949,39,1,20,55105,Kolya (1996),24-Jan-1997,...,0,0,0,1997-12-04 15:55:49,1997,12,3,15:55:49,15,2
1,304,241,5,886307828,13,1,14,94086,Kolya (1996),24-Jan-1997,...,0,0,0,1998-02-01 04:37:08,1998,2,6,04:37:08,4,1
2,5,241,4,883268170,32,1,6,98101,Kolya (1996),24-Jan-1997,...,0,0,0,1997-12-28 00:16:10,1997,12,6,00:16:10,0,1
3,233,241,4,891033261,50,1,15,94702,Kolya (1996),24-Jan-1997,...,0,0,0,1998-03-27 21:14:21,1998,3,4,21:14:21,21,1
4,62,241,3,875747190,21,1,11,75240,Kolya (1996),24-Jan-1997,...,0,0,0,1997-10-01 23:06:30,1997,10,2,23:06:30,23,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,862,1678,3,889289491,7,1,18,60089,B. Monkey (1998),06-Feb-1998,...,1,0,0,1998-03-07 16:51:31,1998,3,5,16:51:31,16,2
99996,862,1677,1,889289570,7,1,18,60089,Mat' i syn (1997),06-Feb-1998,...,0,0,0,1998-03-07 16:52:50,1998,3,5,16:52:50,16,2
99997,862,1679,2,889289570,7,1,18,60089,Sliding Doors (1998),01-Jan-1998,...,0,0,0,1998-03-07 16:52:50,1998,3,5,16:52:50,16,2
99998,895,1680,3,887160722,18,1,20,91505,You So Crazy (1994),01-Jan-1994,...,0,0,0,1998-02-11 01:32:02,1998,2,2,01:32:02,1,1


In [22]:
def combine_genres(row):
    genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', \
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    return '|'.join([col for col, val in row.items() if val == 1 and col in genres])

In [23]:
df_total['genres'] = df_total.apply(combine_genres, axis=1)

In [24]:
df_total = df_total[['userid', 'itemid', 'rating', 'timestamp', 'age', 'gender', 'occupation', 'Horario', 'genres']]

In [25]:
df_total

Unnamed: 0,userid,itemid,rating,timestamp,age,gender,occupation,Horario,genres
0,195,241,3,881250949,39,1,20,2,Comedy
1,304,241,5,886307828,13,1,14,1,Comedy
2,5,241,4,883268170,32,1,6,1,Comedy
3,233,241,4,891033261,50,1,15,1,Comedy
4,62,241,3,875747190,21,1,11,1,Comedy
...,...,...,...,...,...,...,...,...,...
99995,862,1678,3,889289491,7,1,18,2,Romance|Thriller
99996,862,1677,1,889289570,7,1,18,2,Drama
99997,862,1679,2,889289570,7,1,18,2,Drama|Romance
99998,895,1680,3,887160722,18,1,20,1,Comedy


In [26]:
def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))

In [27]:
key2index = {}
genres_list = list(map(split, df_total['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)

In [28]:
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

In [29]:
genres_list

array([[1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       ...,
       [4, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [4, 0, 0, 0, 0, 0]])

In [30]:
fixlen_feature_columns = [SparseFeat(feat, df_total[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]

In [32]:
varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
        key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean')]  # Notice : value 0 is for padding for sequence input feature

In [33]:
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

In [34]:
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [35]:
feature_names

['userid', 'itemid', 'gender', 'age', 'occupation', 'Horario', 'genres']

In [36]:
train, test = train_test_split(df_total, test_size=0.25)

In [38]:
train

Unnamed: 0,userid,itemid,rating,timestamp,age,gender,occupation,Horario,genres
7257,434,469,2,884131661,14,1,4,1,Western
30089,410,180,5,892845605,24,1,3,1,Action|Adventure|Romance|Sci-Fi|War
83742,726,754,2,883712828,15,1,18,1,Action|Adventure|Children|Fantasy|Sci-Fi
24023,939,357,1,884801227,22,1,0,2,Action|Adventure|Sci-Fi|Thriller
42368,534,949,3,879618019,35,0,3,2,Drama
...,...,...,...,...,...,...,...,...,...
49027,847,196,5,887038021,36,1,4,2,Drama|Romance
68819,888,189,3,880177994,14,1,19,1,Drama|War
33063,901,126,3,879464726,35,0,1,1,Action|Crime|Drama
50413,325,0,3,879876159,31,1,0,2,Animation|Children|Comedy


In [37]:
train_index = train.index.tolist()
test_index = test.index.tolist()

In [40]:
train_index[:10]

[7257, 30089, 83742, 24023, 42368, 36431, 23717, 40085, 17293, 80631]

In [41]:
genres_list_train = genres_list[train_index]
genres_list_test = genres_list[test_index]

In [42]:
genres_list_train

array([[15,  0,  0,  0,  0,  0],
       [ 7,  3,  2,  8,  5,  0],
       [ 7,  3,  6,  9,  8,  0],
       ...,
       [ 7, 11,  4,  0,  0,  0],
       [16,  6,  1,  0,  0,  0],
       [ 1,  9,  2,  8,  0,  0]])

In [43]:
train_model_input = {name: train[name] for name in sparse_features}
train_model_input["genres"] = genres_list_train
test_model_input = {name: test[name] for name in sparse_features}
test_model_input["genres"] = genres_list_test

In [44]:
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)

model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=5, verbose=2, validation_split=0.2)

cpu
Train on 60000 samples, validate on 15000 samples, 235 steps per epoch
Epoch 1/5
6s - loss:  2.2523 - mse:  2.2489 - val_mse:  0.9097
Epoch 2/5
6s - loss:  0.8842 - mse:  0.8840 - val_mse:  0.8890
Epoch 3/5
6s - loss:  0.8641 - mse:  0.8643 - val_mse:  0.8875
Epoch 4/5
6s - loss:  0.8579 - mse:  0.8581 - val_mse:  0.8909
Epoch 5/5
6s - loss:  0.8546 - mse:  0.8543 - val_mse:  0.8902


## Evaluación

In [45]:
pred_ans = model.predict(test_model_input, 256)

In [46]:
print("test MSE", round(mean_squared_error(
        test[target].values, pred_ans), 4))

test MSE 0.8963


### Material adicional

* [Documentación DeepCTR-Torch](https://deepctr-torch.readthedocs.io/en/latest/index.html)
* [Ejemplos de documentación](https://deepctr-torch.readthedocs.io/en/latest/Examples.html)