# User Encoder - VAE

In [1]:
import numpy as np
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
from scipy import sparse
%matplotlib inline

sns.set_theme(style="ticks")

## Dataset

In [2]:
df_trip = pd.read_csv('../output/booking/dataset/train_0.1_10.csv',parse_dates=['start_trip', 'last_checkin'])
df_trip['start_trip_month'] = df_trip['start_trip'].dt.month
df_trip['last_checkin_month'] = df_trip['last_checkin'].dt.month


df_trip.head()

Unnamed: 0,utrip_id,user_id,user_features,count_unique_city,trip_size,start_trip,end_trip,checkin_list,checkout_list,days_since_2016_list,...,last_booker_country,last_step,first_city_id,first_hotel_country,last_city_id,last_hotel_country,country_count,duration_sum,start_trip_month,last_checkin_month
0,1000027_1,1000027,"[-0.2721548080444336, -0.3261375427246094, -0....",4,3,2016-08-13,2016-08-18,"['0', '0', '0', '0', '0', '0', '2016-08-13', '...","['0', '0', '0', '0', '0', '0', '2016-08-14', '...","[0, 0, 0, 0, 0, 0, 225, 226, 228, 230]",...,Elbonia,4,8183,Gondal,30628,Gondal,10,8,8,8
1,1000033_1,1000033,"[-0.653695285320282, 0.9078158736228943, 0.579...",4,4,2016-04-09,2016-04-16,"['0', '0', '0', '0', '0', '2016-04-09', '2016-...","['0', '0', '0', '0', '0', '2016-04-11', '2016-...","[0, 0, 0, 0, 0, 99, 101, 102, 104, 106]",...,Gondal,5,38677,Cobra Island,38677,Cobra Island,122,10,4,4
2,1000045_1,1000045,"[-1.1103595495224, -1.2900782823562622, -0.307...",7,6,2016-06-18,2016-06-28,"['0', '0', '0', '2016-06-18', '2016-06-20', '2...","['0', '0', '0', '2016-06-20', '2016-06-22', '2...","[0, 0, 0, 169, 171, 173, 175, 176, 177, 179]",...,The Devilfire Empire,7,64876,Fook Island,36063,Gondal,143,11,6,6
3,1000083_1,1000083,"[1.3209058046340942, 0.19926407933235168, 0.57...",4,3,2016-06-13,2016-06-16,"['0', '0', '0', '0', '0', '0', '2016-06-13', '...","['0', '0', '0', '0', '0', '0', '2016-06-14', '...","[0, 0, 0, 0, 0, 0, 164, 165, 166, 167]",...,The Devilfire Empire,4,55990,Osterlich,36063,Gondal,122,5,6,6
4,100008_1,100008,"[-0.346758633852005, 0.11678697913885117, -1.0...",5,4,2016-07-18,2016-07-25,"['0', '0', '0', '0', '0', '2016-07-18', '2016-...","['0', '0', '0', '0', '0', '2016-07-21', '2016-...","[0, 0, 0, 0, 0, 199, 202, 203, 204, 206]",...,Gondal,5,11306,Kamistan,65690,Kamistan,31,9,7,7


## Geral User Features

In [3]:

df_trip.iloc[0]

utrip_id                                                        1000027_1
user_id                                                           1000027
user_features           [-0.2721548080444336, -0.3261375427246094, -0....
count_unique_city                                                       4
trip_size                                                               3
start_trip                                            2016-08-13 00:00:00
end_trip                                                       2016-08-18
checkin_list            ['0', '0', '0', '0', '0', '0', '2016-08-13', '...
checkout_list           ['0', '0', '0', '0', '0', '0', '2016-08-14', '...
days_since_2016_list               [0, 0, 0, 0, 0, 0, 225, 226, 228, 230]
duration_list                              [0, 0, 0, 0, 0, 0, 1, 2, 2, 3]
city_id_list            ['0', '0', '0', '0', '0', '0', '8183', '15626'...
device_class_list       ['0', '0', '0', '0', '0', '0', 'desktop', 'des...
affiliate_id_list       ['0', '0', '0'

In [4]:
import scipy

def mode(x):
    return scipy.stats.mode(x)[0]

def group_by_trip(df_trip):

    df_user = df_trip.groupby(['user_id']).agg(
        trip_count=('count_unique_city', 'count'),
        count_unique_city=('count_unique_city', 'sum'),
        mean_trip_size=('trip_size', 'mean'),
        mode_last_checkin_month=('last_checkin_month', mode),
        mode_device_class=('last_device_class', mode),
        mode_trip_month=('start_trip_month', mode),
        mode_city_id=('first_city_id', mode),
        mode_affiliate_id=('last_affiliate_id', mode),
        mode_booker_country=('last_booker_country', mode),
        country_count=('country_count', 'sum'),
        mean_duration_sum=('duration_sum', 'mean'),
        sum_duration_sum=('duration_sum', 'sum'),
    )
    
    df_user             = df_user.reset_index()
    df_user['user_id']  = df_user['user_id'].astype('int')

    for c in ['mode_last_checkin_month', 'mode_device_class', 'mode_trip_month',
    'mode_city_id', 'mode_affiliate_id', 'mode_booker_country']:
        df_user[c]  = df_user[c].astype('str')
        
    return df_user

df_user = group_by_trip(df_trip)
df_user.sort_index()

Unnamed: 0,user_id,trip_count,count_unique_city,mean_trip_size,mode_last_checkin_month,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,mode_booker_country,country_count,mean_duration_sum,sum_duration_sum
0,29,1,4,3.0,7,desktop,7,47054,8132,Elbonia,2,9.0,9
1,81,1,4,3.0,5,desktop,5,33665,9924,Elbonia,11,6.0,6
2,136,2,11,4.5,4,desktop,4,38793,9924,The Devilfire Empire,137,7.0,14
3,156,1,7,7.0,8,mobile,8,2748,359,Gondal,3,9.0,9
4,160,1,4,3.0,8,tablet,7,17590,10332,Gondal,2,18.0,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
171300,6257979,1,4,3.0,8,desktop,8,42560,10332,Gondal,2,5.0,5
171301,6258017,1,7,7.0,3,mobile,2,13356,359,The Devilfire Empire,157,16.0,16
171302,6258041,1,3,3.0,5,mobile,4,57109,9452,Elbonia,19,4.0,4
171303,6258065,1,4,3.0,4,mobile,4,59444,9452,Gondal,4,8.0,8


In [5]:
#df_user_all.to_csv('../output/booking/dataset/user_features_raw.csv')

## Transform Dataset

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [7]:
df = df_user.set_index('user_id')
df.head()

Unnamed: 0_level_0,trip_count,count_unique_city,mean_trip_size,mode_last_checkin_month,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,mode_booker_country,country_count,mean_duration_sum,sum_duration_sum
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
29,1,4,3.0,7,desktop,7,47054,8132,Elbonia,2,9.0,9
81,1,4,3.0,5,desktop,5,33665,9924,Elbonia,11,6.0,6
136,2,11,4.5,4,desktop,4,38793,9924,The Devilfire Empire,137,7.0,14
156,1,7,7.0,8,mobile,8,2748,359,Gondal,3,9.0,9
160,1,4,3.0,8,tablet,7,17590,10332,Gondal,2,18.0,18


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171305 entries, 29 to 6258087
Data columns (total 12 columns):
trip_count                 171305 non-null int64
count_unique_city          171305 non-null int64
mean_trip_size             171305 non-null float64
mode_last_checkin_month    171305 non-null object
mode_device_class          171305 non-null object
mode_trip_month            171305 non-null object
mode_city_id               171305 non-null object
mode_affiliate_id          171305 non-null object
mode_booker_country        171305 non-null object
country_count              171305 non-null int64
mean_duration_sum          171305 non-null float64
sum_duration_sum           171305 non-null int64
dtypes: float64(2), int64(4), object(6)
memory usage: 17.0+ MB


In [9]:
df.select_dtypes(include=['int64', 'float64']).columns

Index(['trip_count', 'count_unique_city', 'mean_trip_size', 'country_count',
       'mean_duration_sum', 'sum_duration_sum'],
      dtype='object')

In [10]:
# determine categorical and numerical features
numerical_ix   = df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = df.select_dtypes(include=['object', 'bool']).columns

In [11]:
df[categorical_ix]

Unnamed: 0_level_0,mode_last_checkin_month,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,mode_booker_country
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
29,7,desktop,7,47054,8132,Elbonia
81,5,desktop,5,33665,9924,Elbonia
136,4,desktop,4,38793,9924,The Devilfire Empire
156,8,mobile,8,2748,359,Gondal
160,8,tablet,7,17590,10332,Gondal
...,...,...,...,...,...,...
6257979,8,desktop,8,42560,10332,Gondal
6258017,3,mobile,2,13356,359,The Devilfire Empire
6258041,5,mobile,4,57109,9452,Elbonia
6258065,4,mobile,4,59444,9452,Gondal


In [12]:
df[numerical_ix]

Unnamed: 0_level_0,trip_count,count_unique_city,mean_trip_size,country_count,mean_duration_sum,sum_duration_sum
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
29,1,4,3.0,2,9.0,9
81,1,4,3.0,11,6.0,6
136,2,11,4.5,137,7.0,14
156,1,7,7.0,3,9.0,9
160,1,4,3.0,2,18.0,18
...,...,...,...,...,...,...
6257979,1,4,3.0,2,5.0,5
6258017,1,7,7.0,157,16.0,16
6258041,1,3,3.0,19,4.0,4
6258065,1,4,3.0,4,8.0,8


In [13]:
# define the data preparation for the columns
t = [
     ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix), 
     ('num', StandardScaler(), numerical_ix)
    ]

col_transform = ColumnTransformer(transformers=t)
df_transform  = col_transform.fit_transform(df)
df_transform

<171305x19308 sparse matrix of type '<class 'numpy.float64'>'
	with 2055660 stored elements in Compressed Sparse Row format>

In [14]:
df_transform.shape

(171305, 19308)

In [15]:
# from scipy import sparse

# sparse.save_npz("../output/booking/dataset/train_user_features.npz", df_transform)
# df.reset_index().to_csv('../output/booking/dataset/train_user_features.csv', index=False)

## Train Autoencoder

In [16]:
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

seed = 42
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [43]:
batch_size   = 512
epochs       = 100
learning_rate = 1e-3

In [44]:
#pd.DataFrame(df_transform.todense())

In [45]:
# train_loader = torch.utils.data.DataLoader(
#     pd.DataFrame(df_transform.todense()), batch_size=batch_size, shuffle=True
# )
import torch.utils.data as D

# next(train_loader)
class Dataset(D.Dataset):
    
    def __init__(self, sparse):
        self.data = sparse

    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, index):
        # convert it to COO so to get the atributes to create a sparse tensor
        data = self.data[index].tocoo()
        i = torch.LongTensor(np.vstack((data.row, data.col)))
        v = torch.FloatTensor(data.data)
        data = torch.sparse.FloatTensor(i, v, torch.Size(data.shape))
        return data

d = Dataset(df_transform)
d.__getitem__([1])

tensor(indices=tensor([[    0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0],
                       [    7,    12,    22,  6856, 19278, 19298, 19302, 19303,
                        19304, 19305, 19306, 19307]]),
       values=tensor([ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
                      -0.2258, -0.3826, -0.8056, -0.3666, -0.7609, -0.7528]),
       size=(1, 19308), nnz=12, layout=torch.sparse_coo)

In [46]:
train_loader = torch.utils.data.DataLoader(d, batch_size=batch_size, num_workers=0)
len(d)

171305

In [47]:
class AE(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        
        self.encoder_hidden_layer = nn.Linear(
            in_features=kwargs["input_shape"], out_features=128
        )
        self.encoder_output_layer = nn.Linear(
            in_features=128, out_features=10
        )
        self.decoder_hidden_layer = nn.Linear(
            in_features=10, out_features=128
        )
        self.decoder_output_layer = nn.Linear(
            in_features=128, out_features=kwargs["input_shape"]
        )
        
        self.dropout = nn.Dropout(0.3)

    def encoder(self, x):
        x = self.encoder_hidden_layer(x)
        x = torch.selu(x)
        x = self.encoder_output_layer(x)
        #code = torch.relu(code)
        return x
    
    def decoder(self, x):
        x = self.decoder_hidden_layer(x)
        x = torch.selu(x)
        x = self.decoder_output_layer(x)
        #x = torch.relu(x)        
        return x
    def forward(self, x):
        x = self.decoder(self.dropout(self.encoder(x)))

        return x

In [48]:
#  use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# create a model from `AE` autoencoder class
# load it to the specified device, either gpu or cpu
model = AE(input_shape=df_transform.shape[1]).to(device)

# create an optimizer object
# Adam optimizer with learning rate 1e-3
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# mean-squared error loss
criterion = nn.MSELoss()

In [49]:
for epoch in range(epochs):
    loss = 0
    for batch_features in train_loader:
        # reshape mini-batch data to [N, 784] matrix
        # load it to the active device
        #batch_features #= batch_features.view(-1, df_transform.shape[1]).to(device)
        batch_features  = batch_features.to(device).to_dense()
        # reset the gradients back to zero
        # PyTorch accumulates gradients on subsequent backward passes
        optimizer.zero_grad()
        
        # compute reconstructions
        outputs = model(batch_features)

        # compute training reconstruction loss
        train_loss = criterion(outputs, batch_features)
        
        # compute accumulated gradients
        train_loss.backward()
        
        # perform parameter update based on current gradients
        optimizer.step()
        
        # add the mini-batch training loss to epoch loss
        loss += train_loss.item()
    
    # compute the epoch training loss
    loss = loss / len(train_loader)
    
    # display the epoch training loss
    print("epoch : {}/{}, recon loss = {:.8f}".format(epoch + 1, epochs, loss))

epoch : 1/100, recon loss = 0.00092350
epoch : 2/100, recon loss = 0.00048237
epoch : 3/100, recon loss = 0.00039003
epoch : 4/100, recon loss = 0.00032559
epoch : 5/100, recon loss = 0.00028720
epoch : 6/100, recon loss = 0.00027816
epoch : 7/100, recon loss = 0.00027167
epoch : 8/100, recon loss = 0.00026868
epoch : 9/100, recon loss = 0.00026658
epoch : 10/100, recon loss = 0.00026548
epoch : 11/100, recon loss = 0.00026527
epoch : 12/100, recon loss = 0.00026337
epoch : 13/100, recon loss = 0.00026239
epoch : 14/100, recon loss = 0.00026133
epoch : 15/100, recon loss = 0.00025965
epoch : 16/100, recon loss = 0.00025852
epoch : 17/100, recon loss = 0.00025852
epoch : 18/100, recon loss = 0.00025659
epoch : 19/100, recon loss = 0.00025614
epoch : 20/100, recon loss = 0.00025560
epoch : 21/100, recon loss = 0.00025534
epoch : 22/100, recon loss = 0.00025469
epoch : 23/100, recon loss = 0.00025425
epoch : 24/100, recon loss = 0.00025277
epoch : 25/100, recon loss = 0.00025210
epoch : 2

In [50]:
PATH = 'model_autoenc.pth'

torch.save(model, PATH)
model = torch.load(PATH)
model.eval()

  "type " + obj.__name__ + ". It won't be checked "


AE(
  (encoder_hidden_layer): Linear(in_features=19308, out_features=128, bias=True)
  (encoder_output_layer): Linear(in_features=128, out_features=10, bias=True)
  (decoder_hidden_layer): Linear(in_features=10, out_features=128, bias=True)
  (decoder_output_layer): Linear(in_features=128, out_features=19308, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

## Predict

Use model trained to predict all users

In [51]:
#df_user = group_by_trip(df_trip)
#

In [52]:
df_test_trip = pd.read_csv('../output/booking/dataset/test_0.1_10.csv',parse_dates=['start_trip', 'last_checkin'])
df_test_trip['start_trip_month'] = df_test_trip['start_trip'].dt.month
df_test_trip['last_checkin_month'] = df_test_trip['last_checkin'].dt.month


df_user_trip = group_by_trip(df_test_trip)
df_user_trip.sort_index()
df_user_trip.head()

Unnamed: 0,user_id,trip_count,count_unique_city,mean_trip_size,mode_last_checkin_month,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,mode_booker_country,country_count,mean_duration_sum,sum_duration_sum
0,469,1,6,5.0,12,mobile,12,9608,359,Gondal,1,6.0,6
1,680,1,4,5.0,8,mobile,8,48507,9452,Gondal,1,8.0,8
2,869,1,4,3.0,7,mobile,6,62036,359,Gondal,1,7.0,7
3,1120,1,5,4.0,7,desktop,7,32713,3449,Tcherkistan,1,19.0,19
4,1431,1,5,5.0,11,mobile,11,14549,359,The Devilfire Empire,1,9.0,9


In [53]:
df_trip.head()

Unnamed: 0,utrip_id,user_id,user_features,count_unique_city,trip_size,start_trip,end_trip,checkin_list,checkout_list,days_since_2016_list,...,last_booker_country,last_step,first_city_id,first_hotel_country,last_city_id,last_hotel_country,country_count,duration_sum,start_trip_month,last_checkin_month
0,1000027_1,1000027,"[-0.2721548080444336, -0.3261375427246094, -0....",4,3,2016-08-13,2016-08-18,"['0', '0', '0', '0', '0', '0', '2016-08-13', '...","['0', '0', '0', '0', '0', '0', '2016-08-14', '...","[0, 0, 0, 0, 0, 0, 225, 226, 228, 230]",...,Elbonia,4,8183,Gondal,30628,Gondal,10,8,8,8
1,1000033_1,1000033,"[-0.653695285320282, 0.9078158736228943, 0.579...",4,4,2016-04-09,2016-04-16,"['0', '0', '0', '0', '0', '2016-04-09', '2016-...","['0', '0', '0', '0', '0', '2016-04-11', '2016-...","[0, 0, 0, 0, 0, 99, 101, 102, 104, 106]",...,Gondal,5,38677,Cobra Island,38677,Cobra Island,122,10,4,4
2,1000045_1,1000045,"[-1.1103595495224, -1.2900782823562622, -0.307...",7,6,2016-06-18,2016-06-28,"['0', '0', '0', '2016-06-18', '2016-06-20', '2...","['0', '0', '0', '2016-06-20', '2016-06-22', '2...","[0, 0, 0, 169, 171, 173, 175, 176, 177, 179]",...,The Devilfire Empire,7,64876,Fook Island,36063,Gondal,143,11,6,6
3,1000083_1,1000083,"[1.3209058046340942, 0.19926407933235168, 0.57...",4,3,2016-06-13,2016-06-16,"['0', '0', '0', '0', '0', '0', '2016-06-13', '...","['0', '0', '0', '0', '0', '0', '2016-06-14', '...","[0, 0, 0, 0, 0, 0, 164, 165, 166, 167]",...,The Devilfire Empire,4,55990,Osterlich,36063,Gondal,122,5,6,6
4,100008_1,100008,"[-0.346758633852005, 0.11678697913885117, -1.0...",5,4,2016-07-18,2016-07-25,"['0', '0', '0', '0', '0', '2016-07-18', '2016-...","['0', '0', '0', '0', '0', '2016-07-21', '2016-...","[0, 0, 0, 0, 0, 199, 202, 203, 204, 206]",...,Gondal,5,11306,Kamistan,65690,Kamistan,31,9,7,7


In [54]:
df_all = pd.concat([df_user, df_user_trip])
df_all_transform  = col_transform.transform(df_all)
df_all_transform



<191841x19308 sparse matrix of type '<class 'numpy.float64'>'
	with 2301894 stored elements in Compressed Sparse Row format>

In [55]:
predict_loader = torch.utils.data.DataLoader(Dataset(df_all_transform), batch_size=batch_size, num_workers=0)
predict_loader

<torch.utils.data.dataloader.DataLoader at 0x7f741ca0e1d0>

In [56]:
data = []
for batch_features in predict_loader:
    # reshape mini-batch data to [N, 784] matrix
    # load it to the active device
    #batch_features #= batch_features.view(-1, df_transform.shape[1]).to(device)
    batch_features  = batch_features.to(device).to_dense()
    # reset the gradients back to zero
    # PyTorch accumulates gradients on subsequent backward passes
    data.extend(model.encoder(batch_features).cpu().detach().numpy().reshape(batch_features.shape[0],-1))

data = np.array(data)
data.shape

(191841, 10)

In [57]:
df_all['user_features'] = data.tolist()

In [58]:
df_all.head()

Unnamed: 0,user_id,trip_count,count_unique_city,mean_trip_size,mode_last_checkin_month,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,mode_booker_country,country_count,mean_duration_sum,sum_duration_sum,user_features
0,29,1,4,3.0,7,desktop,7,47054,8132,Elbonia,2,9.0,9,"[0.06258869171142578, -0.11153742671012878, -0..."
1,81,1,4,3.0,5,desktop,5,33665,9924,Elbonia,11,6.0,6,"[-0.1815657913684845, -0.28620046377182007, -0..."
2,136,2,11,4.5,4,desktop,4,38793,9924,The Devilfire Empire,137,7.0,14,"[0.02179218828678131, 0.04884636402130127, -0...."
3,156,1,7,7.0,8,mobile,8,2748,359,Gondal,3,9.0,9,"[-0.273824006319046, 0.31664836406707764, 0.31..."
4,160,1,4,3.0,8,tablet,7,17590,10332,Gondal,2,18.0,18,"[0.22664602100849152, -0.12496908009052277, 0...."


In [59]:
df_all.reset_index().to_csv('../output/booking/dataset/all_user_features.csv')

In [60]:
from numpy import asarray
from numpy import savetxt

# save to csv file
savetxt('data.csv', data, delimiter='\t')

In [61]:
df_all.reset_index().to_csv('metadata.csv', sep='\t')