# User Encoder - VAE

In [1]:
import numpy as np
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
from scipy import sparse
%matplotlib inline

sns.set_theme(style="ticks")

## Dataset

In [2]:
def read_dataset(path):
    df_trip = pd.read_csv(path,parse_dates=['start_trip', 'last_checkin'])
    df_trip['start_trip_month'] = df_trip['start_trip'].dt.month
    df_trip['last_checkin_month'] = df_trip['last_checkin'].dt.month

    for c in ['city_id_list',
                'device_class_list',
                'affiliate_id_list',
                'booker_country_list',
                'hotel_country_list']:
        df_trip[c] = df_trip[c].apply(eval)
    return df_trip.sort_values('start_trip')
df_trip = read_dataset('../output/booking/dataset/train_0.1_10.csv')        
df_trip.head()

Unnamed: 0,utrip_id,last_step,user_id,count_unique_city,trip_size,start_trip,end_trip,checkin_list,checkout_list,days_since_2016_list,...,last_booker_country,first_city_id,first_hotel_country,last_city_id,last_hotel_country,country_count,duration_sum,user_features,start_trip_month,last_checkin_month
32721,2000964_1,5,2000964,4,4,2015-12-31,2016-01-04,"[0, 0, 0, 0, 0, 16800.0, 16801.0, 16802.0, 168...","[0, 0, 0, 0, 0, 16801.0, 16802.0, 16803.0, 168...","[0, 0, 0, 0, 0, -1, 0, 1, 2, 3]",...,The Devilfire Empire,63341,Cobra Island,64071,Cobra Island,11,5,"[0.010669194161891937, -0.05803748965263367, 0...",12,1
174974,727105_1,4,727105,4,3,2015-12-31,2016-01-03,"[0, 0, 0, 0, 0, 0, 16800.0, 16801.0, 16802.0, ...","[0, 0, 0, 0, 0, 0, 16801.0, 16802.0, 16803.0, ...","[0, 0, 0, 0, 0, 0, -1, 0, 1, 2]",...,The Devilfire Empire,18820,Cobra Island,6851,Cobra Island,19,4,"[0.11124284565448761, -0.027489766478538513, -...",12,1
52192,2595109_1,4,2595109,2,3,2015-12-31,2016-01-06,"[0, 0, 0, 0, 0, 0, 16800.0, 16802.0, 16804.0, ...","[0, 0, 0, 0, 0, 0, 16801.0, 16804.0, 16806.0, ...","[0, 0, 0, 0, 0, 0, -1, 1, 3, 5]",...,The Devilfire Empire,27404,Cobra Island,16047,Cobra Island,985,6,"[0.11022190749645233, -0.14957962930202484, 0....",12,1
83281,3554942_1,4,3554942,4,3,2016-01-01,2016-01-04,"[0, 0, 0, 0, 0, 0, 16801.0, 16802.0, 16803.0, ...","[0, 0, 0, 0, 0, 0, 16802.0, 16803.0, 16804.0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 2, 3]",...,The Devilfire Empire,9161,Bozatta,20967,Bozatta,16,4,"[0.1122531071305275, -0.034550607204437256, -0...",1,1
135948,5163164_1,5,5163164,4,4,2016-01-01,2016-01-05,"[0, 0, 0, 0, 0, 16801.0, 16802.0, 16803.0, 168...","[0, 0, 0, 0, 0, 16802.0, 16803.0, 16804.0, 168...","[0, 0, 0, 0, 0, 0, 1, 2, 3, 4]",...,The Devilfire Empire,54603,Marina Venetta,12841,Marina Venetta,99,5,"[0.012650705873966217, -0.06454622000455856, -...",1,1


In [3]:
df_trip.iloc[0]

utrip_id                                                        2000964_1
last_step                                                               5
user_id                                                           2000964
count_unique_city                                                       4
trip_size                                                               4
start_trip                                            2015-12-31 00:00:00
end_trip                                                       2016-01-04
checkin_list            [0, 0, 0, 0, 0, 16800.0, 16801.0, 16802.0, 168...
checkout_list           [0, 0, 0, 0, 0, 16801.0, 16802.0, 16803.0, 168...
days_since_2016_list                      [0, 0, 0, 0, 0, -1, 0, 1, 2, 3]
duration_list                              [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
city_id_list               [0, 0, 0, 0, 0, 63341, 66972, 64071, 31972, M]
device_class_list       [0, 0, 0, 0, 0, mobile, mobile, mobile, mobile...
affiliate_id_list           [0, 0, 0, 

## Geral User Features

In [4]:
import scipy

def mode(x):
    return scipy.stats.mode(x)[0]

def sum_list(x):
    return list(np.unique(sum(x, [])))

def group_by_trip(df_trip):

    df_user = df_trip.groupby(['user_id']).agg(
        trip_count=('count_unique_city', 'count'),
        sum_count_unique_city=('count_unique_city', 'sum'),
        mean_count_unique_city=('count_unique_city', 'mean'),
        sum_trip_size=('trip_size', 'sum'),
        mean_trip_size=('trip_size', 'mean'),
        mode_trip_month=('start_trip_month', mode),
        trip_month_list=('start_trip_month', list),
        count_uniq_trip_month=('start_trip_month', pd.Series.nunique),
        sum_duration_sum=('duration_sum', 'sum'),        
        mean_duration_sum=('duration_sum', 'mean'),
        sum_last_step=('last_step', 'sum'),        
        mean_last_step=('last_step', 'mean'),        
        city_id_list=('city_id_list', sum_list),
        device_class_list=('device_class_list', sum_list),
        affiliate_id_list=('affiliate_id_list', sum_list),
        booker_country_list=('booker_country_list', sum_list),
        hotel_country_list=('hotel_country_list', sum_list)
    )
    
    df_user             = df_user.reset_index()
    df_user['user_id']  = df_user['user_id'].astype('int')

    for c in ['mode_trip_month']:
        df_user[c]  = df_user[c].astype('str')
        
    return df_user

df_user = group_by_trip(df_trip) #.sample(10000, random_state=42)
df_user.sort_index().iloc[1]

user_id                                            81
trip_count                                          1
sum_count_unique_city                               4
mean_count_unique_city                              4
sum_trip_size                                       3
mean_trip_size                                      3
mode_trip_month                                     5
trip_month_list                                   [5]
count_uniq_trip_month                               1
sum_duration_sum                                    6
mean_duration_sum                                   6
sum_last_step                                       4
mean_last_step                                      4
city_id_list              [0, 15506, 22065, 33665, M]
device_class_list                        [0, desktop]
affiliate_id_list                           [0, 9924]
booker_country_list                      [0, Elbonia]
hotel_country_list          [0, Elbonia, Leutonia, M]
Name: 1, dtype: object

In [5]:
#df_user_all.to_csv('../output/booking/dataset/user_features_raw.csv')

## Transform Dataset

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
df = df_user.set_index('user_id')
df.head()

Unnamed: 0_level_0,trip_count,sum_count_unique_city,mean_count_unique_city,sum_trip_size,mean_trip_size,mode_trip_month,trip_month_list,count_uniq_trip_month,sum_duration_sum,mean_duration_sum,sum_last_step,mean_last_step,city_id_list,device_class_list,affiliate_id_list,booker_country_list,hotel_country_list
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
29,1,4,4.0,3,3.0,7,[7],1,9,9.0,4,4.0,"[0, 12291, 34444, 47054, M]","[0, desktop]","[0, 1601, 8132]","[0, Elbonia]","[0, Elbonia, M]"
81,1,4,4.0,3,3.0,5,[5],1,6,6.0,4,4.0,"[0, 15506, 22065, 33665, M]","[0, desktop]","[0, 9924]","[0, Elbonia]","[0, Elbonia, Leutonia, M]"
136,2,11,5.5,9,4.5,4,"[4, 9]",2,14,7.0,11,5.5,"[0, 28545, 38793, 43323, 45399, 46411, 51685, ...","[0, desktop]","[0, 9924]","[0, The Devilfire Empire]","[0, M, Osterlich, Slaka, Sylvania]"
149,1,2,2.0,3,3.0,2,[2],1,6,6.0,4,4.0,"[0, 10485, M]","[0, desktop]","[0, 3417, 5583]","[0, The Devilfire Empire]","[0, Kangan, M]"
156,1,7,7.0,7,7.0,8,[8],1,9,9.0,8,8.0,"[0, 16546, 19335, 2748, 39132, 41971, 48310, M]","[0, mobile]","[0, 359, 4775]","[0, Gondal]","[0, Kazahrus, Leutonia, M, Urkesh]"


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171366 entries, 29 to 6258087
Data columns (total 17 columns):
trip_count                171366 non-null int64
sum_count_unique_city     171366 non-null int64
mean_count_unique_city    171366 non-null float64
sum_trip_size             171366 non-null int64
mean_trip_size            171366 non-null float64
mode_trip_month           171366 non-null object
trip_month_list           171366 non-null object
count_uniq_trip_month     171366 non-null int64
sum_duration_sum          171366 non-null int64
mean_duration_sum         171366 non-null float64
sum_last_step             171366 non-null int64
mean_last_step            171366 non-null float64
city_id_list              171366 non-null object
device_class_list         171366 non-null object
affiliate_id_list         171366 non-null object
booker_country_list       171366 non-null object
hotel_country_list        171366 non-null object
dtypes: float64(4), int64(6), object(7)
memory usage: 23

In [9]:
df.select_dtypes(include=['int64', 'float64']).columns

Index(['trip_count', 'sum_count_unique_city', 'mean_count_unique_city',
       'sum_trip_size', 'mean_trip_size', 'count_uniq_trip_month',
       'sum_duration_sum', 'mean_duration_sum', 'sum_last_step',
       'mean_last_step'],
      dtype='object')

In [10]:
# determine categorical and numerical features
numerical_ix   = df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = df.select_dtypes(include=['object', 'bool']).columns

In [11]:
df[categorical_ix].head()

Unnamed: 0_level_0,mode_trip_month,trip_month_list,city_id_list,device_class_list,affiliate_id_list,booker_country_list,hotel_country_list
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
29,7,[7],"[0, 12291, 34444, 47054, M]","[0, desktop]","[0, 1601, 8132]","[0, Elbonia]","[0, Elbonia, M]"
81,5,[5],"[0, 15506, 22065, 33665, M]","[0, desktop]","[0, 9924]","[0, Elbonia]","[0, Elbonia, Leutonia, M]"
136,4,"[4, 9]","[0, 28545, 38793, 43323, 45399, 46411, 51685, ...","[0, desktop]","[0, 9924]","[0, The Devilfire Empire]","[0, M, Osterlich, Slaka, Sylvania]"
149,2,[2],"[0, 10485, M]","[0, desktop]","[0, 3417, 5583]","[0, The Devilfire Empire]","[0, Kangan, M]"
156,8,[8],"[0, 16546, 19335, 2748, 39132, 41971, 48310, M]","[0, mobile]","[0, 359, 4775]","[0, Gondal]","[0, Kazahrus, Leutonia, M, Urkesh]"


In [12]:
df[numerical_ix].head()

Unnamed: 0_level_0,trip_count,sum_count_unique_city,mean_count_unique_city,sum_trip_size,mean_trip_size,count_uniq_trip_month,sum_duration_sum,mean_duration_sum,sum_last_step,mean_last_step
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
29,1,4,4.0,3,3.0,1,9,9.0,4,4.0
81,1,4,4.0,3,3.0,1,6,6.0,4,4.0
136,2,11,5.5,9,4.5,2,14,7.0,11,5.5
149,1,2,2.0,3,3.0,1,6,6.0,4,4.0
156,1,7,7.0,7,7.0,1,9,9.0,8,8.0


In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import coo_matrix, hstack, vstack
class DfMultiLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = None
        self.T = {}
        #

    def fit(self, X, y=None):
        self.columns = list(X.columns)
        
        for c in self.columns:
            self.T[c] = CountVectorizer(analyzer=set)
            self.T[c].fit(X[c])
        
        return self
    
    def transform(self, X, y=None):
        t_X = []
        for c in self.columns:
            d = self.T[c].transform(X[c])
            t_X.append(d)
        #print(t_X[0])
        return hstack(t_X)
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X, y)

In [14]:
# define the data preparation for the columns
t = [
     ('cat', DfMultiLabelBinarizer(), categorical_ix), 
     ('num', StandardScaler(), numerical_ix)
    ]

col_transform = ColumnTransformer(transformers=t)
df_transform  = col_transform.fit_transform(df)
df_transform

<171366x37517 sparse matrix of type '<class 'numpy.float64'>'
	with 4951140 stored elements in Compressed Sparse Row format>

In [15]:
df_transform.shape

(171366, 37517)

In [16]:
# from scipy import sparse

# sparse.save_npz("../output/booking/dataset/train_user_features.npz", df_transform)
# df.reset_index().to_csv('../output/booking/dataset/train_user_features.csv', index=False)

In [17]:
#df_transform[:2]

## Train Autoencoder

In [18]:
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

seed = 42
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [19]:
# Params
batch_size   = 512
epochs       = 100
learning_rate = 1e-3
emb_size    = 100

In [20]:
#pd.DataFrame(df_transform.todense())

In [21]:
# train_loader = torch.utils.data.DataLoader(
#     pd.DataFrame(df_transform.todense()), batch_size=batch_size, shuffle=True
# )
import torch.utils.data as D

# next(train_loader)
class Dataset(D.Dataset):
    
    def __init__(self, sparse):
        self.data = sparse

    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, index):
        # convert it to COO so to get the atributes to create a sparse tensor
        data = self.data[index].tocoo()
        i = torch.LongTensor(np.vstack((data.row, data.col)))
        v = torch.FloatTensor(data.data)
        data = torch.sparse.FloatTensor(i, v, torch.Size(data.shape))
        return data

d = Dataset(df_transform)
d.__getitem__([1])

tensor(indices=tensor([[    0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0],
                       [    5,    14,    22,  3065,  6763, 13330, 34282, 34283,
                        34284, 34287, 37278, 37308, 37310, 37314, 37361, 37405,
                        37410, 37507, 37508, 37509, 37510, 37511, 37512, 37513,
                        37514, 37515, 37516]]),
       values=tensor([ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
                       1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
                       1.0000,  1.0000,  1.0000,  1.0000,  1.0000, -0.2269,
                      -0.3845, -0.3261, -0.7065, -0.8058, -0.2094, -0.7548,
                      -0.7613, -0.6628, -0.8058]),
       size=(1, 37517), nnz=27, layout=torch.sparse_coo)

In [22]:
train_loader = torch.utils.data.DataLoader(d, 
                                           batch_size=batch_size, 
                                           num_workers=0)
len(d)

171366

In [23]:
#  use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # create a model from `AE` autoencoder class
# # load it to the specified device, either gpu or cpu
# model = AE(input_shape=df_transform.shape[1]).to(device)

# # create an optimizer object
# # Adam optimizer with learning rate 1e-3
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# # mean-squared error loss
# criterion = nn.MSELoss()

In [24]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl

In [25]:
class LitAutoEncoder(pl.LightningModule):

    def __init__(self, input_data, emb_size = 10, dropout=0.3):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_data, 512), nn.ReLU(), nn.Linear(512, emb_size))
        self.decoder = nn.Sequential(nn.Linear(emb_size, 512), nn.ReLU(), nn.Linear(512, input_data))
        self.dropout = nn.Dropout(dropout)
        
    def normalize(self, x: torch.Tensor, dim: int = 1) -> torch.Tensor:
        x = F.normalize(x, p=2, dim=dim)
        return x
    
    def forward(self, x):
        # in lightning, forward defines the prediction/inference actions
        embedding = self.dropout(self.normalize(self.encoder(x)))
        return embedding

    def training_step(self, batch, batch_idx):
        # training_step defined the train loop. It is independent of forward
        x = batch.to_dense()
        x = x.view(x.size(0), -1)
        z = self.forward(x)
        x_hat = self.decoder(z)
        loss = F.mse_loss(x_hat, x)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [26]:
# Model
autoencoder = LitAutoEncoder(input_data=df_transform.shape[1], emb_size=emb_size)

# Train
trainer     = pl.Trainer(max_epochs=epochs, gpus=1)
trainer.fit(autoencoder, train_loader, train_loader)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 19.3 M
1 | decoder | Sequential | 19.3 M
2 | dropout | Dropout    | 0     
---------------------------------------
38.6 M    Trainable params
0         Non-trainable params
38.6 M    Total params
I0122 16:38:22.117249 140095783147328 lightning.py:1343] 
  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 19.3 M
1 | decoder | Sequential | 19.3 M
2 | dropout | Dropout    | 0     
---------------------------------------
38.6 M    Trainable params
0         Non-trainable params
38.6 M    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1

In [27]:
model = autoencoder

In [28]:
PATH = 'model_autoenc.pth'

torch.save(model, PATH)
model = torch.load(PATH)
model.eval()

LitAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=37517, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=100, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=37517, bias=True)
  )
  (dropout): Dropout(p=0.3, inplace=False)
)

## Predict

Use model trained to predict all users

In [31]:
#est_trip[c] = df_test_trip[c].apply(eval)
    
df_test_trip = read_dataset('../output/booking/dataset/test_0.1_10.csv')        

df_test_trip = pd.concat([df_trip, df_test_trip]).sort_values('start_trip')

df_test_trip = group_by_trip(df_test_trip)
df_test_trip.sort_index()
df_test_trip.head()

Unnamed: 0,user_id,trip_count,sum_count_unique_city,mean_count_unique_city,sum_trip_size,mean_trip_size,mode_trip_month,trip_month_list,count_uniq_trip_month,sum_duration_sum,mean_duration_sum,sum_last_step,mean_last_step,city_id_list,device_class_list,affiliate_id_list,booker_country_list,hotel_country_list
0,29,1,4,4.0,3,3.0,7,[7],1,9,9.0,4,4.0,"[0, 12291, 34444, 47054, M]","[0, desktop]","[0, 1601, 8132]","[0, Elbonia]","[0, Elbonia, M]"
1,81,1,4,4.0,3,3.0,5,[5],1,6,6.0,4,4.0,"[0, 15506, 22065, 33665, M]","[0, desktop]","[0, 9924]","[0, Elbonia]","[0, Elbonia, Leutonia, M]"
2,136,2,11,5.5,9,4.5,4,"[4, 9]",2,14,7.0,11,5.5,"[0, 28545, 38793, 43323, 45399, 46411, 51685, ...","[0, desktop]","[0, 9924]","[0, The Devilfire Empire]","[0, M, Osterlich, Slaka, Sylvania]"
3,149,1,2,2.0,3,3.0,2,[2],1,6,6.0,4,4.0,"[0, 10485, M]","[0, desktop]","[0, 3417, 5583]","[0, The Devilfire Empire]","[0, Kangan, M]"
4,156,1,7,7.0,7,7.0,8,[8],1,9,9.0,8,8.0,"[0, 16546, 19335, 2748, 39132, 41971, 48310, M]","[0, mobile]","[0, 359, 4775]","[0, Gondal]","[0, Kazahrus, Leutonia, M, Urkesh]"


In [32]:
df_test_trip.head()

Unnamed: 0,user_id,trip_count,sum_count_unique_city,mean_count_unique_city,sum_trip_size,mean_trip_size,mode_trip_month,trip_month_list,count_uniq_trip_month,sum_duration_sum,mean_duration_sum,sum_last_step,mean_last_step,city_id_list,device_class_list,affiliate_id_list,booker_country_list,hotel_country_list
0,29,1,4,4.0,3,3.0,7,[7],1,9,9.0,4,4.0,"[0, 12291, 34444, 47054, M]","[0, desktop]","[0, 1601, 8132]","[0, Elbonia]","[0, Elbonia, M]"
1,81,1,4,4.0,3,3.0,5,[5],1,6,6.0,4,4.0,"[0, 15506, 22065, 33665, M]","[0, desktop]","[0, 9924]","[0, Elbonia]","[0, Elbonia, Leutonia, M]"
2,136,2,11,5.5,9,4.5,4,"[4, 9]",2,14,7.0,11,5.5,"[0, 28545, 38793, 43323, 45399, 46411, 51685, ...","[0, desktop]","[0, 9924]","[0, The Devilfire Empire]","[0, M, Osterlich, Slaka, Sylvania]"
3,149,1,2,2.0,3,3.0,2,[2],1,6,6.0,4,4.0,"[0, 10485, M]","[0, desktop]","[0, 3417, 5583]","[0, The Devilfire Empire]","[0, Kangan, M]"
4,156,1,7,7.0,7,7.0,8,[8],1,9,9.0,8,8.0,"[0, 16546, 19335, 2748, 39132, 41971, 48310, M]","[0, mobile]","[0, 359, 4775]","[0, Gondal]","[0, Kazahrus, Leutonia, M, Urkesh]"


In [33]:
#df_all = pd.concat([df_user, df_test_trip])
df_all = df_test_trip
df_all_transform  = col_transform.transform(df_all)
df_all_transform



<190086x37517 sparse matrix of type '<class 'numpy.float64'>'
	with 5501666 stored elements in Compressed Sparse Row format>

In [34]:
predict_loader = torch.utils.data.DataLoader(Dataset(df_all_transform), batch_size=batch_size, num_workers=0)
predict_loader

<torch.utils.data.dataloader.DataLoader at 0x7f698021b908>

In [35]:

data = []
model.eval()
model.to(device)

with torch.no_grad():
    for batch_features in predict_loader:
        # reshape mini-batch data to [N, 784] matrix
        # load it to the active device
        #batch_features #= batch_features.view(-1, df_transform.shape[1]).to(device)
        batch_features  = batch_features.to(device).to_dense()
        # reset the gradients back to zero
        # PyTorch accumulates gradients on subsequent backward passes
        data.extend(model(batch_features).cpu().detach().numpy().reshape(batch_features.shape[0],-1))

data = np.array(data)
data.shape

(190086, 100)

In [36]:
df_all['user_features'] = data.tolist()

In [37]:
df_all.head()

Unnamed: 0,user_id,trip_count,sum_count_unique_city,mean_count_unique_city,sum_trip_size,mean_trip_size,mode_trip_month,trip_month_list,count_uniq_trip_month,sum_duration_sum,mean_duration_sum,sum_last_step,mean_last_step,city_id_list,device_class_list,affiliate_id_list,booker_country_list,hotel_country_list,user_features
0,29,1,4,4.0,3,3.0,7,[7],1,9,9.0,4,4.0,"[0, 12291, 34444, 47054, M]","[0, desktop]","[0, 1601, 8132]","[0, Elbonia]","[0, Elbonia, M]","[-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1..."
1,81,1,4,4.0,3,3.0,5,[5],1,6,6.0,4,4.0,"[0, 15506, 22065, 33665, M]","[0, desktop]","[0, 9924]","[0, Elbonia]","[0, Elbonia, Leutonia, M]","[-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, ..."
2,136,2,11,5.5,9,4.5,4,"[4, 9]",2,14,7.0,11,5.5,"[0, 28545, 38793, 43323, 45399, 46411, 51685, ...","[0, desktop]","[0, 9924]","[0, The Devilfire Empire]","[0, M, Osterlich, Slaka, Sylvania]","[-1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0..."
3,149,1,2,2.0,3,3.0,2,[2],1,6,6.0,4,4.0,"[0, 10485, M]","[0, desktop]","[0, 3417, 5583]","[0, The Devilfire Empire]","[0, Kangan, M]","[-1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, -1.0, ..."
4,156,1,7,7.0,7,7.0,8,[8],1,9,9.0,8,8.0,"[0, 16546, 19335, 2748, 39132, 41971, 48310, M]","[0, mobile]","[0, 359, 4775]","[0, Gondal]","[0, Kazahrus, Leutonia, M, Urkesh]","[-1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1..."


In [38]:
df_all.reset_index().to_csv('../output/booking/dataset/all_user_features_{}.csv'.format(emb_size))

In [39]:
from numpy import asarray
from numpy import savetxt

# save to csv file
savetxt('data.csv', data, delimiter='\t')

In [None]:
df_all.reset_index().to_csv('metadata.csv', sep='\t')