# User Encoder - VAE

In [6]:
import numpy as np
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
from scipy import sparse
%matplotlib inline

sns.set_theme(style="ticks")

## Dataset

In [7]:
df_all = pd.read_csv('../data/Booking/booking_train_set.csv', 
                 dtype={"user_id": str, "city_id": str, 'affiliate_id': str,
                       'utrip_id': str},parse_dates=['checkin', 'checkout'])

df_all.head()

Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
0,0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1
1,1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1
2,2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1
3,3,1006220,2016-04-16,2016-04-17,24144,desktop,384,Gondal,Gondal,1006220_1
4,4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1


## Geral User Features

In [8]:
# sample_days = 500
# test_days   = 30

# # Split Data
# max_timestamp        = df_all.checkout.max()
# init_train_timestamp = max_timestamp - timedelta(days = sample_days)
# init_test_timestamp  = max_timestamp - timedelta(days = test_days)

# # TODO Garantir que o usuário fique com a sessão no train ou test
# df_train = df_all[(df_all.checkout >= init_train_timestamp) & (df_all.checkout < init_test_timestamp)]
from sklearn.model_selection import train_test_split

df_trip = df_all[['utrip_id']].drop_duplicates()
df_train, df_test = train_test_split(df_trip, test_size=0.1, random_state=42)
df_train, df_test = df_all[df_all['utrip_id'].isin(df_train['utrip_id'])].sort_values('checkin'), \
                    df_all[df_all['utrip_id'].isin(df_test['utrip_id'])].sort_values('checkin')
print(df_train.shape, df_test.shape)

df_train['visit'] = 1
df_train.head()

(1049635, 10) (117200, 10)


Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,visit
7504,7504,2595109,2015-12-31,2016-01-01,27404,mobile,359,The Devilfire Empire,Cobra Island,2595109_1,1
986596,986596,2000964,2015-12-31,2016-01-01,63341,mobile,8151,The Devilfire Empire,Cobra Island,2000964_1,1
1104472,1104472,2379328,2016-01-01,2016-01-02,65663,mobile,3449,Tcherkistan,Oceania,2379328_1,1
788050,788050,1379517,2016-01-01,2016-01-03,47499,mobile,7360,Elbonia,Kangan,1379517_1,1
765551,765551,2147992,2016-01-01,2016-01-04,51259,mobile,9452,Gondal,Oceania,2147992_1,1


In [27]:
# change here to predict
#df_user = df_train
df_user = df_all

In [28]:

df_user['start_trip_month'] = df_user['checkin'].dt.month
df_user['duration']  = (df_user['checkout'] - df_user['checkin']).dt.days

df_user.head()

Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,start_trip_month,duration
0,0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1,4,2
1,1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1,4,1
2,2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1,4,4
3,3,1006220,2016-04-16,2016-04-17,24144,desktop,384,Gondal,Gondal,1006220_1,4,1
4,4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,7,1


In [29]:
import scipy
def mode(x):
    return scipy.stats.mode(x)[0]

df_user_geral = df_user.groupby(['user_id']).agg(
    checkins_count=('checkin', 'count'),
    utrip_id_count=('utrip_id', pd.Series.nunique),
    duration_mean=('duration', 'mean'),
    mode_booker_country=('booker_country', mode),
    mode_device_class=('device_class', mode),
    mode_trip_month=('start_trip_month', mode),
    mode_city_id=('city_id', mode),
    mode_affiliate_id=('affiliate_id', mode),
)

df_user_geral.sort_index()

Unnamed: 0_level_0,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000027,4,1,2.000000,Elbonia,desktop,8,15626,7168
1000033,5,1,2.000000,Gondal,desktop,4,38677,384
1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359
100008,5,1,1.800000,Gondal,desktop,7,11306,8436
1000083,4,1,1.250000,The Devilfire Empire,mobile,6,14705,359
...,...,...,...,...,...,...,...,...
999776,4,1,1.000000,Gondal,desktop,3,17775,4541
999839,4,1,2.000000,The Devilfire Empire,mobile,8,8335,359
999842,4,1,1.000000,Gondal,desktop,5,24036,3894
999855,15,1,1.533333,Gondal,mobile,4,38509,359


In [30]:
df_user_geral = df_user_geral.reset_index()
df_user_geral['user_id']  = df_user_geral['user_id'].astype('int')

## Trip  User Features

In [31]:
df_trip = pd.read_csv('../output/booking/dataset/train_0.1_10.csv')
df_trip.head()

Unnamed: 0,utrip_id,user_id,user_features,count_unique_city,trip_size,start_trip,end_trip,checkin_list,checkout_list,days_since_2016_list,...,last_device_class,last_affiliate_id,last_booker_country,last_step,first_city_id,first_hotel_country,last_city_id,last_hotel_country,country_count,duration_sum
0,1000027_1,1000027,"[-0.2721548080444336, -0.3261375427246094, -0....",4,3,2016-08-13,2016-08-18,"['0', '0', '0', '0', '0', '0', '2016-08-13', '...","['0', '0', '0', '0', '0', '0', '2016-08-14', '...","[0, 0, 0, 0, 0, 0, 225, 226, 228, 230]",...,desktop,253,Elbonia,4,8183,Gondal,30628,Gondal,10,8
1,1000033_1,1000033,"[-0.653695285320282, 0.9078158736228943, 0.579...",4,4,2016-04-09,2016-04-16,"['0', '0', '0', '0', '0', '2016-04-09', '2016-...","['0', '0', '0', '0', '0', '2016-04-11', '2016-...","[0, 0, 0, 0, 0, 99, 101, 102, 104, 106]",...,desktop,384,Gondal,5,38677,Cobra Island,38677,Cobra Island,122,10
2,1000045_1,1000045,"[-1.1103595495224, -1.2900782823562622, -0.307...",7,6,2016-06-18,2016-06-28,"['0', '0', '0', '2016-06-18', '2016-06-20', '2...","['0', '0', '0', '2016-06-20', '2016-06-22', '2...","[0, 0, 0, 169, 171, 173, 175, 176, 177, 179]",...,mobile,359,The Devilfire Empire,7,64876,Fook Island,36063,Gondal,143,11
3,1000083_1,1000083,"[1.3209058046340942, 0.19926407933235168, 0.57...",4,3,2016-06-13,2016-06-16,"['0', '0', '0', '0', '0', '0', '2016-06-13', '...","['0', '0', '0', '0', '0', '0', '2016-06-14', '...","[0, 0, 0, 0, 0, 0, 164, 165, 166, 167]",...,mobile,359,The Devilfire Empire,4,55990,Osterlich,36063,Gondal,122,5
4,100008_1,100008,"[-0.346758633852005, 0.11678697913885117, -1.0...",5,4,2016-07-18,2016-07-25,"['0', '0', '0', '0', '0', '2016-07-18', '2016-...","['0', '0', '0', '0', '0', '2016-07-21', '2016-...","[0, 0, 0, 0, 0, 199, 202, 203, 204, 206]",...,desktop,8436,Gondal,5,11306,Kamistan,65690,Kamistan,31,9


In [32]:
df_trip.iloc[0]

utrip_id                                                        1000027_1
user_id                                                           1000027
user_features           [-0.2721548080444336, -0.3261375427246094, -0....
count_unique_city                                                       4
trip_size                                                               3
start_trip                                                     2016-08-13
end_trip                                                       2016-08-18
checkin_list            ['0', '0', '0', '0', '0', '0', '2016-08-13', '...
checkout_list           ['0', '0', '0', '0', '0', '0', '2016-08-14', '...
days_since_2016_list               [0, 0, 0, 0, 0, 0, 225, 226, 228, 230]
duration_list                              [0, 0, 0, 0, 0, 0, 1, 2, 2, 3]
city_id_list            ['0', '0', '0', '0', '0', '0', '8183', '15626'...
device_class_list       ['0', '0', '0', '0', '0', '0', 'desktop', 'des...
affiliate_id_list       ['0', '0', '0'

In [33]:
df_user_trip = df_trip.groupby(['user_id']).agg(
    trip_duration_mean=('duration_sum', 'mean'),
    first_city_id_last_trip=('first_city_id', 'last'),
    first_hotel_country_last_trip=('first_hotel_country', 'last'),
    
)
df_user_trip = df_user_trip.reset_index()
df_user_trip['user_id'] = df_user_trip['user_id'].astype('int')

In [34]:
df_user_all = df_user_geral.merge(df_user_trip, how="left").fillna(0)
df_user_all['first_city_id_last_trip'] = df_user_all['first_city_id_last_trip'].astype('str')
df_user_all['first_hotel_country_last_trip'] = df_user_all['first_hotel_country_last_trip'].astype('str')
df_user_all

Unnamed: 0,user_id,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,trip_duration_mean,first_city_id_last_trip,first_hotel_country_last_trip
0,1000027,4,1,2.000000,Elbonia,desktop,8,15626,7168,8.0,8183.0,Gondal
1,1000033,5,1,2.000000,Gondal,desktop,4,38677,384,10.0,38677.0,Cobra Island
2,1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359,11.0,64876.0,Fook Island
3,100008,5,1,1.800000,Gondal,desktop,7,11306,8436,9.0,11306.0,Kamistan
4,1000083,4,1,1.250000,The Devilfire Empire,mobile,6,14705,359,5.0,55990.0,Osterlich
...,...,...,...,...,...,...,...,...,...,...,...,...
200148,999776,4,1,1.000000,Gondal,desktop,3,17775,4541,4.0,17775.0,Novistrana
200149,999839,4,1,2.000000,The Devilfire Empire,mobile,8,8335,359,8.0,8335.0,Cobra Island
200150,999842,4,1,1.000000,Gondal,desktop,5,24036,3894,4.0,51291.0,Glubbdubdrib
200151,999855,15,1,1.533333,Gondal,mobile,4,38509,359,0.0,0.0,0


In [35]:
df_user_all.describe()

Unnamed: 0,user_id,checkins_count,utrip_id_count,duration_mean,mode_trip_month,trip_duration_mean
count,200153.0,200153.0,200153.0,200153.0,200153.0,200153.0
mean,3126968.0,5.829715,1.087598,1.799346,6.976603,7.549838
std,1806523.0,3.021691,0.366366,0.80535,2.80367,4.622135
min,29.0,1.0,1.0,1.0,1.0,0.0
25%,1561836.0,4.0,1.0,1.25,5.0,5.0
50%,3124280.0,5.0,1.0,1.6,7.0,7.0
75%,4692002.0,6.0,1.0,2.2,9.0,10.0
max,6258087.0,172.0,20.0,30.0,12.0,21.0


In [17]:
df_user_all.to_csv('../output/booking/dataset/user_features_raw.csv')

## Transform Dataset

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [37]:
df = df_user_all.set_index('user_id')
df.head()

Unnamed: 0_level_0,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,trip_duration_mean,first_city_id_last_trip,first_hotel_country_last_trip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000027,4,1,2.0,Elbonia,desktop,8,15626,7168,8.0,8183.0,Gondal
1000033,5,1,2.0,Gondal,desktop,4,38677,384,10.0,38677.0,Cobra Island
1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359,11.0,64876.0,Fook Island
100008,5,1,1.8,Gondal,desktop,7,11306,8436,9.0,11306.0,Kamistan
1000083,4,1,1.25,The Devilfire Empire,mobile,6,14705,359,5.0,55990.0,Osterlich


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200153 entries, 1000027 to 999944
Data columns (total 11 columns):
checkins_count                   200153 non-null int64
utrip_id_count                   200153 non-null int64
duration_mean                    200153 non-null float64
mode_booker_country              200153 non-null object
mode_device_class                200153 non-null object
mode_trip_month                  200153 non-null int64
mode_city_id                     200153 non-null object
mode_affiliate_id                200153 non-null object
trip_duration_mean               200153 non-null float64
first_city_id_last_trip          200153 non-null object
first_hotel_country_last_trip    200153 non-null object
dtypes: float64(2), int64(3), object(6)
memory usage: 18.3+ MB


In [39]:
df.select_dtypes(include=['int64', 'float64']).columns

Index(['checkins_count', 'utrip_id_count', 'duration_mean', 'mode_trip_month',
       'trip_duration_mean'],
      dtype='object')

In [40]:
# determine categorical and numerical features
numerical_ix   = df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = df.select_dtypes(include=['object', 'bool']).columns

In [41]:
df[categorical_ix]

Unnamed: 0_level_0,mode_booker_country,mode_device_class,mode_city_id,mode_affiliate_id,first_city_id_last_trip,first_hotel_country_last_trip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000027,Elbonia,desktop,15626,7168,8183.0,Gondal
1000033,Gondal,desktop,38677,384,38677.0,Cobra Island
1000045,The Devilfire Empire,desktop,31817,359,64876.0,Fook Island
100008,Gondal,desktop,11306,8436,11306.0,Kamistan
1000083,The Devilfire Empire,mobile,14705,359,55990.0,Osterlich
...,...,...,...,...,...,...
999776,Gondal,desktop,17775,4541,17775.0,Novistrana
999839,The Devilfire Empire,mobile,8335,359,8335.0,Cobra Island
999842,Gondal,desktop,24036,3894,51291.0,Glubbdubdrib
999855,Gondal,mobile,38509,359,0.0,0


In [24]:
# define the data preparation for the columns
t = [
     ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix), 
     ('num', StandardScaler(), numerical_ix)
    ]

col_transform = ColumnTransformer(transformers=t)
df_transform  = col_transform.fit_transform(df)
df_transform

<181480x32979 sparse matrix of type '<class 'numpy.float64'>'
	with 1996280 stored elements in Compressed Sparse Row format>

In [25]:
from scipy import sparse

sparse.save_npz("../output/booking/dataset/train_user_features.npz", df_transform)
df.reset_index().to_csv('../output/booking/dataset/train_user_features.csv', index=False)

## Train VAE

In [26]:
#   python train.py \
#   --dataset '/media/workspace/booking_challenge/output/booking/dataset/train_user_features.npz' \
#   --original_dim 32979 \
#   --intermediate_dim 64 \
#   --latent_dim 10 \
#   --batch 128 \
#   --lr 1e-4 \
#   --epochs 100

## Predict

Use model trained to predict all users

In [42]:
df

Unnamed: 0_level_0,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,trip_duration_mean,first_city_id_last_trip,first_hotel_country_last_trip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000027,4,1,2.000000,Elbonia,desktop,8,15626,7168,8.0,8183.0,Gondal
1000033,5,1,2.000000,Gondal,desktop,4,38677,384,10.0,38677.0,Cobra Island
1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359,11.0,64876.0,Fook Island
100008,5,1,1.800000,Gondal,desktop,7,11306,8436,9.0,11306.0,Kamistan
1000083,4,1,1.250000,The Devilfire Empire,mobile,6,14705,359,5.0,55990.0,Osterlich
...,...,...,...,...,...,...,...,...,...,...,...
999776,4,1,1.000000,Gondal,desktop,3,17775,4541,4.0,17775.0,Novistrana
999839,4,1,2.000000,The Devilfire Empire,mobile,8,8335,359,8.0,8335.0,Cobra Island
999842,4,1,1.000000,Gondal,desktop,5,24036,3894,4.0,51291.0,Glubbdubdrib
999855,15,1,1.533333,Gondal,mobile,4,38509,359,0.0,0.0,0


In [43]:
df_transform_all  = col_transform.transform(df)
df_transform_all

<200153x32979 sparse matrix of type '<class 'numpy.float64'>'
	with 2200076 stored elements in Compressed Sparse Row format>

In [45]:
from scipy import sparse
sparse.save_npz("../output/booking/dataset/all_user_features.npz", df_transform_all)
df.reset_index().to_csv('../output/booking/dataset/all_user_features.csv', index=False)

In [None]:
#predict

# python predict.py \
# --dataset '/media/workspace/booking_challenge/output/booking/dataset/all_user_features.npz' \
# --load_model '/media/workspace/vae/output/20210115-113523' \
# --original_dim 32979 \
# --intermediate_dim 64 \
# --latent_dim 10

### Load and Save

In [None]:
# Load predicted file

In [46]:
df.head()

Unnamed: 0_level_0,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,trip_duration_mean,first_city_id_last_trip,first_hotel_country_last_trip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000027,4,1,2.0,Elbonia,desktop,8,15626,7168,8.0,8183.0,Gondal
1000033,5,1,2.0,Gondal,desktop,4,38677,384,10.0,38677.0,Cobra Island
1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359,11.0,64876.0,Fook Island
100008,5,1,1.8,Gondal,desktop,7,11306,8436,9.0,11306.0,Kamistan
1000083,4,1,1.25,The Devilfire Empire,mobile,6,14705,359,5.0,55990.0,Osterlich


In [None]:
# python predict.py \
# --dataset '/media/workspace/booking_challenge/output/booking/dataset/all_user_features.npz' \
# --load_model '/media/workspace/vae/output/20210115-113523/weigths' \
# --original_dim 32979 \
# --intermediate_dim 64 \
# --latent_dim 10

In [47]:
# Copy file 
!cp /media/workspace/vae/output/20210115-144921/latent_space.npy /media/workspace/booking_challenge/output/booking/dataset/all_user_features.npy

In [48]:
np_user_features = np.load('../output/booking/dataset/all_user_features.npy')
df.shape, np_user_features.shape

((200153, 11), (200153, 10))

In [49]:
df['user_features'] = np_user_features.tolist()

In [50]:
df.head()

Unnamed: 0_level_0,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,trip_duration_mean,first_city_id_last_trip,first_hotel_country_last_trip,user_features
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000027,4,1,2.0,Elbonia,desktop,8,15626,7168,8.0,8183.0,Gondal,"[-0.043323297053575516, -0.25124290585517883, ..."
1000033,5,1,2.0,Gondal,desktop,4,38677,384,10.0,38677.0,Cobra Island,"[-0.814620852470398, -0.7487749457359314, 0.73..."
1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359,11.0,64876.0,Fook Island,"[0.912726640701294, -0.22855083644390106, 0.88..."
100008,5,1,1.8,Gondal,desktop,7,11306,8436,9.0,11306.0,Kamistan,"[1.1990561485290527, 0.3393579423427582, 1.638..."
1000083,4,1,1.25,The Devilfire Empire,mobile,6,14705,359,5.0,55990.0,Osterlich,"[0.7267171144485474, 1.0654934644699097, 0.638..."


In [51]:
df.reset_index().to_csv('../output/booking/dataset/all_user_features.csv')

In [52]:
from numpy import asarray
from numpy import savetxt

# save to csv file
savetxt('data.csv', np_user_features, delimiter='\t')

In [53]:
df.reset_index().to_csv('metadata.csv', sep='\t')