# User Encoder - VAE

In [1]:
import numpy as np
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
from scipy import sparse
%matplotlib inline

sns.set_theme(style="ticks")

## Dataset

In [2]:
df_all = pd.read_csv('../data/Booking/booking_train_set.csv', 
                 dtype={"user_id": str, "city_id": str, 'affiliate_id': str,
                       'utrip_id': str},parse_dates=['checkin', 'checkout'])

df_all.head()

Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
0,0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1
1,1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1
2,2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1
3,3,1006220,2016-04-16,2016-04-17,24144,desktop,384,Gondal,Gondal,1006220_1
4,4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1


## Geral User Features

In [3]:
# sample_days = 500
# test_days   = 30

# # Split Data
# max_timestamp        = df_all.checkout.max()
# init_train_timestamp = max_timestamp - timedelta(days = sample_days)
# init_test_timestamp  = max_timestamp - timedelta(days = test_days)

# # TODO Garantir que o usuário fique com a sessão no train ou test
# df_train = df_all[(df_all.checkout >= init_train_timestamp) & (df_all.checkout < init_test_timestamp)]
from sklearn.model_selection import train_test_split

df_trip = df_all[['utrip_id']].drop_duplicates()
df_train, df_test = train_test_split(df_trip, test_size=0.1, random_state=42)
df_train, df_test = df_all[df_all['utrip_id'].isin(df_train['utrip_id'])], \
                    df_all[df_all['utrip_id'].isin(df_test['utrip_id'])]
print(df_train.shape, df_test.shape)

df_train['visit'] = 1
df_train.head()

(1049635, 10) (117200, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,visit
0,0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1,1
1,1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1,1
2,2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1,1
3,3,1006220,2016-04-16,2016-04-17,24144,desktop,384,Gondal,Gondal,1006220_1,1
4,4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,1


In [4]:
df_user = df_train

df_user['start_trip_month'] = df_user['checkin'].dt.month
df_user['duration']  = (df_user['checkout'] - df_user['checkin']).dt.days

df_user.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,visit,start_trip_month,duration
0,0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1,1,4,2
1,1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1,1,4,1
2,2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1,1,4,4
3,3,1006220,2016-04-16,2016-04-17,24144,desktop,384,Gondal,Gondal,1006220_1,1,4,1
4,4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,1,7,1


In [5]:
import scipy
def mode(x):
    return scipy.stats.mode(x)[0]

df_user_geral = df_user.groupby(['user_id']).agg(
    checkins_count=('checkin', 'count'),
    utrip_id_count=('utrip_id', pd.Series.nunique),
    duration_mean=('duration', 'mean'),
    mode_booker_country=('booker_country', mode),
    mode_device_class=('device_class', mode),
    mode_trip_month=('start_trip_month', mode),
    mode_city_id=('city_id', mode),
    mode_affiliate_id=('affiliate_id', mode),
)

df_user_geral.sort_index()

Unnamed: 0_level_0,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000027,4,1,2.000000,Elbonia,desktop,8,15626,7168
1000033,5,1,2.000000,Gondal,desktop,4,38677,384
1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359
100008,5,1,1.800000,Gondal,desktop,7,11306,8436
1000083,4,1,1.250000,The Devilfire Empire,mobile,6,14705,359
...,...,...,...,...,...,...,...,...
999735,5,1,1.800000,The Devilfire Empire,mobile,1,36063,1020
999755,4,1,2.500000,The Devilfire Empire,desktop,12,1034,7974
999776,4,1,1.000000,Gondal,desktop,3,17775,4541
999842,4,1,1.000000,Gondal,desktop,5,24036,3894


In [6]:
df_user_geral = df_user_geral.reset_index()
df_user_geral['user_id']  = df_user_geral['user_id'].astype('int')

## Trip  User Features

In [7]:
df_trip = pd.read_csv('../output/booking/dataset/train_500_30_5.csv')
df_trip.head()

Unnamed: 0,utrip_id,user_id,user_features,count_unique_city,trip_size,start_trip,end_trip,checkin_list,checkout_list,days_since_2016_list,...,affiliate_id_list,booker_country_list,hotel_country_list,step_list,first_city_id,first_hotel_country,last_city_id,last_hotel_country,country_count,duration_sum
0,1000027_1,1000027,"[-0.2721548080444336, -0.3261375427246094, -0....",4,3,2016-08-13,2016-08-18,"['0', '0', '2016-08-13', '2016-08-14', '2016-0...","['0', '0', '2016-08-14', '2016-08-16', '2016-0...","[0, 0, 225, 226, 228]",...,"['0', '0', '7168', '7168', '7168']","['0', '0', 'Elbonia', 'Elbonia', 'Elbonia']","['0', '0', 'Gondal', 'Gondal', 'Gondal']","[0, 0, 1, 2, 3]",8183,Gondal,30628,Gondal,88076,5
1,1000033_1,1000033,"[-0.653695285320282, 0.9078158736228943, 0.579...",4,4,2016-04-09,2016-04-16,"['0', '2016-04-09', '2016-04-11', '2016-04-12'...","['0', '2016-04-11', '2016-04-12', '2016-04-14'...","[0, 99, 101, 102, 104]",...,"['0', '359', '384', '384', '384']","['0', 'Gondal', 'Gondal', 'Gondal', 'Gondal']","['0', 'Cobra Island', 'Cobra Island', 'Cobra I...","[0, 1, 2, 3, 4]",38677,Cobra Island,38677,Cobra Island,96654,7
2,1000045_1,1000045,"[-1.1103595495224, -1.2900782823562622, -0.307...",7,6,2016-06-18,2016-06-28,"['2016-06-20', '2016-06-22', '2016-06-24', '20...","['2016-06-22', '2016-06-24', '2016-06-25', '20...","[171, 173, 175, 176, 177]",...,"['7974', '7974', '7974', '359', '359']","['The Devilfire Empire', 'The Devilfire Empire...","['Fook Island', 'Fook Island', 'Carpathia', 'C...","[2, 3, 4, 5, 6]",64876,Fook Island,36063,Gondal,16624,8
3,1000083_1,1000083,"[1.3209058046340942, 0.19926407933235168, 0.57...",4,3,2016-06-13,2016-06-16,"['0', '0', '2016-06-13', '2016-06-14', '2016-0...","['0', '0', '2016-06-14', '2016-06-15', '2016-0...","[0, 0, 164, 165, 166]",...,"['0', '0', '359', '359', '359']","['0', '0', 'The Devilfire Empire', 'The Devilf...","['0', '0', 'Osterlich', 'Osterlich', 'Osterlich']","[0, 0, 1, 2, 3]",55990,Osterlich,36063,Gondal,13913,3
4,100008_1,100008,"[-0.346758633852005, 0.11678697913885117, -1.0...",5,4,2016-07-18,2016-07-25,"['0', '2016-07-18', '2016-07-21', '2016-07-22'...","['0', '2016-07-21', '2016-07-22', '2016-07-23'...","[0, 199, 202, 203, 204]",...,"['0', '8436', '8436', '8436', '8436']","['0', 'Gondal', 'Gondal', 'Gondal', 'Gondal']","['0', 'Kamistan', 'Kamistan', 'Kamistan', 'Kam...","[0, 1, 2, 3, 4]",11306,Kamistan,65690,Kamistan,6480,7


In [8]:
df_trip.iloc[0]

utrip_id                                                        1000027_1
user_id                                                           1000027
user_features           [-0.2721548080444336, -0.3261375427246094, -0....
count_unique_city                                                       4
trip_size                                                               3
start_trip                                                     2016-08-13
end_trip                                                       2016-08-18
checkin_list            ['0', '0', '2016-08-13', '2016-08-14', '2016-0...
checkout_list           ['0', '0', '2016-08-14', '2016-08-16', '2016-0...
days_since_2016_list                                [0, 0, 225, 226, 228]
duration_list                                             [0, 0, 1, 2, 2]
city_id_list                         ['0', '0', '8183', '15626', '60902']
device_class_list             ['0', '0', 'desktop', 'desktop', 'desktop']
affiliate_id_list                     

In [9]:
df_user_trip = df_trip.groupby(['user_id']).agg(
    trip_duration_mean=('duration_sum', 'mean'),
    first_city_id_last_trip=('first_city_id', 'last'),
    first_hotel_country_last_trip=('first_hotel_country', 'last'),
    
)
df_user_trip = df_user_trip.reset_index()
df_user_trip['user_id'] = df_user_trip['user_id'].astype('int')

In [10]:
df_user_all = df_user_geral.merge(df_user_trip, how="left").fillna(0)
df_user_all['first_city_id_last_trip'] = df_user_all['first_city_id_last_trip'].astype('str')
df_user_all['first_hotel_country_last_trip'] = df_user_all['first_hotel_country_last_trip'].astype('str')
df_user_all

Unnamed: 0,user_id,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,trip_duration_mean,first_city_id_last_trip,first_hotel_country_last_trip
0,1000027,4,1,2.000000,Elbonia,desktop,8,15626,7168,3.500000,8183.0,Gondal
1,1000033,5,1,2.000000,Gondal,desktop,4,38677,384,4.800000,38677.0,Cobra Island
2,1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359,5.833333,64876.0,Fook Island
3,100008,5,1,1.800000,Gondal,desktop,7,11306,8436,5.200000,11306.0,Kamistan
4,1000083,4,1,1.250000,The Devilfire Empire,mobile,6,14705,359,2.250000,55990.0,Osterlich
...,...,...,...,...,...,...,...,...,...,...,...,...
181475,999735,5,1,1.800000,The Devilfire Empire,mobile,1,36063,1020,3.800000,36063.0,Gondal
181476,999755,4,1,2.500000,The Devilfire Empire,desktop,12,1034,7974,5.250000,4476.0,Gondal
181477,999776,4,1,1.000000,Gondal,desktop,3,17775,4541,2.250000,17775.0,Novistrana
181478,999842,4,1,1.000000,Gondal,desktop,5,24036,3894,2.250000,51291.0,Glubbdubdrib


In [11]:
df_user_all.describe()

Unnamed: 0,user_id,checkins_count,utrip_id_count,duration_mean,mode_trip_month,trip_duration_mean
count,181480.0,181480.0,181480.0,181480.0,181480.0,181480.0
mean,3127523.0,5.78375,1.079551,1.797755,6.975711,4.069519
std,1807416.0,2.925228,0.34388,0.80629,2.805365,2.521657
min,29.0,1.0,1.0,1.0,1.0,0.0
25%,1559288.0,4.0,1.0,1.25,5.0,2.75
50%,3125121.0,5.0,1.0,1.6,7.0,3.888889
75%,4694615.0,6.0,1.0,2.2,9.0,5.5
max,6258087.0,156.0,18.0,30.0,12.0,18.333333


In [12]:
df_user_all.to_csv('../output/booking/dataset/user_features_raw.csv')

## Transform Dataset

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [14]:
df = df_user_all.set_index('user_id')
df.head()

Unnamed: 0_level_0,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,trip_duration_mean,first_city_id_last_trip,first_hotel_country_last_trip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000027,4,1,2.0,Elbonia,desktop,8,15626,7168,3.5,8183.0,Gondal
1000033,5,1,2.0,Gondal,desktop,4,38677,384,4.8,38677.0,Cobra Island
1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359,5.833333,64876.0,Fook Island
100008,5,1,1.8,Gondal,desktop,7,11306,8436,5.2,11306.0,Kamistan
1000083,4,1,1.25,The Devilfire Empire,mobile,6,14705,359,2.25,55990.0,Osterlich


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181480 entries, 1000027 to 999944
Data columns (total 11 columns):
checkins_count                   181480 non-null int64
utrip_id_count                   181480 non-null int64
duration_mean                    181480 non-null float64
mode_booker_country              181480 non-null object
mode_device_class                181480 non-null object
mode_trip_month                  181480 non-null int64
mode_city_id                     181480 non-null object
mode_affiliate_id                181480 non-null object
trip_duration_mean               181480 non-null float64
first_city_id_last_trip          181480 non-null object
first_hotel_country_last_trip    181480 non-null object
dtypes: float64(2), int64(3), object(6)
memory usage: 16.6+ MB


In [16]:
df.select_dtypes(include=['int64', 'float64']).columns

Index(['checkins_count', 'utrip_id_count', 'duration_mean', 'mode_trip_month',
       'trip_duration_mean'],
      dtype='object')

In [17]:
# determine categorical and numerical features
numerical_ix   = df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = df.select_dtypes(include=['object', 'bool']).columns

In [18]:
df[categorical_ix]

Unnamed: 0_level_0,mode_booker_country,mode_device_class,mode_city_id,mode_affiliate_id,first_city_id_last_trip,first_hotel_country_last_trip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000027,Elbonia,desktop,15626,7168,8183.0,Gondal
1000033,Gondal,desktop,38677,384,38677.0,Cobra Island
1000045,The Devilfire Empire,desktop,31817,359,64876.0,Fook Island
100008,Gondal,desktop,11306,8436,11306.0,Kamistan
1000083,The Devilfire Empire,mobile,14705,359,55990.0,Osterlich
...,...,...,...,...,...,...
999735,The Devilfire Empire,mobile,36063,1020,36063.0,Gondal
999755,The Devilfire Empire,desktop,1034,7974,4476.0,Gondal
999776,Gondal,desktop,17775,4541,17775.0,Novistrana
999842,Gondal,desktop,24036,3894,51291.0,Glubbdubdrib


In [19]:
# define the data preparation for the columns
t = [
     ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix), 
     ('num', StandardScaler(), numerical_ix)
    ]

col_transform = ColumnTransformer(transformers=t)
df_transform  = col_transform.fit_transform(df)
df_transform

<181480x33061 sparse matrix of type '<class 'numpy.float64'>'
	with 1996280 stored elements in Compressed Sparse Row format>

In [20]:
from scipy import sparse

sparse.save_npz("../output/booking/dataset/train_user_features.npz", df_transform)
df.reset_index().to_csv('../output/booking/dataset/train_user_features.csv', index=False)

In [None]:
df

In [36]:
df_transform_all  = col_transform.transform(df)
df_transform_all

<200153x35277 sparse matrix of type '<class 'numpy.float64'>'
	with 2201601 stored elements in Compressed Sparse Row format>

In [47]:
from scipy import sparse
sparse.save_npz("../output/booking/dataset/all_user_features.npz", df_transform_all)
df.reset_index().to_csv('../output/booking/dataset/all_user_features.csv', index=False)

### Load and Save

In [48]:
df.head()

Unnamed: 0_level_0,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,trip_duration_mean,first_city_id_last_trip,first_hotel_country_last_trip,user_features
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000027,4,1,2.0,Elbonia,desktop,8,15626,7168,3.5,8183.0,Gondal,"[-0.06505580991506577, 0.6599699854850769, 0.5..."
1000033,5,1,2.0,Gondal,desktop,4,38677,384,4.8,38677.0,Cobra Island,"[-0.41572630405426025, -0.05840621143579483, -..."
1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359,5.833333,64876.0,Fook Island,"[-0.3443673849105835, -0.9910609722137451, -1...."
100008,5,1,1.8,Gondal,desktop,7,11306,8436,5.2,11306.0,Kamistan,"[0.13192051649093628, -1.070617437362671, -0.2..."
1000083,4,1,1.25,The Devilfire Empire,mobile,6,14705,359,2.25,55990.0,Osterlich,"[0.38146448135375977, -1.216643214225769, 1.80..."


In [51]:
np_user_features = np.load('../output/booking/dataset/all_user_features.npy')
df.shape, np_user_features.shape

((200153, 12), (200153, 10))

In [52]:
df['user_features'] = np_user_features.tolist()

In [53]:
df.head()

Unnamed: 0_level_0,checkins_count,utrip_id_count,duration_mean,mode_booker_country,mode_device_class,mode_trip_month,mode_city_id,mode_affiliate_id,trip_duration_mean,first_city_id_last_trip,first_hotel_country_last_trip,user_features
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000027,4,1,2.0,Elbonia,desktop,8,15626,7168,3.5,8183.0,Gondal,"[-0.2721548080444336, -0.3261375427246094, -0...."
1000033,5,1,2.0,Gondal,desktop,4,38677,384,4.8,38677.0,Cobra Island,"[-0.653695285320282, 0.9078158736228943, 0.579..."
1000045,7,1,1.571429,The Devilfire Empire,desktop,6,31817,359,5.833333,64876.0,Fook Island,"[-1.1103595495224, -1.2900782823562622, -0.307..."
100008,5,1,1.8,Gondal,desktop,7,11306,8436,5.2,11306.0,Kamistan,"[-0.346758633852005, 0.11678697913885117, -1.0..."
1000083,4,1,1.25,The Devilfire Empire,mobile,6,14705,359,2.25,55990.0,Osterlich,"[1.3209058046340942, 0.19926407933235168, 0.57..."


In [54]:
df.reset_index().to_csv('../output/booking/dataset/all_user_features.csv')

In [55]:
from numpy import asarray
from numpy import savetxt

# save to csv file
savetxt('data.csv', np_user_features, delimiter='\t')

In [56]:
df.reset_index().to_csv('metadata.csv', sep='\t')