In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Activation
import keras.backend as K

In [2]:
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

In [3]:
import tensorflow as tf

In [4]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  2


2022-12-30 20:50:38.143394: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-30 20:50:38.144374: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-30 20:50:38.443421: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-30 20:50:38.444425: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-30 20:50:38.445323: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

In [5]:
def get_user_history(df):
    gr = df.groupby(['order_requests', 'avatar_id'])

    new_df = pd.DataFrame()
    for _, v in gr:
        new_df = new_df.append(v.head(1)[['order_requests', 'avatar_id']])
    new_df['user_history'] = new_df.groupby('avatar_id').cumcount()

    df = pd.merge(df, new_df, how='inner', on=['order_requests', 'avatar_id'])

    return df

ysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [6]:
def map_hotel_group(group):
    groups = {'Boss Western': 'Boss_Western_Group', 'Accar Hotels': 'Accar_Hotels', 'Independant': 'Independant_Group',
              'Yin Yang': 'Yin_Yang', 'Chillton Worldwide': 'Chillton_Worldwide',
              'Morriott International': 'Morriott_International'}

    return groups[group]

def map_hotel_brand(brand):
    brands = {'J.Halliday Inn': 'J_Halliday_Inn', 'Marcure': 'Marcure', 'Independant': 'Independant_Brand',
              'Ibas': 'Ibas', 'Safitel': 'Safitel', '8 Premium': '8_Premium', 'Tripletree': 'Tripletree',
              'CourtYord': 'CourtYord', 'Royal Lotus': 'Royal_Lotus', 'Boss Western': 'Boss_Western_Brand',
              'Corlton': 'Corlton', 'Navatel': 'Navatel', 'Ardisson': 'Ardisson', 'Morriot': 'Morriot',
              'Chill Garden Inn': 'Chill_Garden_Inn', 'Quadrupletree': 'Quadrupletree'}

    return brands[brand]

def load_full_feature_set():
    # load data
    queries = pd.read_csv('/kaggle/input/defi-ia/all_queries.csv')
    prices = pd.read_csv('/kaggle/input/defi-ia/all_prices.csv')
    hotels = pd.read_csv('/kaggle/input/defi-ia/features_hotels.csv')
    test = pd.read_csv('/kaggle/input/defi-ia/test_set.csv')

    # drop query duplicates
    # queries = queries.drop_duplicates(subset=['language', 'city', 'date', 'mobile'])
    queries = queries.rename(columns={'queryId': 'order_requests'})
    prices = prices.rename(columns={'queryId': 'order_requests'})
    queries = get_user_history(queries)
    queries = queries.loc[queries['user_history'] <= 3]

    ### X_TRAIN ###
    # merge queries, prices and hotel_features
    X_train = pd.merge(queries, prices, how='inner', on='order_requests')
    X_train = pd.merge(X_train, hotels, how='inner', on='hotel_id')
    X_train = X_train.drop(columns='city_y')
    X_train = X_train.rename(columns={'city_x': 'city'})

    # brand and group correction
    X_train['brand'] = X_train.apply(lambda x: map_hotel_brand(x['brand']), axis=1)
    X_train['group'] = X_train.apply(lambda x: map_hotel_group(x['group']), axis=1)
    
    # encode as categorical
    categories = ['city', 'language', 'mobile', 'group', 'brand', 'parking', 'pool', 'children_policy']

    # X_train = X_train.drop(columns=['order_requests', 'avatar_id', 'avatar_name'])
    X_train = X_train.drop(columns=['avatar_name'])
    
    # feature ordering to match test set
    X_train = X_train[['order_requests', 'avatar_id', 'city', 'language', 'date', 'mobile',
                       'user_history',
                       'stock', 'group', 'brand', 'parking', 'pool', 'hotel_id',
                       'children_policy', 'price']]
    ### X_TRAIN ###
    
    
    ### X_TEST ###
    # merge test_set with hotel_features
    test = get_user_history(test)
    X_test = pd.merge(test, hotels, how='inner', on='hotel_id')
    X_test = X_test.drop(columns='city_y')
    X_test = X_test.rename(columns={'city_x': 'city'})

    # brand and group correction
    X_test['brand'] = X_test.apply(lambda x: map_hotel_brand(x['brand']), axis=1)
    X_test['group'] = X_test.apply(lambda x: map_hotel_group(x['group']), axis=1)

    # X_test = X_test.drop(columns=['order_requests', 'avatar_id'])

    X_test = X_test[['index', 'order_requests', 'avatar_id', 'city', 'language', 'date', 'mobile',
                     'user_history',
                     'stock', 'group', 'brand', 'parking', 'pool', 'hotel_id',
                     'children_policy']]
    ### X_TEST ###
    
    return X_train, X_test

In [7]:
X_train, X_test = load_full_feature_set()

y_train = X_train.pop('price')
test_idxs = X_test.pop('index')

######################################################################################

X_train[['date', 'stock']] = MinMaxScaler().fit_transform(X_train[['date', 'stock']])
X_test[['date', 'stock']] = MinMaxScaler().fit_transform(X_test[['date', 'stock']])

######################################################################################

X_train = X_train.set_index(['order_requests', 'avatar_id', 'hotel_id'])

poly = PolynomialFeatures(3, include_bias=False)
poly_features = poly.fit_transform(X_train[['date', 'stock']])
for i in range(poly_features.shape[1]):
    col_name = 'poly_' + str(i)
    X_train[col_name] = poly_features[:, i]
X_train.pop('mobile')

######################################################################################

X_test = X_test.set_index(['order_requests', 'avatar_id', 'hotel_id'])

poly = PolynomialFeatures(3, include_bias=False)
poly_features = poly.fit_transform(X_test[['date', 'stock']])
for i in range(poly_features.shape[1]):
    col_name = 'poly_' + str(i)
    X_test[col_name] = poly_features[:, i]
X_test.pop('mobile')

order_requests  avatar_id  hotel_id
1               1          161         0
2               1          161         0
30              12         161         1
31              12         161         1
32              12         161         0
                                      ..
662             771        856         1
693             514        856         1
753             768        856         0
811             766        856         1
823             640        856         1
Name: mobile, Length: 6644, dtype: int64

In [8]:
categories = ['user_history', 'city', 'language', 'group', 'brand', 'parking', 'pool', 'children_policy']

X_train = pd.get_dummies(X_train, columns=categories)
X_test = pd.get_dummies(X_test, columns=categories)

In [9]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date,stock,poly_0,poly_1,poly_2,poly_3,poly_4,poly_5,poly_6,poly_7,...,brand_Royal_Lotus,brand_Safitel,brand_Tripletree,parking_0,parking_1,pool_0,pool_1,children_policy_0,children_policy_1,children_policy_2
order_requests,avatar_id,hotel_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,41279,929,0.022727,0.003521,0.022727,0.003521,0.000517,0.000080,0.000012,0.000012,0.000002,2.817803e-07,...,0,1,0,0,1,1,0,1,0,0
6,41286,929,0.818182,0.584507,0.818182,0.584507,0.669421,0.478233,0.341648,0.547708,0.391282,2.795306e-01,...,0,1,0,0,1,1,0,1,0,0
50,55898,929,1.000000,0.985915,1.000000,0.985915,1.000000,0.985915,0.972029,1.000000,0.985915,9.720294e-01,...,0,1,0,0,1,1,0,1,0,0
54,55901,929,0.545455,0.179577,0.545455,0.179577,0.297521,0.097951,0.032248,0.162284,0.053428,1.758985e-02,...,0,1,0,0,1,1,0,1,0,0
55,55901,929,0.045455,0.007042,0.045455,0.007042,0.002066,0.000320,0.000050,0.000094,0.000015,2.254242e-06,...,0,1,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4500,203141,71,0.909091,0.750000,0.909091,0.750000,0.826446,0.681818,0.562500,0.751315,0.619835,5.113636e-01,...,0,1,0,0,1,0,1,0,1,0
4508,203149,71,0.909091,0.750000,0.909091,0.750000,0.826446,0.681818,0.562500,0.751315,0.619835,5.113636e-01,...,0,1,0,0,1,0,1,0,1,0
4090,202657,71,0.886364,0.711268,0.886364,0.711268,0.785640,0.630442,0.505902,0.696363,0.558801,4.484128e-01,...,0,1,0,0,1,0,1,0,1,0
4157,202735,71,0.886364,0.711268,0.886364,0.711268,0.785640,0.630442,0.505902,0.696363,0.558801,4.484128e-01,...,0,1,0,0,1,0,1,0,1,0


In [10]:
X_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date,stock,poly_0,poly_1,poly_2,poly_3,poly_4,poly_5,poly_6,poly_7,...,brand_Royal_Lotus,brand_Safitel,brand_Tripletree,parking_0,parking_1,pool_0,pool_1,children_policy_0,children_policy_1,children_policy_2
order_requests,avatar_id,hotel_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,1,161,0.525,0.231156,0.525,0.231156,0.275625,0.121357,0.053433,0.144703,0.063712,0.028052,...,0,0,0,0,1,1,0,1,0,0
2,1,161,0.450,0.180905,0.450,0.180905,0.202500,0.081407,0.032726,0.091125,0.036633,0.014727,...,0,0,0,0,1,1,0,1,0,0
30,12,161,0.500,0.221106,0.500,0.221106,0.250000,0.110553,0.048888,0.125000,0.055276,0.024444,...,0,0,0,0,1,1,0,1,0,0
31,12,161,0.450,0.180905,0.450,0.180905,0.202500,0.081407,0.032726,0.091125,0.036633,0.014727,...,0,0,0,0,1,1,0,1,0,0
32,12,161,0.400,0.150754,0.400,0.150754,0.160000,0.060302,0.022727,0.064000,0.024121,0.009091,...,0,0,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,771,856,0.975,0.195980,0.975,0.195980,0.950625,0.191080,0.038408,0.926859,0.186303,0.037448,...,0,0,0,0,1,1,0,1,0,0
693,514,856,0.850,0.130653,0.850,0.130653,0.722500,0.111055,0.017070,0.614125,0.094397,0.014510,...,0,0,0,0,1,1,0,1,0,0
753,768,856,0.900,0.155779,0.900,0.155779,0.810000,0.140201,0.024267,0.729000,0.126181,0.021840,...,0,0,0,0,1,1,0,1,0,0
811,766,856,0.850,0.130653,0.850,0.130653,0.722500,0.111055,0.017070,0.614125,0.094397,0.014510,...,0,0,0,0,1,1,0,1,0,0


In [11]:
print(f'X_train columns == X_test columns: {np.all(X_train.columns == X_test.columns)}')
print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of features: {X_train.shape[1]}')

X_train columns == X_test columns: True
Number of training samples: 578853
Number of features: 80


In [12]:
model = Sequential()
model.add(Dense(units=64, input_dim=X_train.shape[1],activation='relu'))
model.add(Dense(units=64, input_dim=X_train.shape[1],activation='relu'))
model.add(Dense(units=64, input_dim=X_train.shape[1],activation='relu'))
model.add(Dense(1))

model.compile(loss=tf.keras.metrics.mean_absolute_error,
              metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')],
              optimizer='adam')

2022-12-30 20:51:00.101017: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-30 20:51:00.347390: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-30 20:51:00.348321: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-30 20:51:00.349148: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

In [13]:
model.fit(X_train, y_train, epochs=100, batch_size=64, verbose=2)

2022-12-30 20:51:07.375244: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100
9045/9045 - 22s - loss: 15.5658 - rmse: 27.4096
Epoch 2/100
9045/9045 - 19s - loss: 12.1932 - rmse: 19.4422
Epoch 3/100
9045/9045 - 21s - loss: 11.6320 - rmse: 18.8705
Epoch 4/100
9045/9045 - 20s - loss: 11.3359 - rmse: 18.5292
Epoch 5/100
9045/9045 - 19s - loss: 11.1186 - rmse: 18.2899
Epoch 6/100
9045/9045 - 19s - loss: 10.9345 - rmse: 18.1115
Epoch 7/100
9045/9045 - 19s - loss: 10.7944 - rmse: 17.9319
Epoch 8/100
9045/9045 - 19s - loss: 10.6397 - rmse: 17.7503
Epoch 9/100
9045/9045 - 19s - loss: 10.4985 - rmse: 17.6165
Epoch 10/100
9045/9045 - 18s - loss: 10.3630 - rmse: 17.4100
Epoch 11/100
9045/9045 - 19s - loss: 10.2555 - rmse: 17.2818
Epoch 12/100
9045/9045 - 19s - loss: 10.1168 - rmse: 17.1062
Epoch 13/100
9045/9045 - 19s - loss: 9.9820 - rmse: 16.8977
Epoch 14/100
9045/9045 - 19s - loss: 9.8843 - rmse: 16.7181
Epoch 15/100
9045/9045 - 19s - loss: 9.7772 - rmse: 16.5456
Epoch 16/100
9045/9045 - 19s - loss: 9.6793 - rmse: 16.4124
Epoch 17/100
9045/9045 - 19s - loss: 

<keras.callbacks.History at 0x7f9fb0c61b10>

In [14]:
predictions = model.predict(X_test)
predictions = predictions.squeeze()

In [15]:
print(predictions)

[106.615265 107.7079   106.49609  ...  66.97584   70.31348   74.9205  ]


In [16]:
submission = pd.DataFrame(data={'index': test_idxs, 'price': predictions})
submission = submission.sort_values(by=['index'])

filename = './mlp_reg_submission_30_12_2022.csv'
submission.to_csv(filename, index=False)