# Imports

In [79]:
import datetime, calendar
import pandas as pd
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer  

# Data loading

In [80]:
X_train = pd.read_csv("train.csv", sep=";")    
X_test = pd.read_csv("test.csv", sep=";") 

# Data preprocessing

### Remove ID

In [81]:
X_train = X_train.drop('ID',  axis='columns')
X_test = X_test.drop('ID',  axis='columns')

### Remove rows with empty fields

In [82]:
X_train.dropna(inplace=True)
X_test.dropna(inplace=True)

### Transform dates

In [83]:
month_names = {month.lower(): index for index, month in enumerate(calendar.month_name) if month}
def replaceMonthName(date):
    index = date.find("/")
    month_name = f"{date[index+1:]}".lower()
    return date[:index] + f"{month_names[month_name]}".zfill(2)

def processDates(data):
    data['DEPARTURE'] = data['DEPARTURE'].str.replace("-","/")
    data['ARRIVAL'] = data['ARRIVAL'].str.replace("-","/")
    data['TIMESTAMP'] = data['TIMESTAMP'].apply(lambda x: f"{replaceMonthName(x)}")
    data['DEPARTURE'] = data['DEPARTURE'].apply(lambda x: f"{replaceMonthName(x)}")
    data['ARRIVAL'] = data['ARRIVAL'].apply(lambda x: f"{replaceMonthName(x)}")
    
processDates(X_train)
processDates(X_test)

### Convert distance to float

In [84]:
X_train['DISTANCE'] = X_train['DISTANCE'].apply(lambda x: x.replace(',','.'))
X_test['DISTANCE'] = X_test['DISTANCE'].apply(lambda x: x.replace(',','.'))
X_train['DISTANCE'] = pd.to_numeric(X_train['DISTANCE'], errors='coerce')

### Encode categorical columns

In [85]:
def hot_encode_categorical_inputs(X_train, X_test, columns):
    oe_style = OneHotEncoder(handle_unknown = 'ignore')   
    
    for col in columns:        
        X_train_enc = oe_style.fit_transform(X_train[[col]])
        X_train = X_train.join(pd.DataFrame(X_train_enc.toarray(), columns=oe_style.categories_))
        X_test_enc = oe_style.transform(X_test[[col]])   
        X_test = X_test.join(pd.DataFrame(X_test_enc.toarray(), columns=oe_style.categories_))
        X_train = X_train.drop([col], axis=1)
        X_test = X_test.drop([col], axis=1)   
    
    return X_train, X_test

def label_encode_categorical_inputs(X_train, X_test, columns):
    le = LabelEncoder()
    for col in columns:   
        X_train_enc = le.fit_transform(X_train[[col]])
        X_train = X_train.drop([col], axis=1)
        X_train = X_train.join(pd.DataFrame(X_train_enc, columns=[col]))
        
        X_test_enc = le.fit_transform(X_test[[col]])
        X_test = X_test.drop([col], axis=1)
        X_test = X_test.join(pd.DataFrame(X_test_enc, columns=[col]))  
        
    return X_train, X_test

hot_encode_cols = ['WEBSITE','DEVICE','HAUL_TYPE','TRIP_TYPE', 'PRODUCT']
label_encode_cols = ['TRAIN', 'SMS']

[X_train, X_test] = hot_encode_categorical_inputs(X_train, X_test, hot_encode_cols)
[X_train, X_test] = label_encode_categorical_inputs(X_train, X_test, label_encode_cols)

  y = column_or_1d(y, warn=True)


In [86]:
X_train.dropna(inplace=True)
X_test.dropna(inplace=True)

### Normalize data

In [None]:
#normalizer = Normalizer()
#X_train['DISTANCE']= pd.DataFrame(normalizer.fit_transform(X_train[['DISTANCE']])) 
#X_test['DISTANCE']= pd.DataFrame(normalizer.transform(X_test[['DISTANCE']]))



### Prepare targets

In [87]:
def prepare_targets(y_train):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)    
    y_train_enc = pd.DataFrame(y_train_enc)
    return y_train_enc

Y_train = X_train['EXTRA_BAGGAGE']
X_train = X_train.drop('EXTRA_BAGGAGE',  axis='columns')

Y_train = prepare_targets(Y_train)
print(X_train.head(100))

   TIMESTAMP  GDS DEPARTURE ARRIVAL  ADULTS  CHILDREN  INFANTS  DISTANCE  \
0       0107    1      2207    2507       1         0        0   628.844   
1       0107    0      2907    2907       1         0        0  1281.430   
2       0107    2      2907    1908       1         0        0  1730.350   
3       0107    0      2407    0408       1         0        0   652.702   
4       0107    0      1108    1108       1         0        0  1717.850   
..       ...  ...       ...     ...     ...       ...      ...       ...   
95      0107    0      1007    1107       1         0        0  1246.350   
96      0107    0      1309    2309       2         0        0  1342.250   
97      0107    1      0407    2207       1         0        0  1753.880   
98      0107    1      0908    3008       1         1        0  2611.920   
99      0107    1      3107    0608       2         0        0   671.543   

    NO_GDS  (EDAE,)  ...  (CONTINENTAL,)  (DOMESTIC,)  (INTERCONTINENTAL,)  \
0        

### Save data

In [88]:
def saveData(filename, data):
    pickle_out = open(filename, "wb")
    pickle.dump(data, pickle_out)
    pickle_out.close() 
    
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.values

saveData("X_train.pickle", X_train)
saveData("Y_train.pickle", Y_train)
saveData("X_test.pickle", X_test)


## Neural network training

In [None]:
#Dependencies
import time
import tensorflow as tf
import pickle
import numpy as np
import sys
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping


pickle_in = open("X_train.pickle", "rb")
X_train = pickle.load(pickle_in)

pickle_in = open("Y_train.pickle", "rb")
Y_train = pickle.load(pickle_in)


early_stop = EarlyStopping(monitor='val_accuracy', mode='min', patience=10)
NAME = "NN-{}".format(int(time.time()))
tensorBoard = TensorBoard(log_dir='logs/{}'.format(NAME))

model = Sequential()
model.add(Dense(100, input_dim=83, activation='relu')) 
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, Y_train, batch_size=512, epochs=200,validation_split=0.1, callbacks=[tensorBoard,early_stop], verbose=1,shuffle=True)
model.save('model')


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200


In [1]:
import pkg_resources

for entry_point in pkg_resources.iter_entry_points('tensorboard_plugins'):
    print(entry_point.dist)

-ensorboard 2.5.0
tensorboard-plugin-wit 1.8.0
tensorboard 2.5.0


In [90]:
from keras.models import load_model
pickle_in = open("X_test.pickle", "rb")
X_test = pickle.load(pickle_in)

p = model.predict(X_test)
print(p[p>0.5].size)

0
