# Imports

In [None]:
%pip install tensorflow-data-validation

In [None]:
%pip install -q tensorflow_data_validation[visualization]

In [None]:
%pip install tfx

In [None]:
import calendar
import os
import pickle
import sys

import numpy as np
import pandas as pd
import tensorflow_data_validation as tfdv
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
np.set_printoptions(threshold=sys.maxsize)
print('TFDV version: {}'.format(tfdv.version.__version__))

# Data loading

In [None]:
# Folder that holds the CSV splits.
DATA = './data'
# Build both split paths in one pass; os.path.join keeps the separator portable.
TRAIN_DATA, TEST_DATA = (
    os.path.join(DATA, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Read both splits; a comma is already pandas' default separator.
train_df = pd.read_csv(TRAIN_DATA)
test_df = pd.read_csv(TEST_DATA)
# Render the two frames as rich tables, train first.
display(train_df, test_df)

In [None]:
%%capture
import tensorflow_data_validation as tfdv
print('TFDV version: {}'.format(tfdv.version.__version__))
train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
test_stats = tfdv.generate_statistics_from_csv(data_location=TEST_DATA)

In [None]:
# Training-split statistics on their own.
tfdv.visualize_statistics(lhs_statistics=train_stats)
# Side-by-side comparison: train on the left, test on the right.
tfdv.visualize_statistics(lhs_statistics=train_stats, rhs_statistics=test_stats)

In [41]:
# Derive an initial schema from the training statistics and render it.
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'ID',INT,required,,-
'TIMESTAMP',STRING,required,,'TIMESTAMP'
'WEBSITE',STRING,required,,'WEBSITE'
'GDS',INT,required,,-
'DEPARTURE',BYTES,required,,-
'ARRIVAL',BYTES,required,,-
'ADULTS',INT,required,,-
'CHILDREN',INT,required,,-
'INFANTS',INT,required,,-
'TRAIN',STRING,required,,'TRAIN'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'TIMESTAMP',"'01/July', '02/July'"
'WEBSITE',"'EDAE', 'EDAR', 'EDAU', 'EDBR', 'EDCA', 'EDCH', 'EDCL', 'EDCN', 'EDCO', 'EDDE', 'EDEG', 'EDES', 'EDFR', 'EDGB', 'EDGR', 'EDHK', 'EDID', 'EDIN', 'EDIT', 'EDJP', 'EDMA', 'EDMX', 'EDNL', 'EDNZ', 'EDPE', 'EDPH', 'EDPT', 'EDRU', 'EDSG', 'EDTH', 'EDTR', 'EDUK', 'EDUS', 'EDVE', 'EDZA', 'GODE', 'GOES', 'GOFR', 'GOGB', 'GOIT', 'GONL', 'GOPT', 'OPAT', 'OPAU', 'OPCH', 'OPDE', 'OPDEC', 'OPFR', 'OPFRC', 'OPGB', 'OPIT', 'OPNL', 'OPPL', 'OPPLC', 'OPUK', 'TLDK', 'TLDKC', 'TLFI', 'TLNO', 'TLSE'"
'TRAIN',"'False', 'True'"
'HAUL_TYPE',"'CONTINENTAL', 'DOMESTIC', 'INTERCONTINENTAL'"
'DEVICE',"'COMPUTER', 'MULTI_DESTINATION', 'ONE_WAY', 'OTHER', 'ROUND_TRIP', 'SMARTPHONE', 'TABLET'"
'TRIP_TYPE',"'DYNPACK', 'MULTI_DESTINATION', 'ONE_WAY', 'ROUND_TRIP', 'TRIP'"
'PRODUCT',"'DYNPACK', 'False', 'TRIP', 'True'"
'SMS',"'False', 'True'"
'EXTRA_BAGGAGE',"'0', '1', '2', '3', 'False', 'True'"


In [44]:
from tensorflow_metadata.proto.v0 import schema_pb2

# Narrow two inferred domains so that values outside them are reported as anomalies.
product_domain = schema_pb2.StringDomain(name='PRODUCT', value=['DYNPACK', 'TRIP'])
tfdv.set_domain(schema, 'PRODUCT', product_domain)
# NOTE(review): the BoolDomain is created without true_value/false_value, and the
# anomaly text below reads 'instead of {}' — confirm whether 'True'/'False'
# should be declared as the accepted string values.
baggage_domain = schema_pb2.BoolDomain()
tfdv.set_domain(schema, 'EXTRA_BAGGAGE', baggage_domain)

# Recompute the training statistics, this time taking feature types from the schema.
stats_options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
train_stats = tfdv.generate_statistics_from_csv(
    data_location=TRAIN_DATA, stats_options=stats_options
)

# Check the refreshed statistics against the curated schema and show the findings.
anomalies = tfdv.validate_statistics(train_stats, schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'EXTRA_BAGGAGE',Non-boolean values,"Saw unexpected value ""0"" instead of {}."
'PRODUCT',Unexpected string values,"Examples contain values missing from the schema: False (<1%), True (<1%)."


# Data preprocessing

### Remove ID, TIMESTAMP and TRAIN
##### TRAIN is always False, so it provides no information

In [None]:
DROP_COLS = ['ID', 'TIMESTAMP', 'TRAIN']

# Create the working feature frames that every later cell operates on
# (X_train / X_test were never defined anywhere in the notebook). The raw
# frames are left untouched, so re-running this cell no longer raises a
# KeyError for already-dropped columns.
X_train = train_df.drop(columns=DROP_COLS)
# ID stays in the test features for now: a later cell pickles X_test["ID"]
# and only then drops the column.
X_test = test_df.drop(columns=[col for col in DROP_COLS if col != 'ID'])
display(X_train.head())

### Remove rows with empty fields

### Transform dates

In [None]:
# Lower-cased month name -> month number (1-12). calendar.month_name[0] is an
# empty placeholder and is skipped. The names follow the active locale.
month_names = {
    name.lower(): number
    for number, name in enumerate(calendar.month_name)
    if name
}


def replaceMonthName(date):
    """Turn a '<day>/<MonthName>' string into '<day><MM>'.

    Everything before the first '/' is kept verbatim and the month name after
    it (matched case-insensitively) is replaced by its zero-padded number,
    e.g. '01/July' -> '0107'.

    Raises:
        KeyError: if the text after the first '/' is not a known month name.
    """
    slash_at = date.find("/")
    day_part, month_part = date[:slash_at], date[slash_at + 1:]
    month_number = month_names[month_part.lower()]
    return day_part + str(month_number).zfill(2)

def processDates(data):
    """Rewrite the DEPARTURE and ARRIVAL columns of `data` in place.

    '-' separators are first replaced by '/', then every value is passed
    through replaceMonthName. Nothing is returned.
    """
    date_columns = ('DEPARTURE', 'ARRIVAL')
    # Pass 1: unify the separator in both columns.
    for column in date_columns:
        data[column] = data[column].str.replace("-", "/")
    # Pass 2: swap the month name for its two-digit number.
    for column in date_columns:
        data[column] = data[column].apply(replaceMonthName)
    
# Apply the same in-place date rewrite to both splits.
processDates(data=X_train)
processDates(data=X_test)

### Convert distance to float

In [None]:
def _distance_to_float(distance):
    """Parse a DISTANCE column written with a decimal comma into floats.

    Surrounding whitespace is stripped and ',' is replaced by '.' before the
    numeric conversion. Values that still cannot be parsed, including missing
    ones, become NaN. Casting to str first keeps the cell re-runnable on a
    column that has already been converted.
    """
    cleaned = distance.astype(str).str.strip().str.replace(',', '.', regex=False)
    return pd.to_numeric(cleaned, errors='coerce')


# Both splits get identical treatment; previously the test column was neither
# stripped nor converted, so it stayed a string column.
X_train['DISTANCE'] = _distance_to_float(X_train['DISTANCE'])
X_test['DISTANCE'] = _distance_to_float(X_test['DISTANCE'])

### Encode categorical columns

In [None]:
def hot_encode_categorical_inputs(X_train, X_test, columns):
    """One-hot encode `columns` in both frames and return the new frames.

    For each column an encoder is fitted on the training frame only and then
    applied to the test frame; categories that appear only in the test frame
    encode as all zeros (handle_unknown='ignore'). The source column is
    replaced by indicator columns named '<column>_<category>'.

    Args:
        X_train: training features (DataFrame); not modified.
        X_test: test features (DataFrame); not modified.
        columns: names of the categorical columns to encode.

    Returns:
        (X_train, X_test) with the encoded columns appended.
    """
    for col in columns:
        encoder = OneHotEncoder(handle_unknown='ignore')
        train_encoded = encoder.fit_transform(X_train[[col]])
        test_encoded = encoder.transform(X_test[[col]])

        # categories_ holds one array per fitted feature, so take element 0.
        # The column-name prefix keeps the labels unique when different
        # features share category values (e.g. 'ONE_WAY' in two columns),
        # which would otherwise make the join fail on overlapping columns.
        indicator_names = [f"{col}_{category}" for category in encoder.categories_[0]]

        # Reuse the source index so the join aligns row by row even when the
        # frame no longer has a default 0..n-1 index.
        train_indicators = pd.DataFrame(
            train_encoded.toarray(), columns=indicator_names, index=X_train.index
        )
        test_indicators = pd.DataFrame(
            test_encoded.toarray(), columns=indicator_names, index=X_test.index
        )

        X_train = X_train.drop(columns=[col]).join(train_indicators)
        X_test = X_test.drop(columns=[col]).join(test_indicators)

    return X_train, X_test

def binary_encode_categorical_inputs(X_train, X_test, columns):
    """Cast each column in `columns` to int in both frames.

    The frames are modified in place and the same two objects are returned
    as (X_train, X_test).
    """
    for col in columns:
        # Train first, then test, for every column.
        for frame in (X_train, X_test):
            frame[col] = frame[col].astype(int)
    return X_train, X_test

hot_encode_cols = ['WEBSITE','DEVICE','HAUL_TYPE','TRIP_TYPE', 'PRODUCT']
# 'TRAIN' is also listed in DROP_COLS earlier in the notebook, so it may already
# be gone; only cast the boolean columns that are still present in both frames
# instead of failing with a KeyError.
label_encode_cols = [
    col for col in ('TRAIN', 'SMS')
    if col in X_train.columns and col in X_test.columns
]

X_train, X_test = hot_encode_categorical_inputs(X_train, X_test, hot_encode_cols)
X_train, X_test = binary_encode_categorical_inputs(X_train, X_test, label_encode_cols)

In [None]:
# Remove, in place, every row that still contains a missing value.
X_train.dropna(axis='index', how='any', inplace=True)
X_test.dropna(axis='index', how='any', inplace=True)

### Normalize data

In [None]:
def normalize(X_train, X_test, columns):
    """Min-max scale `columns`, fitting on the training frame only.

    Both inputs are copied first, so the caller's frames are left untouched.
    For every column the scaler is re-fitted on the training values and the
    same fitted scaling is then applied to the test values.

    Returns:
        (train, test): the scaled copies.
    """
    def as_column(values):
        # The scaler expects a 2-D array with a single feature column.
        return np.array(values).reshape(-1, 1)

    scaled_train, scaled_test = X_train.copy(), X_test.copy()
    scaler = preprocessing.MinMaxScaler()
    for name in columns:
        scaled_train[name] = scaler.fit_transform(as_column(scaled_train[name]))
        scaled_test[name] = scaler.transform(as_column(scaled_test[name]))
    return scaled_train, scaled_test
    
# Scale the numeric distance and the date-derived columns.
X_train, X_test = normalize(X_train, X_test, columns=['DISTANCE', 'DEPARTURE', 'ARRIVAL'])

### Prepare targets

In [None]:
def prepare_targets(y_train):
    """Label-encode the target values and return them as a one-column DataFrame.

    NOTE(review): the schema output above lists six distinct EXTRA_BAGGAGE
    strings ('0'-'3', 'False', 'True'). If those are all still present here
    the encoder yields six classes, while the network further down ends in a
    single sigmoid unit — confirm the target is binary at this point.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit(y_train).transform(y_train)
    return pd.DataFrame(encoded)

# Separate the label from the features, then encode it.
Y_train = prepare_targets(X_train['EXTRA_BAGGAGE'])
X_train = X_train.drop(columns='EXTRA_BAGGAGE')

# Number of samples whose encoded label equals 1.
Y_train.isin([1]).sum()

In [None]:
# Persist the test IDs so the prediction cell can pair them with the scores.
# The dump is written inline because `saveData` is only defined in a later
# cell; calling it here raises NameError on a fresh kernel (Restart & Run All).
with open("IDs.pickle", "wb") as ids_file:
    pickle.dump(X_test["ID"].values, ids_file)
# ID is not a model input: remove it before the feature-extraction step.
X_test = X_test.drop(columns='ID')

# Feature extraction: PCA

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Fit on the preprocessed feature matrix. The raw `train_df` still contains the
# unencoded string columns, which PCA cannot fit.
# PCA rejects n_components above min(n_samples, n_features), so cap the 82.
n_components = min(82, *X_train.shape)
pca = PCA(n_components=n_components)
pca.fit(X_train)

# Cumulative share of variance explained by the first k components.
fig, ax = plt.subplots()
ax.plot(np.cumsum(pca.explained_variance_ratio_))
ax.set_xlabel('Number of components')
ax.set_ylabel('Cumulative explained variance')
fig.savefig('Cumulative explained variance.png', dpi=fig.dpi)

In [None]:
# Reduce both splits to 30 principal components. The projection is fitted on
# the training split only and then applied unchanged to the test split.
pca = PCA(n_components=30)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
# Standard deviation over all entries of the projected training matrix
# (np.std without an axis argument works on the flattened array).
pca_std = np.std(X_train)

### Save data

In [None]:
def saveData(filename, data):
    """Pickle `data` into `filename`, overwriting any existing file.

    The file is opened in a `with` block so the handle is closed even when
    pickling raises (for example on an unpicklable object).

    Args:
        filename: path of the file to write.
        data: any picklable object.
    """
    with open(filename, "wb") as pickle_out:
        pickle.dump(data, pickle_out)


# Write the model inputs to disk for the training and prediction cells.
saveData(filename="X_train.pickle", data=X_train)
saveData(filename="Y_train.pickle", data=Y_train.values)
saveData(filename="X_test.pickle", data=X_test)



## Neural network training

In [None]:
#Dependencies
import time
import tensorflow as tf
import pickle
import numpy as np
import sys
# `tf` is only a local alias; `from ... import` needs the real package name,
# so `from tf.keras...` fails with ModuleNotFoundError.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping


# Reload the arrays written by the preprocessing section; the `with` blocks
# close the file handles. pickle.load can execute arbitrary code, so only load
# files produced by this notebook.
with open("X_train.pickle", "rb") as pickle_in:
    X_train = pickle.load(pickle_in)

with open("Y_train.pickle", "rb") as pickle_in:
    Y_train = pickle.load(pickle_in)

# Stop once validation accuracy has not improved for 50 epochs. Accuracy is a
# metric to maximise; with mode='min' the callback treated a *drop* in
# accuracy as progress.
early_stop = EarlyStopping(monitor='val_accuracy', mode='max', patience=50)
# Timestamped run name so every run gets its own TensorBoard log directory.
NAME = "NN-{}".format(int(time.time()))
tensorBoard = TensorBoard(log_dir='logs/{}'.format(NAME))

# Fully connected binary classifier; input_dim matches the 30 PCA components
# kept during feature extraction.
model = Sequential()
model.add(Dense(100, input_dim=30, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer='adam', metrics=['accuracy'])
# The last 10% of the samples are held out for validation.
history = model.fit(X_train, Y_train, batch_size=2500, epochs=200,validation_split=0.1, callbacks=[tensorBoard,early_stop], verbose=1,shuffle=True)
# NOTE(review): newer Keras releases require a '.keras' or '.h5' suffix on the
# save path — confirm against the installed version.
model.save('model')


# Model prediction

In [None]:
# Use the same Keras that trained and saved the model (tensorflow.keras).
from tensorflow.keras.models import load_model

# Reload the test features and their IDs; the `with` blocks close the handles.
# pickle.load can execute arbitrary code, so only load files produced by this
# notebook.
with open("X_test.pickle", "rb") as pickle_in:
    X_test = pickle.load(pickle_in)

with open("IDs.pickle", "rb") as pickle_in:
    IDs = pickle.load(pickle_in)

# Load the network saved by the training cell so this cell also works on a
# fresh kernel instead of depending on an in-memory `model`.
model = load_model('model')

# One sigmoid score per test row, flattened to 1-D to line up with the IDs.
p = model.predict(X_test)
p = np.array(p).reshape(-1)
IDs = np.array(IDs).reshape(-1)

result = pd.DataFrame({'ID': IDs, 'PREDICTION': p})
result.to_csv('result.csv', index=False)
print(result.head())
# Number of rows scored above the 0.5 decision threshold.
print(p[p>0.5].size)