In [None]:
# Import libraries
import locale

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, LabelBinarizer

from datetime import datetime
from locale import atof

In [None]:
# Helper that maps a 'day/month_name' date string to its ISO week number.
def day_to_week_of_year(date_to_transform, year=2017):
    """Return the ISO-8601 week number of a ``'day/month_name'`` date string.

    Parameters
    ----------
    date_to_transform : str
        Date without a year, e.g. ``'15/March'``. The month must be the full
        month name as understood by ``%B``, which depends on the active locale.
    year : int, optional
        Year used to complete the date before parsing. Defaults to 2017, the
        value that was previously hard-coded. Pass a leap year to be able to
        parse ``'29/February'``.

    Returns
    -------
    int
        ISO week number (1-53). ISO weeks start on Monday, so the first days of
        January may belong to the last week of the previous year (for example
        ``'1/January'`` in 2017 yields 52) and the last days of December may
        belong to week 1 of the following year.

    Raises
    ------
    TypeError
        If ``date_to_transform`` is not a string (e.g. a missing value).
    ValueError
        If the completed string does not match ``'%d/%B/%Y'`` or is not a
        valid calendar date for ``year``.
    """
    # The year is appended because a bare 'day/month' cannot be placed on a calendar.
    full_date = date_to_transform + "/" + str(year)
    parsed = datetime.strptime(full_date, '%d/%B/%Y')
    # isocalendar() -> (ISO year, ISO week number, ISO weekday); keep the week.
    return parsed.isocalendar()[1]

In [None]:
# Read the raw training set: fields are separated by ';' and numbers use ',' as the
# decimal mark.
data = pd.read_csv(
    'train.csv',
    decimal=',',
    sep=';',
)

In [None]:
# Display the column names of the loaded DataFrame (last expression of the cell, so
# the notebook renders it as the cell output).
data.columns

In [None]:
# Report the number of missing values per column before cleaning.
print(data.isnull().sum())

# According to the original analysis, only 'DEVICE' has missing values and they are
# very few, so the affected rows are simply discarded. With many more missing values
# an imputation method could be used to fill them in instead.
# The index is rebuilt so that it is again a gap-free 0..n-1 range.
data = data.dropna(subset=['DEVICE']).reset_index(drop=True)

# Report the counts again to confirm the effect of the removal.
print(data.isnull().sum())

In [None]:
# Show how many samples fall into each class of the target variable 'EXTRA_BAGGAGE'.
data['EXTRA_BAGGAGE'].value_counts()

# Observation recorded by the original author from the output above: the classes are
# imbalanced, with many more samples in the 'False' class than in the 'True' class.

In [None]:
# Inspect and convert the date variables 'TIMESTAMP', 'DEPARTURE' and 'ARRIVAL'.
date_columns = ['TIMESTAMP', 'DEPARTURE', 'ARRIVAL']

# Preview the raw values first.
for date_column in date_columns:
    print(data[date_column][0:5])

# All three date variables share the format 'day_number/month_name'.
# Each value is replaced by the number of the week of the year the date falls in,
# as computed by day_to_week_of_year.
for date_column in date_columns:
    data[date_column] = data[date_column].apply(day_to_week_of_year)

# Preview the converted values.
for date_column in date_columns:
    print(data[date_column][0:5])

In [None]:
# Label-encode the string variables 'WEBSITE', 'DEVICE', 'HAUL_TYPE', 'TRIP_TYPE' and
# 'PRODUCT': every distinct value of a column is replaced by an integer code.
# IMPORTANT: for the 'WEBSITE' variable we cannot be sure that every level this
# categorical variable can take is present in the train.csv dataset.
label_encoder = LabelEncoder()

for column_name in ['WEBSITE', 'DEVICE', 'HAUL_TYPE', 'TRIP_TYPE', 'PRODUCT']:
    # dtype before encoding
    print(data[column_name].dtype)
    # The single encoder object is refitted on each column in turn, so after the
    # loop it only remembers the mapping of the last column.
    integer_codes = label_encoder.fit_transform(data[column_name])
    data[column_name] = integer_codes
    # first rows after encoding
    print(data[column_name][0:5])

In [None]:
# Encode the string variables 'WEBSITE', 'DEVICE', 'HAUL_TYPE', 'TRIP_TYPE' and 'PRODUCT'.
#   * 'WEBSITE' is only label-encoded (integer codes); it is not binarized because it
#     has too many levels.
#   * 'DEVICE', 'HAUL_TYPE' and 'TRIP_TYPE' are each replaced by one 0/1 indicator
#     column per level, named '<VARIABLE>_<level>' and appended at the right edge.
#   * 'PRODUCT' is replaced by a single 0/1 column, also moved to the right edge.
# IMPORTANT: for the 'WEBSITE' variable we cannot be sure that every level this
# categorical variable can take is present in the train.csv dataset.
# NOTE(review): if the label-encoding cell above has already been executed, the columns
# handled here hold integer codes, so the indicator columns are named after those
# codes (e.g. 'DEVICE_0') rather than after the original labels.


def _indicator_frame(values, prefix):
    """Build one 0/1 indicator column per distinct value found in ``values``.

    Parameters
    ----------
    values : pandas.Series
        Categorical values to binarize.
    prefix : str
        Text placed before each level in the column names (``prefix + "_" + level``).

    Returns
    -------
    pandas.DataFrame
        One column per level, with a default 0..n-1 index.
    """
    binarizer = LabelBinarizer()
    indicators = binarizer.fit_transform(values)
    levels = binarizer.classes_
    # For exactly two levels LabelBinarizer returns a single column (1 == second
    # level). Expand it to one column per level so it matches the column names;
    # otherwise the DataFrame constructor raises a shape error.
    if len(levels) == 2 and indicators.shape[1] == 1:
        indicators = np.hstack([1 - indicators, indicators])
    column_names = [prefix + "_" + str(level) for level in levels]
    return pd.DataFrame(indicators, columns=column_names)


def _replace_with_indicators(frame, column):
    """Swap ``column`` of ``frame`` for its indicator columns and log the steps.

    Prints the dtype, shape and a preview of the column, then the shapes before and
    after the replacement.

    Returns
    -------
    tuple
        ``(new_frame, indicators)``: the frame without ``column`` and with the
        indicator columns appended, and the indicator columns on their own.
    """
    label = column.lower()
    print(frame[column].dtype)
    print(frame[column].shape)
    print(frame[column][0:5])

    indicators = _indicator_frame(frame[column], column)
    print("data " + label + " shape:", indicators.shape)
    print(indicators[0:5])

    print("data shape:", frame.shape)
    print("new " + label + " columns:", indicators.shape[1])
    # `axis` is passed by keyword: recent pandas versions reject it as a positional
    # argument of DataFrame.drop.
    remaining = frame.drop(column, axis=1).reset_index(drop=True)
    print("data shape:", remaining.shape)
    # Both frames carry a fresh 0..n-1 index, so concat pairs the rows by position.
    combined = pd.concat([remaining, indicators], axis=1).reset_index(drop=True)
    print("new data shape:", combined.shape)
    print(combined[list(indicators.columns)][0:5])
    print()
    return combined, indicators


# Won't binarize 'WEBSITE' variable because it has too many levels.
label_encoder = LabelEncoder()

print(data['WEBSITE'].dtype)
data['WEBSITE'] = label_encoder.fit_transform(data['WEBSITE'])
print(data['WEBSITE'][0:5])

data, data_device = _replace_with_indicators(data, 'DEVICE')
device_columns = list(data_device.columns)

data, data_haul_type = _replace_with_indicators(data, 'HAUL_TYPE')
haul_type_columns = list(data_haul_type.columns)

data, data_trip_type = _replace_with_indicators(data, 'TRIP_TYPE')
trip_type_columns = list(data_trip_type.columns)

# 'PRODUCT' keeps its name and becomes a single 0/1 column: only the first column of
# the binarizer output is used. With two levels LabelBinarizer returns exactly one
# column; with more levels the remaining columns would be silently discarded.
# TODO confirm that 'PRODUCT' has at most two levels.
label_encoder = LabelBinarizer()

print(data['PRODUCT'].dtype)
print(data['PRODUCT'].shape)
print(data['PRODUCT'][0:5])
encoder_result = label_encoder.fit_transform(data['PRODUCT'])
data_product = pd.Series(encoder_result[:, 0], name='PRODUCT')
print("data product shape:", data_product.shape)
print(data_product[0:5])

print("data shape:", data.shape)
data = data.drop('PRODUCT', axis=1).reset_index(drop=True)
print("data shape:", data.shape)
data = pd.concat([data, data_product], axis=1).reset_index(drop=True)
print("new data shape:", data.shape)
print(data['PRODUCT'][0:5])
print()

In [None]:
# Columns of the raw dataset, for reference:
# ['ID', 'TIMESTAMP', 'WEBSITE', 'GDS', 'DEPARTURE', 'ARRIVAL', 'ADULTS',
#  'CHILDREN', 'INFANTS', 'TRAIN', 'HAUL_TYPE', 'DISTANCE', 'DEVICE',
#  'TRIP_TYPE', 'PRODUCT', 'SMS', 'EXTRA_BAGGAGE', 'NO_GDS']

# Print the dtype pandas assigned to each of the integer variables 'GDS', 'NO_GDS',
# 'ADULTS', 'CHILDREN' and 'INFANTS' so it can be checked that they are integer typed.
for integer_column in ['GDS', 'NO_GDS', 'ADULTS', 'CHILDREN', 'INFANTS']:
    print(data[integer_column].dtype)

In [None]:
# Print the dtype pandas assigned to each of the boolean variables 'TRAIN', 'SMS' and
# 'EXTRA_BAGGAGE' so it can be checked that they are boolean typed.
for boolean_column in ['TRAIN', 'SMS', 'EXTRA_BAGGAGE']:
    print(data[boolean_column].dtype)

In [None]:
# Check that the float variable 'DISTANCE' has its corresponding dtype in the pandas
# DataFrame (the file was read with decimal=',' so that comma decimals parse as numbers).
print(data['DISTANCE'].dtype)

In [None]:
# Drop the 'ID' variable, which the original author considers useless for the analysis.
# `axis` is passed by keyword: recent pandas versions reject it as a positional
# argument of DataFrame.drop.
data = data.drop('ID', axis=1).reset_index(drop=True)

In [None]:
# Store the 'clean' dataset in a new ';'-separated .csv file, without the index column.
# The commented-out call below writes the same frame under a different file name;
# presumably it was used for the variant without binarized columns - TODO confirm.
# NOTE(review): the input was read with decimal=',' but no `decimal` is passed here, so
# floats are written with pandas' default '.' decimal mark.
# data.to_csv('clean_train.csv', index=False, sep=';')
data.to_csv('clean_binarized_train.csv', index=False, sep=';')