In [1]:
import os
from datetime import datetime
import itertools
import numpy as np
import pandas as pd
import pandas_profiling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network  import MLPClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
#------------ CONFIG ------------#
eda         = False
gridsearch  = False
#--------------------------------#

In [3]:
input_folder = "./data"
output_folder = "./results"
profiles_folder = "./profiles"
train = pd.read_csv(f"{input_folder}/train.csv")
test = pd.read_csv(f"{input_folder}/test.csv")
seed = 2020

When `eda` is on, two profiling reports are being generated: one for train dataset and one for test dataset.
From these two report, I can conclude that `Time_Room_Service` and `Deposit_Kept` columns are highly correlated (corr ~ 0.97). That said, I can drop one of those columns from both datasets to avoid repeating information for the model.

In [4]:
if eda:
    trainprofile = train.profile_report(title='Train Profiling Report', plot={'histogram': {'bins': 8}})
    trainprofile.to_file(output_file=f"{profiles_folder}/train_profiling.html")
    testprofile = test.profile_report(title='Test Profiling Report', plot={'histogram': {'bins': 8}})
    testprofile.to_file(output_file=f"{profiles_folder}/test_profiling.html")

In [5]:
def MapFlightClass(df):
    """
    `Flight_Class` is a categoric text variable, but its order matters.
    So, I'll take that in consideration when encoding this feature.
    """
    flightclass_map = {"Eco": 0, "Eco Plus": 1, "Business": 2}
    df["Flight_Class"] = df["Flight_Class"].map(flightclass_map)
    return df


def EncodeCategoricalFeats(df_train, df_test):
    """
    Encoding each categorical text feature to an integer.
    """
    obj_features = ["Gender", "Type"]
    df_combined = pd.concat([df_train[obj_features], df_test[obj_features]])
    for feature in obj_features:
        le = LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
        
    return df_train, df_test


def transform_df(df):
    df = MapFlightClass(df)
    df.drop("Time_Room_Service", axis=1, inplace=True)
    return df    

In [6]:
train, test = EncodeCategoricalFeats(train, test)

train = transform_df(train)
test = transform_df(test)

In [7]:
X = train.drop(['Guest_ID', 'Repeater'], axis=1)
y = train['Repeater']

colunas = X.columns

In [None]:
# commenting because MLPClassifier already uses Shuffle sampling and Train/Val split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, stratify=y)

In [None]:
%%time
if gridsearch:
    clf = MLPClassifier(
        max_iter = 1000,
        random_state = seed,
        warm_start = True
    )

    # Choose some parameter combinations to try
    parameters = {
        'learning_rate': ["constant", "invscaling", "adaptive"],
        'hidden_layer_sizes': [(100, 50, 50), (100, 100, 100), (100, 100, 100)],
        'alpha': list(10.0 ** -np.arange(2, 8)),
        'learning_rate_init': list(np.linspace(0.00001,0.1,6)),
        'activation': ["relu", "tanh"]
    }

    # Type of scoring used to compare parameter combinations
    acc_scorer = make_scorer(accuracy_score)

    # Run the grid search
    grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer, cv=5, verbose=1)
    grid_obj.fit(X, y)

    # Set the clf to the best combination of parameters
    clf = grid_obj.best_estimator_

In [None]:
# commenting because MLPClassifier already trains with different samples (`shuffle`=True)
# in all the epochs, until it converges.

# %%time

# from sklearn.model_selection import StratifiedShuffleSplit

# def run_kfold(clf):
#     kf = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=seed)
#     outcomes = []
#     fold = 0
#     for train_indices, test_indices in kf.split(X, y):
#         fold += 1
#         X_train, X_test = X.values[train_indices], X.values[test_indices]
#         y_train, y_test = y.values[train_indices], y.values[test_indices]
#         clf.fit(X_train, y_train)
#         predictions = clf.predict(X_test)
#         accuracy = accuracy_score(y_test, predictions)
#         outcomes.append(accuracy)
# #         print(f"Fold {fold} accuracy: {accuracy}")
#     mean_outcome = np.mean(outcomes)
#     print(f"# of Folds: {fold}")
#     print(f"Mean Accuracy: {mean_outcome}") 

# run_kfold(clf)

Since scikit-learn's `MLPClassifier` has no `.feature_importance_` attributes, we can't use the RFE algorithm for feature selection. Then, I'll test every single combination of `n_feats` features and select the one that gets us the best score for training. <br>
<b>NOTE:</b> The MLPClassifier hyperparameters I'm using were selected from a previous GridSearch.

In [12]:
%%time
best_loss = 1
max_feats_to_remove = 2
final_cols = X.columns

for n_feats in range(len(X.columns)-max_feats_to_remove, len(X.columns)+1):
    clf = MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
                    beta_2=0.999, early_stopping=True, epsilon=1e-08,
                    hidden_layer_sizes=(150,100,50), learning_rate='adaptive',
                    learning_rate_init=0.01, max_iter=12000, momentum=0.9,
                    n_iter_no_change=100, nesterovs_momentum=True, power_t=0.5,
                    random_state=2020, shuffle=True, solver='adam', tol=0.00001,
                    validation_fraction=0.2, verbose=False, warm_start=False)
    
    scaler = StandardScaler()
    print(f"# features: {n_feats}")
    feat_combos = list(itertools.combinations(X.columns, n_feats))
    print(f"# possible feature combinations: {len(feat_combos)}")
    
    for i, combo in enumerate(feat_combos):
        scaler.fit_transform(X)
        clf.fit(pd.DataFrame(scaler.fit_transform(X), columns=colunas)[list(combo)], y)
        if clf.loss_ < best_loss:
            best_loss = clf.loss_
            print(f"\t... best_loss: {clf.loss_}")
            
            final_scaler = scaler
            
            del final_cols
            final_cols = list(combo)
            

from datetime import datetime
print(datetime.now())

# features: 19
# possible feature combinations: 210
	... best_loss: 0.03907650681736587
	... best_loss: 0.0213515127221662
	... best_loss: 0.020824456537095543
	... best_loss: 0.017295809816274328
	... best_loss: 0.015829968951853693
# features: 20
# possible feature combinations: 21
# features: 21
# possible feature combinations: 1
2020-04-26 22:40:55.743596
Wall time: 51min 12s


In [13]:
# final_cols = ['Gender', 'Frequent_Traveler', 'Age', 'Type', 'Flight_Class', 'Points', 'Room',
#               'Check-in/Check-out', 'Location', 'Wifi', 'Entertainment', 'Gym', 'Spa',
#               'Staff', 'Pool', 'Baggage_Handling', 'Reception', 'Online_Booking', 'Deposit_Kept']

In [15]:
X = pd.DataFrame(final_scaler.fit_transform(X[final_cols]), columns=final_cols)
X

Unnamed: 0,Gender,Frequent_Traveler,Age,Type,Flight_Class,Points,Room,Check-in/Check-out,Location,Wifi,Entertainment,Gym,Spa,Staff,Pool,Baggage_Handling,Reception,Online_Booking,Deposit_Kept
0,1.015951,0.47722,-0.624929,-0.667355,-1.059270,0.712086,0.109959,1.314782,1.545036,-0.945360,-0.294723,-0.396573,0.401625,-1.157775,-0.382513,-0.598027,-0.267350,-0.269805,4.668421
1,1.015951,0.47722,1.159420,1.498454,-1.059270,0.086066,-0.610432,-1.284222,0.776529,-0.182838,-1.043044,-0.396573,-0.367319,-1.157775,1.161794,-0.598027,0.527451,-0.269805,-0.395295
2,-0.984299,0.47722,-1.550146,1.498454,-1.059270,-0.482062,-0.610432,-0.634471,-0.760483,-0.182838,0.453598,0.374211,0.401625,0.413319,0.389640,0.263193,-0.267350,1.279232,-0.395295
3,-0.984299,0.47722,0.564637,-0.667355,1.017059,0.328428,0.109959,-1.284222,-1.528990,-0.182838,-0.294723,-0.396573,-1.905207,-1.157775,0.389640,0.263193,-0.267350,-0.269805,3.585273
4,-0.984299,0.47722,0.828985,-0.667355,-1.059270,-1.860680,0.109959,0.015280,0.008023,-1.707882,-1.043044,0.374211,-0.367319,-0.372228,-0.382513,-0.598027,0.527451,-0.269805,-0.232823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9095,1.015951,-2.09547,-0.955363,-0.667355,1.017059,0.541354,-0.610432,0.015280,0.008023,1.342206,-1.043044,1.144995,1.170569,-1.943322,1.161794,0.263193,0.527451,1.279232,0.037964
9096,-0.984299,0.47722,-0.492755,-0.667355,1.017059,0.094897,-1.330822,-1.284222,-1.528990,0.579684,0.453598,0.374211,0.401625,-0.372228,-0.382513,0.263193,1.322253,0.504714,-0.395295
9097,1.015951,-2.09547,-1.285798,-0.667355,-1.059270,0.941692,-1.330822,-1.284222,0.776529,1.342206,-1.791365,1.144995,1.170569,-0.372228,-1.926819,-0.598027,-1.062152,1.279232,-0.259902
9098,1.015951,-2.09547,0.168115,-0.667355,1.017059,0.045836,-0.610432,-0.634471,-1.528990,1.342206,-1.043044,1.144995,1.170569,0.413319,1.161794,0.263193,-1.856953,1.279232,-0.395295


In [16]:
X = pd.DataFrame(final_scaler.fit_transform(X[final_cols]), columns=final_cols)

clf = MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
                    beta_2=0.999, early_stopping=True, epsilon=1e-08,
                    hidden_layer_sizes=(150,100,50), learning_rate='adaptive',
                    learning_rate_init=0.01, max_iter=12000, momentum=0.9,
                    n_iter_no_change=100, nesterovs_momentum=True, power_t=0.5,
                    random_state=2020, shuffle=True, solver='adam', tol=0.00001,
                    validation_fraction=0.2, verbose=True, warm_start=False
                   )

clf.fit(X, y)

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(150, 100, 50), learning_rate='adaptive',
              learning_rate_init=0.01, max_iter=12000, momentum=0.9,
              n_iter_no_change=100, nesterovs_momentum=True, power_t=0.5,
              random_state=2020, shuffle=True, solver='adam', tol=1e-05,
              validation_fraction=0.2, verbose=False, warm_start=False)

In [17]:
def find_version():
    version = 1
    for file in os.listdir("results"):
        if file.split("_")[1].startswith("version"):
            if int(file.split("_")[1].split("n")[1].split(".")[0]) > version:
                version = int(file.split("_")[1].split("n")[1].split(".")[0])
    return version + 1

In [21]:
ids = test["Guest_ID"]
preds = clf.predict(pd.DataFrame(final_scaler.transform(test[final_cols]), columns=final_cols))

output = pd.DataFrame({'Guest_ID': ids, 'Repeater': preds})
output.to_csv(f'{output_folder}/m20180428_version{find_version()}.csv', index = False)
output.head()

Unnamed: 0,Guest_ID,Repeater
0,19847,0
1,12433,1
2,10273,1
3,12457,0
4,22903,0


In [22]:
from datetime import datetime
print(datetime.now())

2020-04-26 23:05:39.626882
