In [1]:
import os
from datetime import datetime
import numpy as np
import pandas as pd
import pandas_profiling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network  import MLPClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

In [3]:
input_folder = "./data"
output_folder = "./results"
profiles_folder = "./profiles"

In [4]:
### CONFIG, CHANGE HERE ###
eda = True
exploration = False

In [6]:
train = pd.read_csv(f"{input_folder}/train.csv")
test = pd.read_csv(f"{input_folder}/test.csv")

seed = 2020

In [7]:
if eda:
    trainprofile = train.profile_report(title='Train Profiling Report', plot={'histogram': {'bins': 8}})
    trainprofile.to_file(output_file=f"{profiles_folder}/train_profiling.html")
    testprofile = test.profile_report(title='Test Profiling Report', plot={'histogram': {'bins': 8}})
    testprofile.to_file(output_file=f"{profiles_folder}/test_profiling.html")

In [141]:
train.columns

Index(['Guest_ID', 'Gender', 'Frequent_Traveler', 'Age', 'Type',
       'Flight_Class', 'Points', 'Room', 'Check-in/Check-out', 'F&B',
       'Location', 'Wifi', 'Entertainment', 'Gym', 'Spa', 'Staff', 'Pool',
       'Baggage_Handling', 'Reception', 'Cleanliness', 'Online_Booking',
       'Deposit_Kept', 'Time_Room_Service', 'Repeater'],
      dtype='object')

In [142]:
if exploration:
    for col in [c for c in train.columns if c not in ["Guest_ID", "Gender", ]]:
        print(col)
        print(train[col].value_counts(), "\n")

In [143]:
def EncodeCategoricFeats(df_train, df_test):
    features = ["Gender", "Type", "Flight_Class"]
    df_combined = pd.concat([df_train[features], df_test[features]])
    
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test

In [None]:
def MapFlightClass(df):
    flightclass_map = {"Eco": 0, "Eco Plus": 1, "Business": 2}
    df["Flight_Class"].map(flightclass_map)
    return df


def transform_df(df):
    df = MapFlightClass(df)

    return df    

In [None]:
train, test = encode_features(train, test)
train = transform_df(train)
test = transform_df(test)

In [None]:
X = train.drop(['Guest_ID', 'Repeater'], axis=1)
y = train['Repeater']
colunas = X.columns
scaler = StandardScaler()
scaler.fit(X)

In [None]:
X = pd.DataFrame(scaler.transform(X), columns=colunas)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, stratify=y)

In [145]:
clf = MLPClassifier(
    learning_rate = "adaptive",
    max_iter = 12000,
    random_state = seed,
#     tol = 1e-6,
#     early_stopping = True,
#     n_iter_no_change = 100,
#     validation_fraction = 0.3,
    warm_start = True
)


# # # Choose some parameter combinations to try
# # parameters = {'learning_rate': [4, 5, 6, 7, 8, 9, 10],
# #               'max_features': ['log2', 'sqrt', 'auto'],
# #               'criterion': ['entropy', 'gini'],
# #               'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
# #               'min_samples_split': [2, 3, 4, 5, 6, 7, 8],
# #               'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],
# #               'random_state': [seed]
# #              }

# # Type of scoring used to compare parameter combinations
# acc_scorer = make_scorer(accuracy_score)

# # Run the grid search
# grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer, cv=5)
# grid_obj = grid_obj.fit(X_train, y_train)

# # Set the clf to the best combination of parameters
# clf = grid_obj.best_estimator_

# # Fit the best algorithm to the data. 
# clf.fit(X_train, y_train)

In [88]:
# test_preds = clf.predict(X_test)
# print(accuracy_score(y_test, test_preds))

In [89]:
%%time

# from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedShuffleSplit

def run_kfold(clf):
    kf = StratifiedShuffleSplit(n_splits=10000, test_size=0.3, random_state=seed)
#     kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=seed)
    outcomes = []
    fold = 0
    for train_indices, test_indices in kf.split(X, y):
        fold += 1
        X_train, X_test = X.values[train_indices], X.values[test_indices]
        y_train, y_test = y.values[train_indices], y.values[test_indices]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
#         print(f"Fold {fold} accuracy: {accuracy}")
    mean_outcome = np.mean(outcomes)
    print(f"# of Folds: {fold}")
    print(f"Mean Accuracy: {mean_outcome}") 

run_kfold(clf)

# of Folds: 10000
Mean Accuracy: 0.99969010989011
Wall time: 5min 55s


In [137]:
def find_version():
    version = 1
    for file in os.listdir("results"):
        if file.split("_")[1].startswith("version"):
            if int(file.split("_")[1].split("n")[1]) > version:
                version = int(file.split("_")[1].split("n")[1])
    return version + 1

3


In [91]:
ids = test["Guest_ID"]
testdata = pd.DataFrame(scaler.transform(test.drop(["Guest_ID"], axis=1)), columns=colunas)
preds = clf.predict(testdata)

output = pd.DataFrame({'Guest_ID': ids, 'Repeater': preds})

now = str(datetime.today())[:19].replace(" ", "_").replace(":","-")
output.to_csv(f'{output_folder}/m20180428_version{find_version()}.csv', index = False)

output.head()

Unnamed: 0,Guest_ID,Repeater
0,19847,0
1,12433,1
2,10273,1
3,12457,0
4,22903,0
