# Exploration of Data and trying AutoGluon
## Table of Contents
* [Import and first glance](#import)
* [EDA](#EDA)
* [Data Preparations](#prep)
* [Target vs Predictors](#target_features)
* [Fit Model](#fit)
* [Model Evaluation (Leader)](#model_eval)
* [Predict on Test Set (Leader)](#pred_test)
* [Model on 2nd place](#model_2)
* [Model on 3rd place](#model_3)
* [Blend 2nd and 3rd model](#blend)

### AutoGluon Docs: https://auto.gluon.ai/stable/index.html

In [None]:
# install package
!pip install autogluon

# for interpretable models:
!pip install imodels

In [None]:
# standard
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# statistics
from scipy import stats

# AutoGluon
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# configs
# lift the column cap so that wide DataFrames are displayed with every column
pd.options.display.max_columns = None

<a id='import'></a>
# Import and first glance

In [None]:
# load data: training set, test set and the sample submission (in that order)
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/playground-series-s3e3/train.csv',
        '../input/playground-series-s3e3/test.csv',
        '../input/playground-series-s3e3/sample_submission.csv',
    )
)

In [None]:
# preview: display the first 10 rows of the training data
df_train.head(10)

In [None]:
# structure of data - train (per-column summary printed by DataFrame.info)
df_train.info()

In [None]:
# structure of data - test (per-column summary printed by DataFrame.info)
df_test.info()

#### 💡 The train and test sets are really small here, so we have to be careful not to overfit!

<a id='EDA'></a>
# EDA

In [None]:
# basic stats - train; include='all' requests a summary for every column,
# not only the numeric ones
df_train.describe(include='all')

In [None]:
# basic stats - test; include='all' requests a summary for every column,
# not only the numeric ones
df_test.describe(include='all')

In [None]:
# define features
# numeric columns (plotted as histograms / boxplots / violin plots below)
features_num = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'Education',
    'EmployeeCount',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StandardHours',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# categorical columns (plotted as bar charts / heatmaps below)
features_cat = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
    'OverTime',
]

In [None]:
# histogram of every numeric feature: train (left panel) next to test (right panel)
for col in features_num:
    # one row, two panels; the shared x-axis puts both distributions on the same scale
    fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12,3), sharex=True)

    df_train[col].plot(kind='hist', color='darkblue', ax=ax_train)
    ax_train.set_title(f'{col} - Train')
    ax_train.grid()

    df_test[col].plot(kind='hist', color='darkgreen', ax=ax_test)
    ax_test.set_title(f'{col} - Test')
    ax_test.grid()

    plt.show()

In [None]:
# horizontal boxplot of every numeric feature: train (left panel) next to test (right panel)
for col in features_num:
    # flat figure with two panels on a common x-axis so the value ranges are comparable
    fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12,1), sharex=True)

    ax_train.boxplot(df_train[col], vert=False)
    ax_train.set_title(f'{col} - Train')
    ax_train.grid()

    ax_test.boxplot(df_test[col], vert=False)
    ax_test.set_title(f'{col} - Test')
    ax_test.grid()

    plt.show()

#### 💡 DailyRate and Education each show one strange outlier.

In [None]:
# plot categorical features (train and test)
for f in features_cat:
    train_counts = df_train[f].value_counts()
    test_counts = df_test[f].value_counts()

    # The two panels share their x-axis, and shared axes also share the tick
    # labels. value_counts() sorts each set by its own frequencies, so without
    # a common order the bars of one panel could sit under the category labels
    # of the other. Use one category order for both (train order first, then
    # categories that only occur in test) and show missing categories as 0.
    categories = train_counts.index.union(test_counts.index, sort=False)
    train_counts = train_counts.reindex(categories, fill_value=0)
    test_counts = test_counts.reindex(categories, fill_value=0)

    plt.figure(figsize=(12,4))
    ax1 = plt.subplot(1,2,1)
    train_counts.plot(kind='bar', color='darkblue')
    plt.title(f + ' - Train')
    plt.grid()
    ax2 = plt.subplot(1,2,2, sharex=ax1)
    test_counts.plot(kind='bar', color='darkgreen')
    plt.title(f + ' - Test')
    plt.grid()
    plt.show()

#### 💡 Observations: EmployeeCount, StandardHours and Over18 are constant. Therefore we do not use them as predictors.

<a id='prep'></a>
# Data Preparations

#### Let's remove the outliers observed in the boxplots above:

In [None]:
# remove outliers: keep only the training rows whose DailyRate and Education
# both lie at or below the cut-offs
is_inlier = df_train['DailyRate'].le(3500) & df_train['Education'].le(10)
df_train = df_train.loc[is_inlier]

<a id='target_features'></a>
# Target vs Predictors

In [None]:
# target
label = 'Attrition'

# constant features (see above) carry no information and are left out
constant_features = ('EmployeeCount', 'StandardHours', 'Over18')

# predictors: numeric features first, then categorical ones, minus the constant columns
predictors = [col for col in features_num + features_cat
              if col not in constant_features]

print('Number of predictors:', len(predictors), end='\n\n')
print(predictors)

In [None]:
# distribution of every predictor, broken down by the target
for col in predictors:
    if col not in features_num:
        # categorical predictor: cross table of category vs target ...
        counts = pd.crosstab(df_train[col], df_train[label])
        # ... divided by the column totals, so every target column sums to 100%
        shares = counts.div(counts.sum(), axis='columns')
        # shown as an annotated heatmap on a fixed 0..1 colour scale
        plt.figure(figsize=(10,3))
        sns.heatmap(
            shares,
            annot=True,
            fmt='.2%',
            linecolor='black',
            linewidths=1,
            cmap='Greens',
            vmin=0,
            vmax=+1,
        )
        plt.title(f'{col} vs Target')
        plt.show()
        continue

    # numeric predictor: one violin per target value
    sns.violinplot(data=df_train, x=label, y=col)
    plt.title(f'{col} vs Target')
    plt.grid()
    plt.show()

<a id='fit'></a>
# Fit Model

In [None]:
# metric (handed to TabularPredictor as eval_metric)
eval_metric = 'roc_auc'
# path for model storage (handed to TabularPredictor as path)
save_path = 'saved_models'

In [None]:
# time budget for the Auto ML run; fit() expects it in seconds
minutes = 5
time_limit = minutes * 60

In [None]:
# define setup: binary classification of the target, scored with the chosen metric
fit_auto = TabularPredictor(
    label=label,
    problem_type='binary',
    eval_metric=eval_metric,
    path=save_path,
)

# training frame: the predictors plus the target column, nothing else
train_data = df_train[predictors + [label]]

# options for the AutoML run (bagging only, no stacking, KNN models excluded)
fit_options = dict(
    presets='best_quality',
    num_bag_folds=10,
    num_bag_sets=5,
    num_stack_levels=0,
    excluded_model_types=['KNN'],
    time_limit=time_limit,
)

# and fit models
fit_auto.fit(train_data, **fit_options)

# preset types: ['best_quality', 'high_quality', 'good_quality', 'medium_quality', 'optimize_for_deployment', 'interpretable', 'ignore_text']

<a id='model_eval'></a>
# Model Evaluation (Leader)

In [None]:
# show results of AutoML; the returned summary is kept because its
# 'model_performance' entry is read further below to rank the models
results = fit_auto.fit_summary(show_plot=True)

In [None]:
# variable importance (permutation method), evaluated on the same rows
# (predictors + target) that were passed to fit()
# NOTE(review): importances computed on training rows may be biased by
# overfitting - presumably a held-out set would be more reliable; confirm
# against the AutoGluon feature_importance documentation
vi = fit_auto.feature_importance(df_train[predictors+[label]])
vi

<a id='pred_test'></a>
# Predict on Test Set (Leader)

In [None]:
# predict on test set and keep the probability column labelled 1
pred_test = fit_auto.predict_proba(data=df_test[predictors])[1]

# stats
print(pred_test.describe())

# distribution of predictions
fig, ax = plt.subplots(figsize=(8,3))
pred_test.plot.hist(bins=30, color='darkblue', ax=ax)
ax.set_title('Predictions Test - Leader')
ax.grid()
plt.show()

In [None]:
# expected frequency: the predicted probabilities summed over the test set
# (vectorised Series.sum instead of looping over the Series with the builtin
#  sum; skipna=False keeps the builtin's behaviour of propagating NaN)
pred_test.sum(skipna=False)

In [None]:
# submission: put the leader's probabilities into the Attrition column of the
# sample submission frame and write it out without the index
df_sub['Attrition'] = pred_test
df_sub.to_csv('submission_1st.csv', index=False)

<a id='model_2'></a>
#  Model on 2nd place

In [None]:
# get leaderboard data: mapping of model name -> score from the fit summary
model_scores = results['model_performance']
# order the (name, score) pairs from highest to lowest score ...
ranked = sorted(model_scores.items(), key=lambda name_score: name_score[1], reverse=True)
# ... and turn them back into a dict, which keeps that order
leadertab = dict(ranked)
leadertab

In [None]:
# extract name of 2nd best model: the keys of the sorted leaderboard are the
# model names, so position 1 (0-based) is the runner-up
model_2nd = list(leadertab)[1]
model_2nd

In [None]:
# variable importance of the 2nd best model, evaluated on the same rows
# (predictors + target) that were passed to fit()
# NOTE(review): importances computed on training rows may be biased by
# overfitting - presumably a held-out set would be more reliable; confirm
# against the AutoGluon feature_importance documentation
vi_2nd = fit_auto.feature_importance(df_train[predictors+[label]], model=model_2nd)
vi_2nd

In [None]:
# predict for 2nd best model and keep the probability column labelled 1
pred_test_2nd = fit_auto.predict_proba(data=df_test[predictors], model=model_2nd)[1]

# stats
print(pred_test_2nd.describe())

# distribution of predictions
fig, ax = plt.subplots(figsize=(8,3))
pred_test_2nd.plot.hist(bins=30, color='darkblue', ax=ax)
ax.set_title('Predictions Test - 2nd best model')
ax.grid()
plt.show()

In [None]:
# expected frequency: the 2nd model's predicted probabilities summed over the test set
# (vectorised Series.sum instead of looping over the Series with the builtin
#  sum; skipna=False keeps the builtin's behaviour of propagating NaN)
pred_test_2nd.sum(skipna=False)

In [None]:
# submission of 2nd model: new frame based on the sample submission with the
# Attrition column replaced by the 2nd model's probabilities
df_sub_2nd = df_sub.assign(Attrition=pred_test_2nd)
df_sub_2nd.to_csv('submission_2nd.csv', index=False)

<a id='model_3'></a>
#  Model on 3rd place

In [None]:
# extract name of 3rd best model: the keys of the sorted leaderboard are the
# model names, so position 2 (0-based) is third place
model_3rd = list(leadertab)[2]
model_3rd

In [None]:
# predict for 3rd best model and keep the probability column labelled 1
pred_test_3rd = fit_auto.predict_proba(data=df_test[predictors], model=model_3rd)[1]

# stats
print(pred_test_3rd.describe())

# distribution of predictions
fig, ax = plt.subplots(figsize=(8,3))
pred_test_3rd.plot.hist(bins=30, color='darkblue', ax=ax)
ax.set_title('Predictions Test - 3rd best model')
ax.grid()
plt.show()

In [None]:
# expected frequency: the 3rd model's predicted probabilities summed over the test set
# (vectorised Series.sum instead of looping over the Series with the builtin
#  sum; skipna=False keeps the builtin's behaviour of propagating NaN)
pred_test_3rd.sum(skipna=False)

In [None]:
# submission of 3rd model: new frame based on the sample submission with the
# Attrition column replaced by the 3rd model's probabilities
df_sub_3rd = df_sub.assign(Attrition=pred_test_3rd)
df_sub_3rd.to_csv('submission_3rd.csv', index=False)

In [None]:
# check correlation of predictions: 2nd model on the x-axis, 3rd model on the y-axis
fig, ax = plt.subplots(figsize=(5,5))
# semi-transparent markers keep overlapping points distinguishable
ax.scatter(pred_test_2nd, pred_test_3rd, color='darkblue', alpha=0.3)
ax.set_title('Predictions 3rd vs 2nd')
ax.set_xlabel('2nd')
ax.set_ylabel('3rd')
ax.grid()
plt.show()

In [None]:
# Pearson correlation (and significance) between the test predictions
# of the 2nd and the 3rd model
stats.pearsonr(pred_test_2nd, pred_test_3rd)

In [None]:
# Spearman rank correlation (and significance) between the test predictions
# of the 2nd and the 3rd model
stats.spearmanr(pred_test_2nd, pred_test_3rd)

<a id='blend'></a>
# Blend 2nd and 3rd model

In [None]:
# blend: equally weighted average of the 2nd and 3rd model's probabilities
blended = 0.5*pred_test_2nd + 0.5*pred_test_3rd
# new frame based on the sample submission with the Attrition column replaced by the blend
df_sub_blend = df_sub.assign(Attrition=blended)
df_sub_blend.to_csv('submission_blend23.csv', index=False)

In [None]:
# summary statistics of the blended submission frame
df_sub_blend.describe()

In [None]:
# expected frequency: the blended probabilities summed over the test set
# (vectorised Series.sum instead of looping over the Series with the builtin
#  sum; skipna=False keeps the builtin's behaviour of propagating NaN)
df_sub_blend.Attrition.sum(skipna=False)