# Part 2 Supervised Learning and Part 3 Kaggle

In [1]:
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import operator
import time
from sklearn.preprocessing import Imputer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA    
from sklearn.preprocessing import LabelEncoder
# !pip install mca
# import mca
import chardet
# magic word for producing visualizations in notebook
%matplotlib inline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, roc_auc_score

In [2]:
!pip install lightgbm
import lightgbm as lgb



This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [4]:
# load in the data
# azdias = pd.read_csv('../../data/Term2/capstone/arvato_data/Udacity_AZDIAS_052018.csv', sep=';', dtype=str)
# customers = pd.read_csv('../../data/Term2/capstone/arvato_data/Udacity_CUSTOMERS_052018.csv', sep=';', dtype=str)
azdias = pd.read_csv('Udacity_AZDIAS_052018.csv', sep=';', dtype=str)
customers = pd.read_csv('Udacity_CUSTOMERS_052018.csv', sep=';', dtype=str)

## Part 2: Supervised Learning Model

Now that you've found which parts of the population are more likely to be customers of the mail-order company, it's time to build a prediction model. Each of the rows in the "MAILOUT" data files represents an individual that was targeted for a mailout campaign. Ideally, we should be able to use the demographic information from each individual to decide whether or not it will be worth it to include that person in the campaign.

The "MAILOUT" data has been split into two approximately equal parts, each with almost 43 000 data rows. In this part, you can verify your model with the "TRAIN" partition, which includes a column, "RESPONSE", that states whether or not a person became a customer of the company following the campaign. In the next part, you'll need to create predictions on the "TEST" partition, where the "RESPONSE" column has been withheld.

In [6]:
# training = pd.read_csv('../../data/Term2/capstone/arvato_data/Udacity_MAILOUT_052018_TRAIN.csv', sep=';')
training = pd.read_csv('Udacity_MAILOUT_052018_TRAIN.csv', sep=';', dtype="str")

In [7]:
training.head()

Unnamed: 0,LNR,AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTER_KIND1,ALTER_KIND2,ALTER_KIND3,ALTER_KIND4,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,...,VK_DHT4A,VK_DISTANZ,VK_ZG11,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,RESPONSE,ANREDE_KZ,ALTERSKATEGORIE_GROB
0,1763,2,1,8,,,,,8,15,...,5,2,1,6.0,9,3,3,0,2,4
1,1771,1,4,13,,,,,13,1,...,1,2,1,4.0,9,7,1,0,2,3
2,1776,1,1,9,,,,,7,0,...,6,4,2,,9,2,3,0,1,4
3,1460,2,1,6,,,,,6,4,...,8,11,11,6.0,9,1,3,0,2,4
4,1783,2,1,9,,,,,9,53,...,2,2,1,6.0,9,3,3,0,1,3


In [8]:
azdias_cleaned = azdias.copy()

In [9]:
feat_info = pd.read_csv("features.csv")

In [10]:
feat_info.head()

Unnamed: 0,attribute,information_level,type,missing_or_unknown,Comment
0,AGER_TYP,person,categorical,"[-1,0]",
1,ALTER_HH,household,interval,[0],
2,ALTERSKATEGORIE_GROB,person,ordinal,"[-1,0,9]",
3,ANREDE_KZ,person,categorical,"[-1,0]",
4,ANZ_HAUSHALTE_AKTIV,building,numeric,[0],


In [11]:
for index, row in feat_info.iterrows():
    attribute, information_level, var_type, missing, comment = row
    if attribute in azdias_cleaned.columns:
        values = missing.replace("[","").replace("]","").split(",")
        replacement = {}
        for value in values:
            value = value.strip()
            replacement[value] = None
        azdias_cleaned.loc[:, attribute].replace(replacement, inplace=True)

In [12]:
def clean_sl_data(df, azdias_cleaned=azdias_cleaned):
    """
    Cleans the data frame and performs necessary replacements and transformations.
    Input:
    - df: DataFrame to be cleaned
    Output:
    - df: cleaned and transformed DataFrame
    """
    df = df.copy()
    cat_cols = []
    num_cols = []
    for index, row in feat_info.iterrows():
        attribute, information_level, var_type, missing, comment = row
        if var_type in ["interval", "categorical"]:
            cat_cols.append(attribute)
        elif var_type in ["ordinal", "numeric"]:
            num_cols.append(attribute)
        if attribute in df.columns:
            values = missing.replace("[","").replace("]","").split(",")
            replacement = {}
            for value in values:
                value = value.strip()
                replacement[value] = None
            df.loc[:, attribute].replace(replacement, inplace=True)
#     df.replace({"-1": None, 'X': None, 'XX': None}, inplace=True)
    recode = ['D19_BANKEN_DATUM', 'D19_BANKEN_OFFLINE_DATUM',
       'D19_BANKEN_ONLINE_DATUM', 'D19_GESAMT_DATUM',
       'D19_GESAMT_OFFLINE_DATUM', 'D19_GESAMT_ONLINE_DATUM',
       'D19_TELKO_DATUM', 'D19_TELKO_OFFLINE_DATUM',
       'D19_TELKO_ONLINE_DATUM', 'D19_VERSAND_DATUM',
       'D19_VERSAND_OFFLINE_DATUM', 'D19_VERSAND_ONLINE_DATUM',
       'D19_VERSI_DATUM', 'D19_VERSI_OFFLINE_DATUM',
       'D19_VERSI_ONLINE_DATUM']
    df[recode] = df[recode].replace("10", "0")
    
#     df_for_cls = df[common_cols].copy()

    to_drop = ["LNR", 'AGER_TYP', 'ALTER_HH', 'ALTER_KIND1', 'ALTER_KIND2', 'ALTER_KIND3',
       'ALTER_KIND4', 'EXTSEL992', 'GEBURTSJAHR', 'HH_DELTA_FLAG',
       'KBA05_BAUMAX', 'KK_KUNDENTYP', 'KKK', 'REGIOTYP', 'TITEL_KZ', 'CAMEO_DEU_2015',
       'W_KEIT_KIND_HH']
    df_for_cls = df.drop(columns=[col for col in to_drop if col in df.columns])
    df_for_cls = df_for_cls[[col for col in azdias_cleaned.columns if col in df_for_cls.columns]]
    for col in df_for_cls.columns:
        df_for_cls.loc[:, col] = df_for_cls[col].fillna(value=azdias_cleaned[col].mode()[0])   
    numbers = [str(x) for x in range(100)]
    
    for col in df_for_cls.columns:
        level = 0
        for value in df_for_cls[col].unique():
            if value not in numbers:
                df_for_cls.loc[df_for_cls[col] == value, col] = level
                level += 1

    df_for_cls = df_for_cls.astype(float)
    return df_for_cls

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, input_array, y=None):
        return self
    
    def transform(self, input_array, y=None):
        return clean_sl_data(df=input_array)

In [14]:
X = training.drop(columns=["RESPONSE"])
Y = training["RESPONSE"].astype(int)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state=42)

In [15]:
clf_pipeline = Pipeline([   
            ("cleaner", DataCleaner()),
            ("clf", DecisionTreeClassifier())
    
])
param_grid = {              
              'clf__max_depth': [10, None],
            'clf__min_samples_leaf': [1, 3]
            }
cv_pipeline = GridSearchCV(estimator=clf_pipeline, param_grid=param_grid, scoring='roc_auc', cv=2)

In [None]:
cv_pipeline.fit(X_train, Y_train)

In [None]:
print(cv_pipeline.best_score_)
print(cv_pipeline.best_estimator_)

In [None]:
Y_pred_proba = cv_pipeline.predict_proba(X_test)                 
auc = roc_auc_score(Y_test, Y_pred_proba[:, 1])
auc

### Refinement

Although Decision Tree Classifier serves as good starting point in choosing the classifier, there is a significant room of improvement in the ROC AUC with more sophisticated algorithm. As our current dataset is highly imbalanced, it is a shortcoming of Decision Tree Classifier that a single DTC can't handle highly imbalanced datasets very well.

I decided to explore an ensemble learning classifier named LightGBM based on Gradient Boosting algorithm, which is recently become widely accessible to scientific community through open source library called LightGBM made available by Microsoft. It can take use of GPU for faster training time and it can also handle imbalanced datasets like the current one very well.

In [275]:
lgbm_clf = lgb.LGBMClassifier(objective='binary', metric='auc', random_state=42)
lgbm_clf.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'metric': 'auc'}

In [369]:
clf_pipeline = Pipeline([   
            ("cleaner", DataCleaner()),
            ("clf", lgbm_clf)
    
])
param_grid = {'clf__boosting_type': ['gbdt', 'dart'],              
              'clf__num_iterations': [200],
              'clf__num_leaves': [65],
                'clf__learning_rate': [0.01],}
cv_pipeline = GridSearchCV(estimator=clf_pipeline, param_grid=param_grid, scoring='roc_auc', cv=2)

In [407]:
import pickle
pickle.dump(cv_pipeline.best_estimator_, open('lgbm_model.pkl', 'wb'))
pickle.dump(cv_pipeline, open('best_pipeline.pkl', 'wb'))

In [389]:
cv_pipeline.fit(X_train, Y_train)



GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cleaner', DataCleaner()), ('clf', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        metric='auc', min_child_samples=20, min_child_weight=0.001,
        ...  reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__learning_rate': [0.01], 'clf__num_iterations': [200], 'clf__boosting_type': ['gbdt'], 'clf__num_leaves': [62], 'clf__random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [390]:
print(cv_pipeline.best_score_)
print(cv_pipeline.best_estimator_)

0.728555132683
Pipeline(memory=None,
     steps=[('cleaner', DataCleaner()), ('clf', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.01, max_depth=-1,
        metric='auc', min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=100...0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0))])


In [395]:
Y_pred = cv_pipeline.predict(X_test)

In [396]:
from sklearn.metrics import f1_score

In [397]:
pd.DataFrame(Y_pred).loc[:, 0].value_counts()

0    10741
Name: 0, dtype: int64

In [398]:
Y_test.value_counts()

0    10594
1      147
Name: RESPONSE, dtype: int64

In [399]:
f1_score(y_true=Y_test, y_pred=Y_pred)

  'precision', 'predicted', average, warn_for)


0.0

In [400]:
Y_pred_proba = cv_pipeline.predict_proba(X_test)                 
auc = roc_auc_score(Y_test, Y_pred_proba[:, 1])
auc

0.74499299436595479

### Robustness

It's clear that LightGBM is giving impressive performance on local test in terms of high value of ROC AUC. To verify the robustness of this algorithm to be fit for this application, we can do cross validation test on a little larger number of splits and on more independent datasets. We can also increase the number of parameters against which we test this algorithm using GridSearch to even further improve its performance by finding the best values for hyper parameters. We can further assure robustness by checking that the performance of the LightGBM remains relatively high, no matter what value of `random_state` parameter is set. 

### Improvements

The training time for LightGBM is longer on CPU. It can be expedited using GPU optimized version of the library by compiling and building it using graphics card related libraries and build options. Other non-linear classifiers like SVM and Neural Netowrks can be tested to see if they provide relative performance.

## Part 3: Kaggle Competition

Now that you've created a model to predict which individuals are most likely to respond to a mailout campaign, it's time to test that model in competition through Kaggle. If you click on the link [here](http://www.kaggle.com/t/21e6d45d4c574c7fa2d868f0e8c83140), you'll be taken to the competition page where, if you have a Kaggle account, you can enter. If you're one of the top performers, you may have the chance to be contacted by a hiring manager from Arvato or Bertelsmann for an interview!

Your entry to the competition should be a CSV file with two columns. The first column should be a copy of "LNR", which acts as an ID number for each individual in the "TEST" partition. The second column, "RESPONSE", should be some measure of how likely each individual became a customer – this might not be a straightforward probability. As you should have found in Part 2, there is a large output class imbalance, where most individuals did not respond to the mailout. Thus, predicting individual classes and using accuracy does not seem to be an appropriate performance evaluation method. Instead, the competition will be using AUC to evaluate performance. The exact values of the "RESPONSE" column do not matter as much: only that the higher values try to capture as many of the actual customers as possible, early in the ROC curve sweep.

In [410]:
mailout_test = pd.read_csv('../../data/Term2/capstone/arvato_data/Udacity_MAILOUT_052018_TEST.csv', sep=';', dtype='str')

In [412]:
Y_pred_proba_final = cv_pipeline.predict_proba(mailout_test)

In [417]:
kaggle = pd.DataFrame({"LNR": list(mailout_test["LNR"]), "RESPONSE": list(Y_pred_proba_final[:, 1])})
kaggle.to_csv("kaggle1.csv", index=False)

In [421]:
pd.read_csv("kaggle1.csv").head()

Unnamed: 0,LNR,RESPONSE
0,1754,0.013365
1,1770,0.012894
2,1465,0.003863
3,1470,0.003685
4,1478,0.009237
