In [None]:
# Importing libraries:
import joblib
from .selected_column_helper import *



# magic word for producing visualizations in notebook
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Reading the general population and the mailout training data sets:
azdias = pd.read_csv('../data/azdias_2.csv', sep=';')
# customers=pd.read_csv('../data/customers.csv',sep=';')
mailout_train = pd.read_csv('../data/mail_train_1.csv', sep=';')


def _load_pickled_model(path):
    '''
    It loads a pickled object from disk and closes the file afterwards.

    Input:
    path: path of the pickle file.

    Output:
    the unpickled object.
    '''
    # The context manager guarantees the file handle is released even if
    # unpickling fails (a bare pickle.load(open(...)) leaves it open).
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


# NOTE(security): unpickling executes arbitrary code; only load model files
# that were produced by this project.
gbc_clf = _load_pickled_model("gbc_clf.pkl")
gbc_smote_clf = _load_pickled_model("gbc_smote_clf.pkl")
gbc_pca_clf = _load_pickled_model("gbc_pca_clf.pkl")
gbc_pca2_clf = _load_pickled_model("gbc_pca2_clf.pkl")
xgbc_bayes_clf = _load_pickled_model("xgbc_bayes_clf.pkl")
lgbm_bayes_clf = _load_pickled_model("lgbm_bayes_clf.pkl")



In [None]:
# Overriding the type ('cat' or 'num') registered for a few columns.
# The overrides are applied to a copy, so dtypes_dict itself is left untouched:
new_dtypes_dict = dtypes_dict.copy()
new_dtypes_dict.update({
    'AGER_TYP': 'cat',
    'D19_GESAMT_ANZ_24': 'num',
    'D19_GESAMT_DATUM': 'num',
    'D19_GESAMT_OFFLINE_DATUM': 'num',
    'D19_GESAMT_ONLINE_DATUM': 'num',
    'D19_KONSUMTYP': 'cat',
    'D19_KONSUMTYP_MAX': 'num',
    'D19_SONSTIGE': 'num',
    'D19_SOZIALES': 'num',
    'D19_VERSAND_DATUM': 'num',
    'D19_VERSAND_OFFLINE_DATUM': 'num',
    'D19_VOLLSORTIMENT': 'num',
    'EXTSEL992': 'num',
    'GEBURTSJAHR': 'num',
})

In [None]:
# Creating a list of the columns whose proportion of nan values reaches the threshold:
nan_threshold = 0.35 # 35%

# Nan proportion per column, using general population dataframe (more observations):
var_nan_prop = azdias.isnull().mean()

# List of column names. A boolean mask is used instead of var_nan_prop[i]:
# integer keys on a label-indexed Series are treated as positions only through
# a deprecated fallback, which stops working in newer pandas versions.
nan_list = var_nan_prop[var_nan_prop >= nan_threshold].index.tolist()

print('{} columns with more than {}% of nan values.'.format(len(nan_list), nan_threshold*100))


In [None]:
# Creating a list of the columns whose proportion of nan values reaches the threshold:
new_nan_threshold = 0.5 # 50%

# Nan proportion per column, using the mailout training dataframe:
var_nan_prop = mailout_train.isnull().mean()

# List of column names. A boolean mask is used instead of var_nan_prop[i]:
# integer keys on a label-indexed Series are treated as positions only through
# a deprecated fallback, which stops working in newer pandas versions.
new_nan_list = var_nan_prop[var_nan_prop >= new_nan_threshold].index.tolist()

print('{} columns with more than {}% of nan values.'.format(len(new_nan_list), new_nan_threshold*100))


In [None]:
# Creating function that delete the columns listed in nan_list:
def eliminate_nan_columns(df, nan_cols = nan_list):
    '''
    It removes the columns named in nan_cols from the dataframe.

    The dataframe is modified in place; the same object is returned so the
    call can be used in an assignment.

    Inputs:
    df: dataframe to be cleaned;
    nan_cols: list of column names to be removed (defaults to nan_list as it
    was when this function was defined).

    Output:
    df: the same dataframe, without the nan_cols columns.
    '''
    # Removing the listed columns directly on the received dataframe:
    df.drop(labels = nan_cols, axis = 'columns', inplace = True)
    return df




In [None]:
def supervised_data_transformation(df, test_set):
    '''
    It runs every data transformation step that precedes the nan treatment.

    Input:
    df: original dataframe;
    test_set: boolean, True when df is the test set and False when it is the
    train set.

    Output:
    df: transformed dataframe.
    '''
    # Columns dropped for both sets:
    drop_cols = ['LNR', 'EINGEFUEGT_AM', 'EINGEZOGENAM_HH_JAHR']

    if test_set:
        # Joining nan and unknown values.
        # NOTE(review): this step only runs for the test set - confirm the train
        # set reaches this function with that step already applied.
        df = join_nan_with_unknown(df)
    else:
        # The train set additionally carries the target column:
        drop_cols.append('RESPONSE')

    # Eliminating the columns listed in new_nan_list:
    df = eliminate_nan_columns(df, nan_cols = new_nan_list)

    # Dropping useless columns:
    df = df.drop(columns = drop_cols)

    # Changing column dtypes:
    df = change_dtypes(df, dtypes = new_dtypes_dict)

    # Feature engineering, steps I and II:
    df = feature_engineer(df)
    return transform_cameo_deu(df)


In [None]:
# Defining CAMEO_DEU_2015 transformation:
def transform_cameo_deu(df):
    '''
    It simplifies CAMEO_DEU_2015 classes according to the representation pattern presented in the comparison
    between customers and the general population.

    The classes are grouped into four representation groups (0 to 3). Any value
    that belongs to no group (including nan) becomes nan.

    Input:
    df: dataframe containing the CAMEO_DEU_2015 column. It is modified in place.

    Output:
    df: the same dataframe, with the categorical CAMEO_DEU_REPRESENTATION column
    added and the original CAMEO_DEU_2015 column removed.
    '''
    feat = 'CAMEO_DEU_2015'

    # Classes that make up each representation group:
    group_classes = {
        0: ['6A', '7A', '7B', '7C', '8A', '8B', '8C', '8D', '9A', '9B', '9C', '9D'],
        1: ['5A', '5B', '5C', '6B', '7D'],
        2: ['3A', '3B', '3C', '4B', '4C', '4E', '5E', '5F', '6C', '6D', '6E', '6F', '7E', '9E'],
        3: ['1A', '1B', '1C', '1D', '2A', '2B', '2C', '2D', '3D', '4A', '5D'],
    }

    # Inverting the table into a class -> group lookup:
    class_to_group = {cameo_class: group
                      for group, classes in group_classes.items()
                      for cameo_class in classes}

    # Creating new column with a single vectorised lookup instead of several
    # .iloc accesses per row. Casting to object first keeps the result the same
    # when the source column is categorical, and to_numpy() keeps the assignment
    # positional, as in the former row-by-row list:
    df['CAMEO_DEU_REPRESENTATION'] = df[feat].astype(object).map(class_to_group).to_numpy()

    # Transforming the column to categorical type:
    df['CAMEO_DEU_REPRESENTATION'] = df['CAMEO_DEU_REPRESENTATION'].astype('category')

    # Deleting original column:
    df.drop(columns = [feat], inplace = True)

    return df

## Part 3: Kaggle Competition<a name="part3"></a>

Now that you've created a model to predict which individuals are most likely to respond to a mailout campaign, it's time to test that model in competition through Kaggle. If you click on the link [here](http://www.kaggle.com/t/21e6d45d4c574c7fa2d868f0e8c83140), you'll be taken to the competition page where, if you have a Kaggle account, you can enter. If you're one of the top performers, you may have the chance to be contacted by a hiring manager from Arvato or Bertelsmann for an interview!

Your entry to the competition should be a CSV file with two columns. The first column should be a copy of "LNR", which acts as an ID number for each individual in the "TEST" partition. The second column, "RESPONSE", should be some measure of how likely each individual is to become a customer – this might not be a straightforward probability. As you should have found in Part 2, there is a large output class imbalance, where most individuals did not respond to the mailout. Thus, predicting individual classes and using accuracy does not seem to be an appropriate performance evaluation method. Instead, the competition will be using AUC to evaluate performance. The exact values of the "RESPONSE" column do not matter as much: only that the higher values try to capture as many of the actual customers as possible, early in the ROC curve sweep.

In [None]:
# Load in the test data (semicolon-separated csv):
mailout_test = pd.read_csv('../data/mail_test.csv', sep=';')

# Alternative, currently disabled: loading a previously saved test set with joblib:
#mailout_test = joblib.load('test')

mailout_test.head()

In [None]:
# Saving the LNR ids before the transformation below drops that column:
lnr = list(mailout_test.LNR.values)

# Applying data transformations on test set:
mailout_test = supervised_data_transformation(mailout_test, test_set = True)

mailout_test.head()


In [None]:
# Dropping from the test set every column that does not exist in the train set:
train_cols = mailout_train.columns.values.tolist()
test_cols = mailout_test.columns.values.tolist()

# Columns present in test but absent from train:
set_difference = set(test_cols).difference(train_cols)
list_difference = [*set_difference]

mailout_test = mailout_test.drop(list_difference, axis = 'columns')

### 3.1 Attempt 1: Training on Unbalanced Data<a name="p1"></a>


# NOTE(review): this cell re-trains the model and overwrites the gbc_clf object
# loaded from "gbc_clf.pkl" at the top of the notebook. `preproc` and `y` are not
# defined in the visible part of this notebook - confirm they are available
# (e.g. through the helper import) before running it.

# Defining machine learning pipeline (preprocessing followed by the classifier):
gbc_ml_pipe = Pipeline(steps = [
    ('preprocessing', preproc),
    ('clf', GradientBoostingClassifier(learning_rate = 0.1,
                                       n_estimators = 150,
                                       random_state = 301))
])

# Setting parameters to be tested.
# 'sqrt' is used instead of 'auto': for GradientBoostingClassifier both mean
# sqrt(n_features), but 'auto' is deprecated and rejected by newer scikit-learn
# releases.
params = {'clf__min_samples_split': [2, 4],
          'clf__max_depth': [3, 5],
          'clf__max_features': [None, 'sqrt']
}

# Grid search + ML pipeline, scored by area under the ROC curve:
gbc_clf = GridSearchCV(gbc_ml_pipe, param_grid = params, scoring = 'roc_auc', verbose = 2)

# Training model:
gbc_clf.fit(mailout_train, y)

In [None]:
# Predicting on test data with the gradient boosting model:
y_gbc_pred = gbc_clf.predict_proba(mailout_test)

In [None]:
# Building the prediction dataframe in one step: the saved ids go to LNR and
# the second column of the predictions goes to RESPONSE:
gbc_pred_df = pd.DataFrame({'LNR': lnr, 'RESPONSE': y_gbc_pred[:, 1]})

gbc_pred_df.head()

In [None]:
# Saving the predictions as a csv file (header row, no index column):
gbc_pred_df.to_csv('../pred/gbc_pred.csv', header = True, index = False)

# KAGGLE SCORE: 0.79488

Judging by the `Kaggle` leaderboard, this first approach can be considered an average model, ranking among the **top 150**.

### 3.2 Attempt 2: Training on Balanced Data<a name="p2"></a>

In [None]:
# Predicting on test data with the model trained on balanced data:
y_gbc_smote_pred = gbc_smote_clf.predict_proba(mailout_test)

In [None]:
# Building the prediction dataframe in one step: the saved ids go to LNR and
# the second column of the predictions goes to RESPONSE:
gbc_smote_pred_df = pd.DataFrame({'LNR': lnr, 'RESPONSE': y_gbc_smote_pred[:, 1]})

gbc_smote_pred_df.head()

In [None]:
# Saving the predictions as a csv file (header row, no index column):
gbc_smote_pred_df.to_csv('../pred/gbc_smote_pred.csv', header = True, index = False)

# KAGGLE SCORE: 0.70165

### 3.3 Attempt 3: Information Level and PCA Transformation<a name="p3"></a>

In [None]:
# Predicting on test data, restricted to the columns in selected_columns
# (presumably provided by the helper import - verify):
y_gbc_pca_pred = gbc_pca_clf.predict_proba(mailout_test[selected_columns])

In [None]:
# Building the prediction dataframe in one step: the saved ids go to LNR and
# the second column of the predictions goes to RESPONSE:
gbc_pca_pred_df = pd.DataFrame({'LNR': lnr, 'RESPONSE': y_gbc_pca_pred[:, 1]})

gbc_pca_pred_df.head()

In [None]:
# Saving the predictions as a csv file (header row, no index column):
gbc_pca_pred_df.to_csv('../pred/gbc_pca_pred.csv', header = True, index = False)

# KAGGLE SCORE: 0.79168

### 3.4 Attempt 4: PCA Transformation<a name="p4"></a>

In [None]:
# Predicting on test data with the second PCA-based model:
y_gbc_pca2_pred = gbc_pca2_clf.predict_proba(mailout_test)

In [None]:
# Building the prediction dataframe in one step: the saved ids go to LNR and
# the second column of the predictions goes to RESPONSE:
gbc_pca2_pred_df = pd.DataFrame({'LNR': lnr, 'RESPONSE': y_gbc_pca2_pred[:, 1]})

gbc_pca2_pred_df.head()

In [None]:
# Saving the predictions as a csv file (header row, no index column):
gbc_pca2_pred_df.to_csv('../pred/gbc_pca2_pred.csv', header = True, index = False)

# KAGGLE SCORE: 0.71402

### 3.5 Attempt 5: XGBoost Classifier and Bayesian Optimization<a name="p5"></a>

In [None]:
# Predicting on test data with the XGBoost model:
y_xgbc_bayes_pred = xgbc_bayes_clf.predict_proba(mailout_test)

In [None]:
# Building the prediction dataframe in one step: the saved ids go to LNR and
# the second column of the predictions goes to RESPONSE:
xgbc_bayes_pred_df = pd.DataFrame({'LNR': lnr, 'RESPONSE': y_xgbc_bayes_pred[:, 1]})

xgbc_bayes_pred_df.head()

In [None]:
# Saving the predictions as a csv file (header row, no index column):
xgbc_bayes_pred_df.to_csv('../pred/xgbc_bayes_pred.csv', header = True, index = False)

# KAGGLE SCORE: 0.80492

Although this model's `roc_auc` score is comparable to the first model's, its predictions on the test data represent a clear improvement.

This score positions the model among the **top 40** in the Kaggle rank among 349 data scientists.

![best_score.png](attachment:best_score.png)

### 3.6 LightGBM and Bayesian Optimization<a name="p6"></a>

In [None]:
# Predicting on test data with the LightGBM model:
y_lgbm_bayes_pred = lgbm_bayes_clf.predict_proba(mailout_test)

In [None]:
# Building the prediction dataframe in one step: the saved ids go to LNR and
# the second column of the predictions goes to RESPONSE:
lgbm_bayes_pred_df = pd.DataFrame({'LNR': lnr, 'RESPONSE': y_lgbm_bayes_pred[:, 1]})

lgbm_bayes_pred_df.head()

In [None]:
# Saving the predictions as a csv file (header row, no index column):
lgbm_bayes_pred_df.to_csv('../pred/lgbm_bayes_pred.csv', header = True, index = False)

# KAGGLE SCORE: 0.79743

