# Prediction Challenge

## Load Modules, Import Dataset

In [1]:
import sys
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from scipy.stats import uniform
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_predict, GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer, roc_auc_score, classification_report
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer, RobustScaler, KBinsDiscretizer, StandardScaler
from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.datasets import make_classification

df = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/dataset.csv', parse_dates=['date'])

### Make additional categorical feature out of days_since_last_contact

In [2]:
df['days_since_last_contact'] = np.where(df['days_since_last_contact'].between(1,5), 1, df['days_since_last_contact'])
df['days_since_last_contact'] = df['days_since_last_contact'].mask(df['days_since_last_contact'] >= 6, 2)
df['days_since_last_contact'] = df['days_since_last_contact'].mask(df['days_since_last_contact'] == -1, 999)

df['days_since_last_contact_cat'] = np.where(df['days_since_last_contact'].between(1,5), 1, df['days_since_last_contact'])
df['days_since_last_contact_cat'] = df['days_since_last_contact'].mask(df['days_since_last_contact'] >= 6, 2)
df['days_since_last_contact_cat'] = df['days_since_last_contact'].mask(df['days_since_last_contact'] == -1, 999)
df['days_since_last_contact_cat'] = np.where(df['days_since_last_contact_cat'].between(1,5), 1, df['days_since_last_contact'])
df['days_since_last_contact_cat'] = df['days_since_last_contact_cat'].mask(df['days_since_last_contact'] >= 6, 2)
df['days_since_last_contact_cat'] = df['days_since_last_contact_cat'].mask(df['days_since_last_contact'] == -1, 999)

df['n_contacts_before'] = np.where(df['n_contacts_before'].between(2,4), 1, df['n_contacts_before'])
df['n_contacts_before'] = df['n_contacts_before'].mask(df['n_contacts_before'] == 0, 0)
df['n_contacts_before'] = np.where(df['n_contacts_before'].between(5,6), 2, df['n_contacts_before'])
df['n_contacts_before'] = df['n_contacts_before'].mask(df['n_contacts_before'] == 7, 3)

df['previous_conversion_bin'] = df['previous_conversion'].mask(df['previous_conversion'] == "Inexistent", 0)
df['previous_conversion_bin'] = df['previous_conversion'].mask(df['previous_conversion'] == "Failed", 2)
df['previous_conversion_bin'] = df['previous_conversion'].mask(df['previous_conversion'] == "Successful", 1)
df['previous_conversion_bin'] = df['previous_conversion_bin'].mask(df['previous_conversion'] == "Inexistent", 0)
df['previous_conversion_bin'] = df['previous_conversion_bin'].mask(df['previous_conversion'] == "Failed", 2)
df['previous_conversion_bin'] = df['previous_conversion_bin'].mask(df['previous_conversion'] == "Successful", 1)

In [3]:
df["days_since_last_contact_cat"] = df["days_since_last_contact_cat"].astype('object')

In [4]:
df["n_contacts_before"] = df["n_contacts_before"].astype('object')

## Featuretool for finding features

In [5]:
import featuretools as ft
import featuretools.variable_types as vtypes

In [6]:
'''pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
ft.primitives.list_primitives()'''

"pd.set_option('display.max_columns', None)\npd.set_option('display.max_rows', None)\nft.primitives.list_primitives()"

In [7]:
es = ft.EntitySet(id = 'dataset')

variable_types = {"identifier": vtypes.Index, "age":vtypes.Ordinal ,"marital_status": vtypes.Categorical, 
                                                "education": vtypes.Categorical, "job": vtypes.Categorical, "credit_default": vtypes.Boolean, 
                                                "housing_loan": vtypes.Boolean, "personal_loan": vtypes.Boolean, "communication_type": vtypes.Categorical,
                                                "n_contacts_before": vtypes.variable.Ordinal, "previous_conversion": vtypes.Categorical, 
                                                "duration": vtypes.Datetime, "success": vtypes.Boolean, "previous_conversion_bin": vtypes.Categorical,
                                                "days_since_last_contact_cat": vtypes.Categorical}

es = es.entity_from_dataframe(entity_id='dataset', 
                              dataframe = df, 
                              index = 'identifier', 
                              time_index = 'date')

In [8]:
es = es.normalize_entity(base_entity_id='dataset', new_entity_id='days_since_last_contact', index='days_since_last_contact')
es = es.normalize_entity(base_entity_id='dataset', new_entity_id='previous_conversion', index='previous_conversion')

In [9]:
df, df_names = ft.dfs(entityset=es, 
    target_entity = 'dataset', 
    max_depth = 2, 
    verbose = 3, 
    n_jobs = 1)

Built 114 features
Elapsed: 00:01 | Progress: 100%|██████████


In [10]:
# Threshold for removing correlated variables 
threshold = 0.7  # Absolute value correlation matrix 

corr_matrix = df.corr().abs() 
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Select columns with correlations above threshold 
collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]
df = df.drop(columns = collinear_features)

In [11]:
df = df.loc[:,df.apply(pd.Series.nunique) != 1]

In [12]:
df.shape

(37069, 26)

## Regular Oversampling of minority class
Here the code samples our miniority class of "Yes" cases up three times to even out some of the unbalanced dataset to reduce the overall failure-proof quality of the learning.

In [13]:
df['success'].value_counts()

No     32893
Yes     4176
Name: success, dtype: int64

In [14]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df.success=='No']
df_minority = df[df.success=='Yes']
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     
                                 n_samples=15000,   
                                 random_state=1909) 
 
# Combine majority class with upsampled minority class
df = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df.success.value_counts()

No     32893
Yes    15000
Name: success, dtype: int64

## X,y definition

In [15]:
df.drop('duration', axis=1)
X, y = df.drop('success', axis=1), df.success

## Splitting into Train and Test data


In [16]:
skf = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1909)
skf.get_n_splits(X, y)

#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1909, stratify=y, shuffle=True, test_size=.3)

5

In [17]:
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]    

TRAIN: [27010 25435 37847 ... 23406 32851 45873] TEST: [23991 32607 32400 ...   465 24879 25828]
TRAIN: [31298  7534  7322 ... 10454 39772 46240] TEST: [11833 21328 16549 ... 18170 43726  8489]
TRAIN: [40557  3719  7896 ... 29712 14488 39656] TEST: [ 6770 15107 26796 ... 14578  1051 24131]
TRAIN: [ 3298 17413  5743 ... 45201  5288 17684] TEST: [34871 15574 13776 ... 22869  3089 35656]
TRAIN: [ 7547 39047  6518 ... 21288  4192  9584] TEST: [26373  5786  9308 ... 36066 27950 23459]


In [18]:
print(X.shape)
print(y.shape)
print(X_test.shape)

(47893, 25)
(47893,)
(14368, 25)


In [19]:
X.dtypes.groupby(X.dtypes).size()

int64       9
float64     2
object     14
dtype: int64

## Feature Engineering

In [20]:
def ft_pcontacted_last_campaign(X):
    pcontacted = ~(X == 999) & (X == -1)
    return pcontacted.values.reshape(-1,1)

def ft_pcampaign(X):
    pcampaign = ~(X == 'Inexistent')
    return pcampaign.values.reshape(-1,1)

def ft_previous(X):
    previous = X.astype(str)
    return previous.values.reshape(-1,1)

def ft_campaign_gte10(X):
    campaign_gte10 = X >= 10
    return campaign_gte10.values.reshape(-1,1)

def ft_campaign_to_previous(X):
    ratio = lambda x: 0 if (X.n_contacts_before).all() == 0 else (X.n_contacts_campaign).all() / (X.n_contacts_before).all()
    campaign_to_previous = X[['n_contacts_campaign', 'n_contacts_before']].apply(ratio, axis=1)
    return campaign_to_previous.values.reshape(-1,1)

add_pcontacted_last_campaign = FunctionTransformer(ft_pcontacted_last_campaign, validate=False)
add_pcampaign = FunctionTransformer(ft_pcampaign, validate=False)
add_previous = FunctionTransformer(ft_previous, validate=False)
add_campaign_gte10 = FunctionTransformer(ft_campaign_gte10, validate=False)
add_campaign_to_previous = FunctionTransformer(ft_campaign_to_previous, validate=False)

cat_features = [
        ('ft_pcontacted_last_campaign', add_pcontacted_last_campaign, 'days_since_last_contact'),
        ('ft_pcampaign', add_pcampaign, 'previous_conversion'),
        ('ft_previous', add_previous, 'n_contacts_before'),
        ('ft_campaign_gte10', add_campaign_gte10, 'n_contacts_campaign')]

cat_ct = ColumnTransformer(cat_features)

cat_pipeline = Pipeline([
  ('cat_ct', cat_ct),
  ('ohe', OneHotEncoder(handle_unknown='ignore'))
])
cat_pipeline.fit(X_train)

Pipeline(memory=None,
         steps=[('cat_ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ft_pcontacted_last_campaign',
                                                  FunctionTransformer(accept_sparse=False,
                                                                      check_inverse=True,
                                                                      func=<function ft_pcontacted_last_campaign at 0x7f3183262e50>,
                                                                      inv_kw_args=None,
                                                                      inverse_func=None,
                                                                      kw_args=None,
                                                                      validate...
                             

In [21]:
binning_pipeline = Pipeline([
  ('log', FunctionTransformer(np.log, validate=True)),
  ('kbins', KBinsDiscretizer())
])

new_num_features = [
    ('ft_campaign_to_previous', FunctionTransformer(ft_campaign_to_previous, validate=False))
]

num_union = FeatureUnion(new_num_features)

num_pipeline = Pipeline([
    ('num_union', num_union),
    ('scaler', RobustScaler())
])

age_campaign_ct = ColumnTransformer([
  ('age_pipeline', clone(binning_pipeline), ['age']),
  ('campaign_pipeline', clone(binning_pipeline), ['n_contacts_campaign'])
])

In [22]:
num_pipeline.fit(X_train)
cat_ct.fit_transform(X_train).shape

(33525, 4)

In [23]:
feature_names = [
  # Don't incclude the last entry in the `named_transformers_` list since
  # it's the `remainder` parameter for the ColumnTransformer
  ['%s_%s' % (name, value) for value in values] for 
    name, values in list(zip(list(cat_ct.named_transformers_.keys())[1:], cat_pipeline.named_steps['ohe'].categories_))]

cat_feature_names = [name for names in feature_names for name in names]
cat_feature_names

['ft_pcampaign_False',
 'ft_previous_False',
 'ft_previous_True',
 'ft_campaign_gte10_0',
 'ft_campaign_gte10_1',
 'ft_campaign_gte10_2',
 'ft_campaign_gte10_3',
 'remainder_False',
 'remainder_True']

## Pipeline Creation, Classifier LightGBM Implementation

In [24]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [25]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [27]:
ft_union = FeatureUnion([
    ('cat_pipeline', cat_pipeline),
    ('age_campaign_ct', age_campaign_ct),
    ('num_union', num_union),
    ('preprocessor', preprocessor)])

In [28]:
ft_union.fit(X_train)
features = ft_union.transform(X_train)
features.shape



(33525, 82)

In [29]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

# We have chosen the Light GBM Classifier. It's a relative new algorithm that is not only fast but has also a very good acuracy
lgbm = LGBMClassifier(objective='binary', random_state=1909)
smo = SMOTE(random_state=1909)
scorer = make_scorer(f1_score, pos_label='Yes')

# RandomizedSearchCV parameters
params = {
    'classifier__learning_rate': [0.05, 0.1, 0.5],    
    'classifier__boosting' : ['gbdt'],
    'classifier__max_depth' : [-1],
    'classifier__feature_fraction' : [0.7,1.0],
    'classifier__min_gain_to_split' : [0.0,0.01,0.05],
    'classifier__min_data_in_leaf':[60, 65, 70],
    'classifier__metric':['auc'],
    'classifier__max_bin':[240, 245, 250],
    'classifier__num_iterations':[245, 250, 255],
    'classifier__num_leaves':[500, 505, 510],
    'classifier__scale_pos_weight': [1, 100, 1000],
    'sampling__sampling_strategy': [0.4, 0.8,'minority']
    }

rf = Pipeline(steps=[('ft_union', ft_union),
                     ('sampling', smo),             ### SMOTE ###                     
                     ('classifier', lgbm)])

In [30]:
# GridSearch
#grid = GridSearchCV(rf, params, scoring=scorer, verbose=1, cv=5, n_jobs=-1, return_train_score=True)
#grid.fit(X_train, y_train)

# RandomizedSearch
grid = RandomizedSearchCV(rf, params, scoring=scorer, verbose=1, cv=10, n_jobs=-1, n_iter=15, return_train_score=True)
grid.fit(X_train, y_train)

Fitting 10 folds for each of 15 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 16.3min finished


RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('ft_union',
                                              FeatureUnion(n_jobs=None,
                                                           transformer_list=[('cat_pipeline',
                                                                              Pipeline(memory=None,
                                                                                       steps=[('cat_ct',
                                                                                               ColumnTransformer(n_jobs=None,
                                                                                                                 remainder='drop',
                                                                                                                 sparse_threshold=0.3,
                                                                                     

## Evaluation

In [31]:
training_score = grid.cv_results_['mean_train_score'][grid.best_index_] * 100
test_score = grid.cv_results_['mean_test_score'][grid.best_index_] * 100

In [32]:
#grid.cv_results_['mean_fit_time']
#grid.cv_results_['mean_score_time']
grid.cv_results_['mean_train_score']

array([0.99749103, 0.90749455,        nan,        nan,        nan,
              nan, 0.8727955 , 0.84594336, 0.94890299, 0.99076894,
       0.99982013, 0.74113656, 0.8770876 ,        nan, 0.92492088])

In [33]:
f'Mean F1 Score (Training/Test): {training_score:.2f}%/{test_score:.2f}%'

'Mean F1 Score (Training/Test): 99.98%/90.63%'

In [34]:
# importance of each attribute
#print(grid.best_estimator_.named_steps["classifier"].get_fscore())
#grid.best_estimator_.named_steps["classifier"].feature_importances_
#fea_imp_.loc[fea_imp_.fea_imp > 0].sort_values(by=['fea_imp'], ascending = False)

print("\nBest Parameters = " + str(grid.best_estimator_.named_steps['classifier'].get_params))


Best Parameters = <bound method LGBMModel.get_params of LGBMClassifier(boosting='gbdt', boosting_type='gbdt', class_weight=None,
               colsample_bytree=1.0, feature_fraction=1.0,
               importance_type='split', learning_rate=0.5, max_bin=250,
               max_depth=-1, metric='auc', min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=60,
               min_gain_to_split=0.05, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_iterations=250, num_leaves=510,
               objective='binary', random_state=1909, reg_alpha=0.0,
               reg_lambda=0.0, scale_pos_weight=1, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)>


In [35]:
predictors=list(X_train)

print('Accuracy of the GBM on test set: {:.3f}'.format(grid.score(X_test, y_test)))
pred=grid.predict(X_test)
print(classification_report(y_test, pred))

Accuracy of the GBM on test set: 0.915
              precision    recall  f1-score   support

          No       0.99      0.93      0.96      9868
         Yes       0.86      0.98      0.92      4500

    accuracy                           0.94     14368
   macro avg       0.92      0.95      0.94     14368
weighted avg       0.95      0.94      0.94     14368



# Prediction

In [36]:
prediction_dataset = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/prediction-dataset.csv', parse_dates=['date'])

In [37]:
prediction_dataset['days_since_last_contact'] = np.where(prediction_dataset['days_since_last_contact'].between(1,5), 1, prediction_dataset['days_since_last_contact'])
prediction_dataset['days_since_last_contact'] = prediction_dataset['days_since_last_contact'].mask(prediction_dataset['days_since_last_contact'] >= 6, 2)
prediction_dataset['days_since_last_contact'] = prediction_dataset['days_since_last_contact'].mask(prediction_dataset['days_since_last_contact'] == -1, 999)

prediction_dataset['days_since_last_contact_cat'] = np.where(prediction_dataset['days_since_last_contact'].between(1,5), 1, prediction_dataset['days_since_last_contact'])
prediction_dataset['days_since_last_contact_cat'] = prediction_dataset['days_since_last_contact'].mask(prediction_dataset['days_since_last_contact'] >= 6, 2)
prediction_dataset['days_since_last_contact_cat'] = prediction_dataset['days_since_last_contact'].mask(prediction_dataset['days_since_last_contact'] == -1, 999)
prediction_dataset['days_since_last_contact_cat'] = np.where(prediction_dataset['days_since_last_contact_cat'].between(1,5), 1, prediction_dataset['days_since_last_contact'])
prediction_dataset['days_since_last_contact_cat'] = prediction_dataset['days_since_last_contact_cat'].mask(prediction_dataset['days_since_last_contact'] >= 6, 2)
prediction_dataset['days_since_last_contact_cat'] = prediction_dataset['days_since_last_contact_cat'].mask(prediction_dataset['days_since_last_contact'] == -1, 999)

prediction_dataset['n_contacts_before'] = np.where(prediction_dataset['n_contacts_before'].between(2,4), 1, prediction_dataset['n_contacts_before'])
prediction_dataset['n_contacts_before'] = prediction_dataset['n_contacts_before'].mask(prediction_dataset['n_contacts_before'] == 0, 0)
prediction_dataset['n_contacts_before'] = np.where(prediction_dataset['n_contacts_before'].between(5,6), 2, prediction_dataset['n_contacts_before'])
prediction_dataset['n_contacts_before'] = prediction_dataset['n_contacts_before'].mask(prediction_dataset['n_contacts_before'] == 7, 3)

prediction_dataset['previous_conversion_bin'] = prediction_dataset['previous_conversion'].mask(prediction_dataset['previous_conversion'] == "Inexistent", 0)
prediction_dataset['previous_conversion_bin'] = prediction_dataset['previous_conversion'].mask(prediction_dataset['previous_conversion'] == "Failed", 2)
prediction_dataset['previous_conversion_bin'] = prediction_dataset['previous_conversion'].mask(prediction_dataset['previous_conversion'] == "Successful", 1)
prediction_dataset['previous_conversion_bin'] = prediction_dataset['previous_conversion_bin'].mask(prediction_dataset['previous_conversion'] == "Inexistent", 0)
prediction_dataset['previous_conversion_bin'] = prediction_dataset['previous_conversion_bin'].mask(prediction_dataset['previous_conversion'] == "Failed", 2)
prediction_dataset['previous_conversion_bin'] = prediction_dataset['previous_conversion_bin'].mask(prediction_dataset['previous_conversion'] == "Successful", 1)

In [38]:
prediction_dataset["days_since_last_contact_cat"] = prediction_dataset["days_since_last_contact_cat"].astype('object')
prediction_dataset["n_contacts_before"] = prediction_dataset["n_contacts_before"].astype('object')

In [39]:
es = ft.EntitySet(id = 'pred')

variable_types = {"identifier": vtypes.Index, "age":vtypes.Ordinal ,"marital_status": vtypes.Categorical, 
                                                "education": vtypes.Categorical, "job": vtypes.Categorical, "credit_default": vtypes.Boolean, 
                                                "housing_loan": vtypes.Boolean, "personal_loan": vtypes.Boolean, "communication_type": vtypes.Categorical,
                                                "n_contacts_before": vtypes.variable.Ordinal, "previous_conversion": vtypes.Categorical, 
                                                "duration": vtypes.Datetime, "previous_conversion_bin": vtypes.Categorical,
                                                "days_since_last_contact_cat": vtypes.Categorical}

es = es.entity_from_dataframe(entity_id='pred', 
                              dataframe = prediction_dataset, 
                              index = 'identifier', 
                              time_index = 'date')

es = es.normalize_entity(base_entity_id='pred', new_entity_id='days_since_last_contact', index='days_since_last_contact')
es = es.normalize_entity(base_entity_id='pred', new_entity_id='previous_conversion', index='previous_conversion')

prediction_dataset, prediction_dataset_names = ft.dfs(entityset=es,
    target_entity = 'pred',
    max_depth = 2,
    verbose = 3,
    n_jobs = 1)

Built 109 features
Elapsed: 00:00 | Progress: 100%|██████████


In [40]:
prediction_dataset = prediction_dataset.loc[:,prediction_dataset.apply(pd.Series.nunique) != 1]

In [41]:
df_new =pd.DataFrame(prediction_dataset, columns=X.columns)

In [42]:
predictions = grid.best_estimator_.predict(df_new)

# Submission Dataset Preparation

In [44]:
submission = pd.DataFrame(predictions, index=df_new.index, columns=['prediction'])

In [45]:
matrikel_mheichler = '465475'
matrikel_psaustum = '470057'

In [46]:
submission.to_csv(f'./submission-{matrikel_mheichler}-{matrikel_psaustum}.csv', index_label='identifier')