In [None]:
# basic library
import numpy as np
import pandas as pd

# model
from sklearn.ensemble import GradientBoostingClassifier

# sampler
from imblearn.under_sampling import RandomUnderSampler

# custom modules
from utils import set_seed, get_clf_eval, make_submission, record_experimental_results
import preprocessing as pp

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

### Global Setting

In [None]:
# Notebook-wide settings shared by every experiment below.
hparams = dict(
    seed=33,          # base random seed; the ensemble loop offsets it per model
    num_ensemble=30,  # number of models trained in the ensemble experiment
)

In [None]:
# Seed the run through the project's `set_seed` helper (imported from utils),
# using the base seed configured in `hparams`.
# NOTE(review): which random number generators it seeds is not visible here — confirm in utils.
set_seed(hparams['seed'])

In [None]:
# Baseline GradientBoostingClassifier settings; the later experiment variants
# are derived from this dict by overriding individual keys.
# `random_state` is intentionally not part of this dict: the ensemble loop
# passes a per-model seed as a separate keyword argument.
gbm_hparams = dict(
    loss='log_loss',              # loss function to be optimized
    learning_rate=0.1,            # shrinks the contribution of each tree
    n_estimators=100,             # number of boosting stages
    subsample=1.0,                # fraction of samples used to fit each base learner
    criterion='friedman_mse',     # split-quality measure
    min_samples_split=2,          # minimum samples required to split an internal node
    min_samples_leaf=1,           # minimum samples required at a leaf node
    max_depth=3,                  # maximum depth of the individual regression estimators
    min_impurity_decrease=0.0,    # split only if impurity decreases by at least this much
    init=None,                    # estimator used to compute the initial predictions
    max_features=None,            # number of features considered per split
    verbose=0,                    # verbosity of the training output
    max_leaf_nodes=None,          # grow trees best-first up to this many leaves
    warm_start=False,
    validation_fraction=0.1,      # share of training data held out for early stopping
    n_iter_no_change=None,        # early stopping patience (None disables it)
    tol=1e-4,                     # tolerance for early stopping
    ccp_alpha=0.0,                # complexity parameter for minimal cost-complexity pruning
)

### 실험 01: `GradientBoostingClassifier()`

In [None]:
# Load the raw frames, label-encode them, then carve a validation set out of
# the encoded training frame.
tr_data, tt_data = pp.load_data()
tr_encoded, x_tt = pp.label_encoding(tr_data, tt_data)
x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(
    tr_encoded,
    seed=hparams['seed'],
)

In [None]:
# model
# Baseline classifier built from the shared hyper-parameters.
# random_state is pinned explicitly (as the ensemble loop does for its models)
# so the baseline does not depend on whatever global RNG state precedes this cell.
gbm = GradientBoostingClassifier(**gbm_hparams, random_state=hparams['seed'])

In [None]:
# training
# Fit the baseline model on the training split; missing values are replaced
# with 0 before fitting.
# NOTE(review): presumably needed because this estimator rejects NaN inputs —
# confirm for the installed scikit-learn version.
gbm.fit(x_tr.fillna(0), y_tr)

In [None]:
# Score the baseline model on the validation split
# (same NaN -> 0 replacement as used for training).
x_val_filled = x_val.fillna(0)
y_val_pred = gbm.predict(x_val_filled)
get_clf_eval(y_val, y_val_pred)

In [None]:
# test
# Remove the non-feature columns from the encoded test frame.
# errors='ignore' makes this cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of a KeyError.
x_tt = x_tt.drop(columns=['is_converted', 'id'], errors='ignore')

In [None]:
# Predict on the test features (NaN -> 0, as for training) and display the
# sum of the predicted labels.
x_tt_filled = x_tt.fillna(0)
y_test_pred = gbm.predict(x_tt_filled)
y_test_pred.sum()

### 실험 02: `GradientBoostingClassifier()` ensemble

In [None]:
# Variant 02: baseline settings with cost-complexity pruning enabled.
gbm_hparams02 = {**gbm_hparams, 'ccp_alpha': 0.0004}

In [None]:
# Variant 03: baseline settings with deeper trees.
gbm_hparams03 = {**gbm_hparams, 'max_depth': 6}

In [None]:
# Variant 04: variant 03 (max_depth=6) with twice as many boosting stages.
gbm_hparams04 = {**gbm_hparams03, 'n_estimators': 200}

In [None]:
# Variant 05: variant 04 with 400 boosting stages; this is the configuration
# used by the ensemble loop.
gbm_hparams05 = {**gbm_hparams04, 'n_estimators': 400}

In [None]:
# data load & country-name extraction
tr_data, tt_data = pp.load_data()
tr_data, tt_data = pp.extract_country_name(tr_data, tt_data)

# binning: collapse customer_idx into 500-wide integer buckets (0, 1, 2, ...)
start, stop, step = 0, 47501, 500
bins = np.arange(start, stop, step)
labels = list(range(len(bins) - 1))

# include_lowest=True keeps a customer_idx equal to `start` in bucket 0; without
# it pd.cut yields NaN for that value and the int64 cast below raises.
# Values above the last bin edge still become NaN, so the cast fails loudly
# instead of silently mis-bucketing them.
for frame in (tr_data, tt_data):
    frame['customer_idx'] = pd.cut(
        frame['customer_idx'],
        bins=bins,
        labels=labels,
        include_lowest=True,
    ).astype('int64')

# remove features that are not used in this experiment
tr_data, tt_data = pp.delete_features(
    tr_data,
    tt_data,
    features=['id_strategic_ver', 'it_strategic_ver', 'product_modelname', 'ver_cus', 'ver_pro'],
)

In [None]:
# log transformation: apply log(1 + x) to the listed columns of both frames
cols = ['com_reg_ver_win_rate', 'historical_existing_cnt', 'lead_desc_length']
for frame in (tr_data, tt_data):
    for col in cols:
        frame[col] = frame[col].apply(np.log1p)

In [None]:
# regroup
# Synonym tables, one per categorical column; each is passed to pp.regroup
# together with the column name it belongs to.
# NOTE(review): presumably the first entry of every inner list is the canonical
# label that the remaining spellings are mapped to — confirm in preprocessing.regroup.
# Every string here is matched against raw data values, so spelling, case and
# even typos inside the literals are significant.

# customer_type synonyms.
regroup_customer_type = [['End-Customer', 'End Customer', 'End-user', 'Commercial end-user'],
                         ['Specifier / Influencer', 'Specifier/ Influencer'],
                         ['Distributor', 'Dealer/Distributor'],
                         ['Installer', 'Installer/Contractor'],
                         ['Homeowner', 'Home Owner'],
                         ['Others', 'other', 'Etc.', 'Other']]

# customer_job synonyms.
# NOTE(review): the 'quality_assurance' group lists the same string twice, so it
# merges nothing; other groups pair a spaced form with an underscored one
# (e.g. 'healthcare services' / 'healthcare_services') — check whether
# 'quality assurance' was intended.
# NOTE(review): 'operations manager' appears both in the 'operations' group and in
# the 'program and project management' group; which one wins depends on
# pp.regroup — confirm.
regroup_customer_job = [['engineering', 'engineering & technical', 'technical', 'engineer', 'chief engineer', 'engineering & technical executive'],
                        ['others', 'other'],
                        ['information technology', 'information_technology'],
                        ['operations', 'operations manager'],
                        ['business development', 'business_development'],
                        ['art and design', 'arts and design', 'kreation_und_design', 'designer', 'arts_and_design'],
                        ['program and project management', 'programm-_und_projektmanagement', 'program_and_project_management', 'projektmenedzsment\tprogram and project management', 'manager', 'project manager', 'general manager', 'it manager', 'operations manager', 'sales manager'],
                        ['media and communication', 'media_e_comunicazione'],
                        ['healthcare services', 'healthcare_services'],
                        ['community and social services', 'community_and_social_services'],
                        ['research', 'research & development'],
                        ['surgery professional', 'surgery professional\u200b'],
                        ['quality_assurance', 'quality_assurance'],
                        ['director', 'it director', 'it', 'director of it'],
                        ['ceo/founder', 'ceo'],
                        ['architect', 'arquitecto/consultor'],
                        ['finance', 'finanzen'],
                        ['integrator', 'integrador'],
                        ['coordinator', 'project coordinator'],
                        ['administrative', 'administrative assistant']]

# inquiry_type synonyms.
# NOTE(review): 'ETC.' is listed twice in the last group (harmless duplicate).
regroup_inquiry_type = [['Quotation or purchase consultation', 'Quotation or Purchase Consultation', 'quotation_or_purchase_consultation', 'Quotation or Purchase consultation', 'quotation_', 'Request for quotation or purchase', 'Purchase or Quotation', 'Purchase'],
                        ['Sales Inquiry', 'sales', 'Sales inquiry'],
                        ['Usage or technical consultation', 'Technical Consultation', 'Usage or Technical Consultation', 'usage or technical consultation', 'usage_or_technical_consultation', 'technical_consultation', 'Technical Support', 'Request for technical consulting', 'technical'],
                        ['Others', 'Other', 'ETC.', 'ETC.', 'Etc.', 'others', 'other', 'other_']]

# customer_position synonyms (mostly spaced vs. unspaced/hyphenated spellings).
regroup_customer_position = [['others', 'other'],
                             ['entry level', 'entrylevel'],
                             ['c-level executive', 'c-levelexecutive'],
                             ['vice president', 'vicepresident'],
                             ['end-user', 'commercial end-user'],
                             ['decision maker', 'decision-maker'],
                             ['decision influencer', 'decision-influencer']]

# expected_timeline synonyms; single-entry groups only name a label without
# listing any alternative spelling.
regroup_expected_timeline = [['less than 3 months', 'less_than_3_months'],
                             ['3 months ~ 6 months', '3_months_~_6_months'],
                             ['less than 6 months'],
                             ['6 months ~ 9 months', '6_months_~_9_months'],
                             ['more than a year'],
                             ['being followed up', 'being followed up.'],
                             ['no requirement', 'the client is not having any requirement hence closig in system. although the details of idb are mailed to client.']]

In [None]:
# Columns with a synonym table: (column, synonym groups, threshold).
# The fallback label for these is always 'others'.
# NOTE(review): some tables use the capitalised label 'Others' while the fallback
# passed here is lowercase 'others' — confirm pp.regroup treats them as intended.
grouped_columns = [
    ('customer_type', regroup_customer_type, 5),
    ('customer_job', regroup_customer_job, 5),
    ('inquiry_type', regroup_inquiry_type, 2),
    ('customer_position', regroup_customer_position, 6),
    ('expected_timeline', regroup_expected_timeline, 1),
]
for column, groups, threshold in grouped_columns:
    tr_data, tt_data = pp.regroup(tr_data, tt_data, column, groups, except_val='others', except_thr=threshold)

# Columns without a synonym table: only the fallback label and threshold apply.
ungrouped_columns = [
    ('product_category', 'etc.', 5),
    ('product_subcategory', 'others.', 1),
]
for column, fallback, threshold in ungrouped_columns:
    tr_data, tt_data = pp.regroup(tr_data, tt_data, column, [[]], fallback, except_thr=threshold)

In [None]:
# label encoding of the categorical columns listed below
features = [
    "business_subarea",
    "country",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]
tr_data, tt_data = pp.label_encoding(tr_data, tt_data, features=features)

# test feature matrix: the encoded test frame without the non-feature columns
x_tt = tt_data.drop(columns=['is_converted', 'id'])

In [None]:
# Per-model validation scores, appended to by the ensemble loop.
val_precision, val_recall, val_f1 = [], [], []

# Per-model test predictions: one row per ensemble member, one column per test
# row. The width is taken from x_tt rather than a hard-coded row count, so the
# array always matches the test set actually loaded.
test_results = np.zeros((hparams['num_ensemble'], len(x_tt)))

In [None]:
# Loop-invariant inputs, computed once instead of on every iteration.
tr_features = tr_data.drop(['is_converted'], axis=1)
tr_target = tr_data['is_converted']
x_tt_filled = x_tt.fillna(0)

for i in range(hparams['num_ensemble']):
    # each ensemble member gets its own seed for undersampling and for the model
    model_seed = hparams['seed'] + i

    # undersample the training data with this member's seed
    rus = RandomUnderSampler(random_state=model_seed)
    x_tr_res, y_tr_res = rus.fit_resample(tr_features, tr_target)

    # train / validation split (re-attach the target first; the split uses the base seed)
    x_tr_res['is_converted'] = y_tr_res
    x_tr, y_tr, x_val, y_val = pp.split_train_and_validation(x_tr_res, seed=hparams['seed'])

    # define a model
    model = GradientBoostingClassifier(**gbm_hparams05, random_state=model_seed)

    # training
    model.fit(x_tr.fillna(0), y_tr)

    ### print result of current model ###
    print('-' * 20)
    print(f'Model {i + 1} results')
    print('-' * 20)

    print(f'current seed: {model_seed}')

    # check validation score
    y_val_pred = model.predict(x_val.fillna(0))
    pr, re, f1 = get_clf_eval(y_val, y_val_pred, is_return=True)

    val_precision.append(pr)
    val_recall.append(re)
    val_f1.append(f1)

    # test
    y_test_pred = model.predict(x_tt_filled)

    # store this model's predictions in its row of the result array
    test_results[i, :] = y_test_pred

    # number of positive predictions
    print(y_test_pred.sum())
    print()

In [None]:
# Average each validation metric over the scores actually collected.
# Using the list length (instead of hparams['num_ensemble']) keeps the averages
# correct even if the score lists hold a different number of entries, e.g.
# after the training cell was re-run without resetting them.
for metric_name, scores in (
    ('precision', val_precision),
    ('recall', val_recall),
    ('f1', val_f1),
):
    print(f"average validation {metric_name} score of {len(scores)} models: {np.mean(scores):.6f}")

In [None]:
# hard voting: a test row is predicted 1 only when a strict majority of the
# ensemble members voted 1 for it
positive_votes = test_results.sum(axis=0, dtype=int)
majority = hparams['num_ensemble'] // 2 + 1
final_test_pred = np.where(positive_votes >= majority, 1, 0)

In [None]:
# number of test rows the ensemble predicts as 1
final_test_pred.sum()

In [None]:
# Identifier for this run; passed to both make_submission and
# record_experimental_results in the cells that follow.
model_name = 'hparams05_gbm_30_final'

In [None]:
# Hand the hard-voting predictions to the project's submission helper.
# NOTE(review): presumably writes a submission file under the '05_gbm'
# directory named after model_name — confirm in utils.make_submission.
make_submission(dir_name='05_gbm',
                y_pred=final_test_pred,
                model_name=model_name)

**Record**

In [None]:
# Log this experiment through the project's record helper.
# The test F1 score is a hand-entered string literal, not a value computed in
# this notebook, so it must be updated manually whenever the run changes.
# NOTE(review): where and how the record is stored is not visible here — confirm in utils.
record_experimental_results(model_name=model_name,
                            test_f1_score='0.755676',
                            description='feature engineering + 30개 gbm ensemble')

---