### 필요한 라이브러리 설치

In [None]:
!pip install imbalanced-learn==0.12.0

### 필요한 라이브러리 가져오기

In [None]:
# built-in library
import random
import os
import re
from typing import List
import warnings
warnings.filterwarnings('ignore')

# basic library
import numpy as np
import pandas as pd

# torch
import torch

# model
from sklearn.ensemble import GradientBoostingClassifier

# sampler
from imblearn.under_sampling import RandomUnderSampler

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# metrics
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.metrics import ConfusionMatrixDisplay

# visualization
import matplotlib.pyplot as plt

### 전체적으로 사용할 hyperparameters 선언

In [None]:
# Notebook-wide settings shared by the seeding step and the ensemble loop.
hparams = dict(
    seed=33,          # base random seed
    num_ensemble=30,  # number of models trained in the ensemble
)

### 성능 재현을 위한 시드 세팅

In [None]:
def set_seed(seed: int = 2024) -> None:
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports `PYTHONHASHSEED`, and switches cuDNN to its deterministic mode.

    Args:
        seed (int, optional): seed value to apply everywhere. Defaults to 2024.
    """
    random.seed(seed)
    # The variable name was previously misspelled ('PYTHONASHSEED'), so the
    # hash seed was never exported. NOTE: Python reads PYTHONHASHSEED at
    # interpreter start-up, so this presumably only affects processes spawned
    # after this call, not the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Prefer reproducibility over speed for cuDNN-backed operations.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# Seed all random number generators with the notebook-wide seed.
set_seed(hparams['seed'])

### 데이터 전처리에 필요한 함수들 선언

In [None]:
def label_encoding(tr_data: pd.DataFrame, tt_data: pd.DataFrame,
                   features: list = ["customer_country", "business_subarea",
                                     "business_area", "business_unit", "customer_type",
                                     "enterprise", "customer_job",
                                     "inquiry_type", "product_category",
                                     "product_subcategory", "product_modelname",
                                     "customer_country.1", "customer_position",
                                     "response_corporate","expected_timeline"]) -> tuple:
    """Encode categorical columns of the train / test frames as integer labels.

    A column is encoded only when its dtype in the training frame is
    ``object``. The encoder is fitted on the training and test values
    together, so both frames share one label mapping. The inputs are not
    modified; encoded copies are returned.

    Args:
        tr_data (pd.DataFrame): training data.
        tt_data (pd.DataFrame): test data.
        features (list, optional): names of the categorical columns to encode.

    Returns:
        tuple: the label-encoded (train, test) DataFrames.
    """
    encoded_tr, encoded_tt = tr_data.copy(), tt_data.copy()

    for column in features:
        # Columns that are not object-typed in the training frame are left as they are.
        if encoded_tr[column].dtype.name != 'object':
            continue

        tr_values = list(encoded_tr[column].values)
        tt_values = list(encoded_tt[column].values)

        # Fit on train + test so that every value seen in either frame has a label.
        encoder = LabelEncoder().fit(tr_values + tt_values)

        encoded_tr[column] = encoder.transform(tr_values)
        encoded_tt[column] = encoder.transform(tt_values)

    return encoded_tr, encoded_tt

In [None]:
def split_train_and_validation(tr_data: pd.DataFrame, val_size: float = 0.2, seed: int = 2024) -> tuple:
    """Split a labelled frame into shuffled train / validation parts.

    The `is_converted` column is used as the target; every other column is
    treated as a feature.

    Args:
        tr_data (pd.DataFrame): data to split; must contain `is_converted`.
        val_size (float, optional): fraction of rows used for validation. Defaults to 0.2.
        seed (int, optional): random_state passed to `train_test_split`. Defaults to 2024.

    Returns:
        tuple: (x_train, y_train, x_validation, y_validation)
    """
    # `columns=` already selects the axis, so no separate `axis` argument is needed.
    features = tr_data.drop(columns=['is_converted'])
    target = tr_data['is_converted']

    x_tr, x_val, y_tr, y_val = train_test_split(features,
                                                target,
                                                test_size=val_size,
                                                random_state=seed,
                                                shuffle=True)

    return (x_tr, y_tr, x_val, y_val)

In [None]:
def delete_features(tr_data: pd.DataFrame, tt_data: pd.DataFrame,
                    features: list = ['com_reg_ver_win_rate', 'customer_type', 'customer_country.1',
                                      'historical_existing_cnt', 'id_strategic_ver',
                                      'it_strategic_ver', 'idit_strategic_ver','product_subcategory',
                                      'product_modelname', 'expected_timeline', 'ver_win_rate_x',
                                      'ver_win_ratio_per_bu', 'business_area','business_subarea']) -> tuple:
    """Return copies of both frames with the given feature columns removed.

    Args:
        tr_data (pd.DataFrame): training data.
        tt_data (pd.DataFrame): test data.
        features (list, optional): columns to drop from both frames.
            According to the original author's note, the default list holds
            the features with a missing-value ratio of 50 % or more plus
            duplicated features.

    Returns:
        tuple: (tr_data, tt_data) without the listed columns.
    """
    trimmed_tr, trimmed_tt = (frame.drop(columns=features) for frame in (tr_data, tt_data))

    return (trimmed_tr, trimmed_tt)

In [None]:
# A candidate country containing '@' or an ASCII digit is treated as garbage
# (e.g. an e-mail address or a postal code) and replaced by NaN.
_INVALID_COUNTRY_RE = re.compile(r"[@0-9]")


def _parse_country(raw):
    """Return the lower-cased, stripped text after the last '/' in `raw`, or NaN if unusable."""
    if not isinstance(raw, str):  # missing value (NaN) or any other non-string entry
        return np.nan

    candidate = raw.lower().split("/")[-1]
    if _INVALID_COUNTRY_RE.search(candidate):
        return np.nan

    return candidate.strip()


def extract_country_name(tr_data: pd.DataFrame, tt_data: pd.DataFrame) -> tuple:
    """Derive a `country` feature from the `customer_country` column.

    For every row the text after the last '/' of `customer_country` is
    lower-cased and stripped. Rows whose value is not a string, or whose
    candidate contains '@' or a digit, get NaN. The columns of each result
    are sorted by name, and `customer_country` / `customer_country.1` are
    removed.

    The input frames are not modified, and the function no longer depends on
    a row with index label 0 being present.

    Args:
        tr_data (pd.DataFrame): training data.
        tt_data (pd.DataFrame): test data.

    Returns:
        tuple:
            (tr_data, tt_data) with the `country` feature added and the
            `customer_country`, `customer_country.1` features removed.
    """
    results = []
    for df in (tr_data, tt_data):
        countries = [_parse_country(value) for value in df['customer_country']]

        # assign() returns a new frame, so the caller's DataFrame stays untouched.
        enriched = df.assign(country=countries).sort_index(axis=1)

        # Drop the raw country columns now that `country` replaces them.
        results.append(enriched.drop(columns=['customer_country', 'customer_country.1']))

    return (results[0], results[1])

In [None]:
def regroup(tr_data: pd.DataFrame, tt_data: pd.DataFrame, 
            feature_name: str, regroup_info: List[List],
            except_val: str='others', except_thr: int = 1) -> tuple:
    """Merge fragmented values of `feature_name` according to `regroup_info`.

    The train and test frames are concatenated so that value frequencies are
    counted over both. Each value is then handled as follows:

    * missing values (and any float) are kept unchanged;
    * a value listed in one of the groups is replaced by the lower-cased
      first element of the first group that contains it;
    * any other value occurring `except_thr` times or fewer is replaced by
      the lower-cased `except_val`;
    * everything else is kept unchanged.

    Matching against the groups is exact and case-sensitive.

    Args:
        tr_data (pd.DataFrame): training data.
        tt_data (pd.DataFrame): test data.
        feature_name (str): name of the feature to regroup.
        regroup_info (List[List]): regroup definition; every inner list is one new group.
        except_val (str): replacement for values that occur `except_thr` times or fewer.
        except_thr (int): frequency threshold for the replacement above.

    Returns:
        tuple: the regrouped (tr_data, tt_data). An `id` column, if present
        after concatenation, is removed from the training part only. The test
        part keeps all columns of the concatenated frame.
    """
    n_train = len(tr_data)

    # Concatenate so that frequencies are computed over train + test.
    data = pd.concat([tr_data, tt_data])

    # Occurrence count per value (missing values are not counted).
    freq = data[feature_name].value_counts().to_dict()

    # value -> canonical group name. setdefault keeps the first group that
    # lists a value, matching a sequential first-match scan over the groups.
    canonical = {}
    for group in regroup_info:
        for member in group:
            canonical.setdefault(member, group[0].lower())

    fallback = except_val.lower()

    regroup_results = []
    for val in data[feature_name].values:
        if isinstance(val, float) or pd.isna(val):  # missing value
            regroup_results.append(val)
        elif val in canonical:
            regroup_results.append(canonical[val])
        elif freq.get(val, 0) <= except_thr:
            regroup_results.append(fallback)
        else:
            regroup_results.append(val)

    # Split the concatenated frame back into its two parts.
    data[feature_name] = regroup_results
    # errors='ignore': do not fail when neither frame carries an `id` column.
    tr_out = data.iloc[:n_train].drop(columns=['id'], errors='ignore')
    tt_out = data.iloc[n_train:]

    return tr_out, tt_out

### 그 외 편의성을 위해 사용하는 함수

In [None]:
def load_data(tr_path: str = "train.csv",
              tt_path: str = "submission.csv") -> tuple:
    """Read the training and test CSV files.

    Duplicated rows are removed from the training data only; the test data is
    returned exactly as read.

    Args:
        tr_path (str, optional): path of the training CSV.
        tt_path (str, optional): path of the test CSV.

    Returns:
        tuple: (pd.DataFrame, pd.DataFrame)
    """
    train_frame = pd.read_csv(tr_path).drop_duplicates()
    test_frame = pd.read_csv(tt_path)

    return (train_frame, test_frame)

In [None]:
def get_clf_eval(y_test: np.ndarray, y_pred: np.ndarray = None, is_return: bool = False):
    """Plot the confusion matrix and print the scores of a classifier.

    Accuracy, precision, recall and F1 are computed with scikit-learn, the
    confusion matrix (label order ``[True, False]``) is drawn with
    matplotlib, and the four scores are printed with four decimals.

    Args:
        y_test (np.ndarray): ground-truth labels.
        y_pred (np.ndarray, optional): labels predicted by the model. Defaults to None.
        is_return (bool, optional): when True, the scores are also returned. Defaults to False.

    Returns:
        (precision, recall, f1) when `is_return` is True, otherwise None.
    """
    label_order = [True, False]

    cm = confusion_matrix(y_test, y_pred, labels=label_order)
    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, labels=[True, False]),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred, labels=[True, False]),
    }

    # Draw the confusion matrix before printing the numeric report.
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_order).plot()
    plt.show()

    report = (
        ("\n정확도: {:.4f}", scores['accuracy']),
        ("정밀도: {:.4f}", scores['precision']),
        ("재현율: {:.4f}", scores['recall']),
        ("F1: {:.4f}", scores['f1']),
    )
    for template, value in report:
        print(template.format(value))

    if is_return:
        return scores['precision'], scores['recall'], scores['f1']

### 데이터 전처리
1. `extract_country_name()`: `customer_country` feature로부터 국가명만을 추출
1. `binning`: `customer_idx` feature를 bin 단위로 묶음
1. `delete_features()`: 불필요하다고 판단한 features 삭제
1. `log transformation`: skewed distribution을 갖는 features에 대해 변환 수행
1. `regroup()`: 값이 파편화되어있는 features에 대해 regroup 수행
1. `label_encoding()`: 범주형 변수에 대해 encoding 수행

In [None]:
# Load the train / test data; load_data() also drops duplicated rows from the training data.
tr_data, tt_data = load_data()

In [None]:
# Extract the country name into a new `country` feature
tr_data, tt_data = extract_country_name(tr_data, tt_data)

In [None]:
# Binning: bucket `customer_idx` into fixed-width ranges of 500 and use the
# bucket number (0, 1, 2, ...) as the feature value.
start, stop, step = 0, 47501, 500
bins = np.arange(start, stop, step)
labels = list(range(len(bins) - 1))

# pd.cut uses right-closed intervals, so without include_lowest=True a
# customer_idx of exactly 0 falls outside the first bin, becomes NaN and makes
# the int64 cast fail. Values above the last edge (47500) still produce NaN
# and therefore still raise on the cast.
tr_data['customer_idx'] = pd.cut(tr_data['customer_idx'], bins=bins, labels=labels,
                                 include_lowest=True).astype('int64')
tt_data['customer_idx'] = pd.cut(tt_data['customer_idx'], bins=bins, labels=labels,
                                 include_lowest=True).astype('int64')

In [None]:
# Drop the features judged unnecessary (this overrides the default list of delete_features)
tr_data, tt_data = delete_features(tr_data, tt_data, features=['id_strategic_ver', 'it_strategic_ver', 'product_modelname', 'ver_cus', 'ver_pro'])

In [None]:
# log transformation: replace each listed feature by log(1 + x) in both frames
cols = ['com_reg_ver_win_rate', 'historical_existing_cnt', 'lead_desc_length']
for frame in (tr_data, tt_data):
    for col in cols:
        frame[col] = frame[col].apply(np.log1p)

In [None]:
# regroup
# Lookup tables consumed by regroup(). Every inner list is one group: regroup()
# replaces any value found in the list by the lower-cased FIRST element of that
# list. Matching is exact and case-sensitive, and when a value appears in more
# than one group the first group in the table wins.
# A single-element group merges nothing, but it keeps that value from being
# replaced by `except_val` when it is rare.
regroup_customer_type = [['End-Customer', 'End Customer', 'End-user', 'Commercial end-user'],
                         ['Specifier / Influencer', 'Specifier/ Influencer'],
                         ['Distributor', 'Dealer/Distributor'],
                         ['Installer', 'Installer/Contractor'],
                         ['Homeowner', 'Home Owner'],
                         ['Others', 'other', 'Etc.', 'Other']]

# NOTE(review): 'operations manager' is listed under both 'operations' and
# 'program and project management'; because the first match wins it is mapped
# to 'operations'.
# NOTE(review): ['quality_assurance', 'quality_assurance'] lists the same value
# twice; presumably one entry was meant to be a different spelling. Confirm.
regroup_customer_job = [['engineering', 'engineering & technical', 'technical', 'engineer', 'chief engineer', 'engineering & technical executive'],
                        ['others', 'other'],
                        ['information technology', 'information_technology'],
                        ['operations', 'operations manager'],
                        ['business development', 'business_development'],
                        ['art and design', 'arts and design', 'kreation_und_design', 'designer', 'arts_and_design'],
                        ['program and project management', 'programm-_und_projektmanagement', 'program_and_project_management', 'projektmenedzsment\tprogram and project management', 'manager', 'project manager', 'general manager', 'it manager', 'operations manager', 'sales manager'],
                        ['media and communication', 'media_e_comunicazione'],
                        ['healthcare services', 'healthcare_services'],
                        ['community and social services', 'community_and_social_services'],
                        ['research', 'research & development'],
                        ['surgery professional', 'surgery professional\u200b'],
                        ['quality_assurance', 'quality_assurance'],
                        ['director', 'it director', 'it', 'director of it'],
                        ['ceo/founder', 'ceo'],
                        ['architect', 'arquitecto/consultor'],
                        ['finance', 'finanzen'],
                        ['integrator', 'integrador'],
                        ['coordinator', 'project coordinator'],
                        ['administrative', 'administrative assistant']]

# NOTE(review): 'ETC.' appears twice in the last group; the duplicate is harmless.
regroup_inquiry_type = [['Quotation or purchase consultation', 'Quotation or Purchase Consultation', 'quotation_or_purchase_consultation', 'Quotation or Purchase consultation', 'quotation_', 'Request for quotation or purchase', 'Purchase or Quotation', 'Purchase'],
                        ['Sales Inquiry', 'sales', 'Sales inquiry'],
                        ['Usage or technical consultation', 'Technical Consultation', 'Usage or Technical Consultation', 'usage or technical consultation', 'usage_or_technical_consultation', 'technical_consultation', 'Technical Support', 'Request for technical consulting', 'technical'],
                        ['Others', 'Other', 'ETC.', 'ETC.', 'Etc.', 'others', 'other', 'other_']]

regroup_customer_position = [['others', 'other'],
                             ['entry level', 'entrylevel'],
                             ['c-level executive', 'c-levelexecutive'],
                             ['vice president', 'vicepresident'],
                             ['end-user', 'commercial end-user'],
                             ['decision maker', 'decision-maker'],
                             ['decision influencer', 'decision-influencer']]

regroup_expected_timeline = [['less than 3 months', 'less_than_3_months'],
                             ['3 months ~ 6 months', '3_months_~_6_months'],
                             ['less than 6 months'],
                             ['6 months ~ 9 months', '6_months_~_9_months'],
                             ['more than a year'],
                             ['being followed up', 'being followed up.'],
                             ['no requirement', 'the client is not having any requirement hence closig in system. although the details of idb are mailed to client.']]

In [None]:
# Apply the regroup tables. Values that match no group and occur `except_thr`
# times or fewer (counted over train + test) are replaced by `except_val`.
tr_data, tt_data = regroup(tr_data, tt_data, 'customer_type', regroup_customer_type, except_val='others', except_thr=5)
tr_data, tt_data = regroup(tr_data, tt_data, 'customer_job', regroup_customer_job, except_val='others', except_thr=5)
tr_data, tt_data = regroup(tr_data, tt_data, 'inquiry_type', regroup_inquiry_type, except_val='others', except_thr=2)
tr_data, tt_data = regroup(tr_data, tt_data, 'customer_position', regroup_customer_position, except_val='others', except_thr=6)
tr_data, tt_data = regroup(tr_data, tt_data, 'expected_timeline', regroup_expected_timeline, except_val='others', except_thr=1)
# [[]] defines no merge groups: for these two features only the rare values are collapsed.
# NOTE(review): 'others.' (with a trailing period) differs from the 'others'
# used above; confirm whether that is intentional.
tr_data, tt_data = regroup(tr_data, tt_data, 'product_category', [[]], 'etc.', 5)
tr_data, tt_data = regroup(tr_data, tt_data, 'product_subcategory', [[]], 'others.', 1)

In [None]:
# Label encoding of the categorical features listed below
features = ["business_subarea", "country", "business_area", "business_unit", "customer_type",
            "enterprise", "customer_job", "inquiry_type", "product_category", 
            "product_subcategory", "customer_position", "response_corporate","expected_timeline"]

tr_data, tt_data = label_encoding(tr_data, tt_data, features=features)
# Test feature matrix used for prediction: the target column and `id` are removed.
x_tt = tt_data.drop(['is_converted', 'id'], axis=1)

### 학습에 사용할 hyperparameters 선언

In [None]:
# Keyword arguments forwarded to GradientBoostingClassifier for every ensemble
# member. `random_state` is deliberately absent: the ensemble loop passes a
# different value to each model.
gbm_hparams = dict(
    # --- boosting ---
    loss='log_loss',             # loss function to optimise
    learning_rate=0.1,           # shrinkage applied to each tree's contribution
    n_estimators=400,            # number of boosting stages
    subsample=1.0,               # fraction of samples used to fit each base learner
    # --- individual trees ---
    criterion='friedman_mse',    # split quality measure
    min_samples_split=2,         # minimum samples required to split an internal node
    min_samples_leaf=1,          # minimum samples required at a leaf
    max_depth=6,                 # maximum depth of each regression tree
    min_impurity_decrease=0.0,   # minimum impurity decrease required for a split
    init=None,                   # estimator used for the initial predictions
    max_features=None,           # number of features considered per split
    verbose=0,                   # verbosity of the training output
    max_leaf_nodes=None,         # cap on leaf nodes (best-first growth) when set
    warm_start=False,
    # --- early stopping ---
    validation_fraction=0.1,     # share of training data held out for early stopping
    n_iter_no_change=None,       # decides whether early stopping is used
    tol=1e-4,                    # tolerance for early stopping
    # --- pruning ---
    ccp_alpha=0.0,               # complexity parameter for minimal cost-complexity pruning
)

### 학습: Gradient Boosting Models Ensemble

In [None]:
# Per-model validation scores, collected to report the ensemble averages later
val_precision, val_recall, val_f1 = [], [], []

In [None]:
# Array accumulating the test predictions: one row per ensemble member, one column per test row
test_results = np.zeros((hparams['num_ensemble'], len(tt_data)))

In [None]:
# Ensemble loop: every iteration undersamples the training data with its own
# seed, trains one gradient boosting model, reports its validation scores and
# stores its test predictions as one row of `test_results`.

# The test features are identical for every model, so fill the NaNs only once.
x_tt_filled = x_tt.fillna(0)

for i in range(hparams['num_ensemble']):
    # Seed of the current ensemble member (sampler and model share it).
    model_seed = hparams['seed'] + i

    # Undersample with a different seed per model.
    rus = RandomUnderSampler(random_state=model_seed)
    x_tr_res, y_tr_res = rus.fit_resample(tr_data.drop(['is_converted'], axis=1), tr_data['is_converted'])

    # train / validation split (the split seed is the same for every model)
    x_tr_res['is_converted'] = y_tr_res # re-attach the target for the split helper
    x_tr, y_tr, x_val, y_val = split_train_and_validation(x_tr_res, seed=hparams['seed'])

    # define a model
    model = GradientBoostingClassifier(**gbm_hparams, random_state=model_seed)

    # training
    model.fit(x_tr.fillna(0), y_tr)

    ### print result of current model ###
    print('-' * 20)
    print(f'Model {i + 1} results')
    print('-' * 20)

    print(f'current seed: {hparams["seed"] + i}')

    # check validation score
    y_val_pred = model.predict(x_val.fillna(0))
    # The recall is bound to `rc`, not `re`: rebinding `re` would shadow the
    # imported regex module and break extract_country_name() on a later re-run.
    pr, rc, f1 = get_clf_eval(y_val, y_val_pred, is_return=True)

    val_precision.append(pr)
    val_recall.append(rc)
    val_f1.append(f1)

    # test
    y_test_pred = model.predict(x_tt_filled)

    # accumulate the predictions of this model
    test_results[i, :] = y_test_pred

    # number of positive predictions
    print(sum(y_test_pred))
    print()

In [None]:
# Average validation precision / recall / f1 over all models
# (each sum is divided by hparams['num_ensemble'], so the loop above must have run for every model)
print(f"average validation precision score of {hparams['num_ensemble']} models: {sum(val_precision) / hparams['num_ensemble']:.6f}")
print(f"average validation recall score of {hparams['num_ensemble']} models: {sum(val_recall) / hparams['num_ensemble']:.6f}")
print(f"average validation f1 score of {hparams['num_ensemble']} models: {sum(val_f1) / hparams['num_ensemble']:.6f}")

### submission file 생성

In [None]:
# hard voting: a test row is predicted positive only when a strict majority of
# the models voted positive (an exact tie counts as negative)
tmp = test_results.sum(axis=0, dtype=int)
majority = hparams['num_ensemble'] // 2 + 1
final_test_pred = np.array([int(votes >= majority) for votes in tmp])
sum(final_test_pred)

In [None]:
df_sub = pd.read_csv("submission.csv")
df_sub['is_converted'] = final_test_pred
df_sub.to_csv("submission.csv", index=False)