# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [None]:
!pip install seaborn
!pip install miceforest
!pip install fancyimpute
!pip install missingno
!pip install hyperopt
!pip install bayesian-optimization
!pip install catboost

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import matplotlib as mpl

%matplotlib inline
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from catboost import CatBoostClassifier
from hyperopt import hp, tpe, Trials, fmin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from hyperopt import hp
from hyperopt import STATUS_OK
from hyperopt import fmin, tpe, Trials
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from fancyimpute import IterativeImputer
from sklearn.experimental import enable_iterative_imputer
import miceforest as mf
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from lightgbm import plot_importance
from sklearn.ensemble import GradientBoostingClassifier
# Raise the pandas display limits to 100 rows and 200 columns.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

### 데이터 셋 읽어오기

In [2]:
# Load both competition files from the current working directory.
df_train = pd.read_csv("train.csv") # training data
df_test = pd.read_csv("submission.csv") # test data (the rows of the submission file)

In [3]:
df_test.head() # preview the first rows of the test data

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,product_category,product_subcategory,product_modelname,customer_country.1,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.0,/ / Brazil,ID,0.073248,47466,End Customer,Enterprise,53.0,,,,consulting,3,,,,,/ / Brazil,none,LGESP,,1,0,0.001183,0.04984,retail,Electronics & Telco,278,False
1,9738,0.25,400 N State Of Franklin Rd Cloud IT / Johnson...,IT,,5405,End Customer,SMB,,,,,,23,Quotation or Purchase Consultation,,,,400 N State Of Franklin Rd Cloud IT / Johnson...,none,LGEUS,,0,0,1.3e-05,,transportation,Others,437,True
2,8491,1.0,/ / U.A.E,ID,,13597,Specifier/ Influencer,SMB,,,,,information technology,138,Quotation or Purchase Consultation,hospital tv,UT662M Series,50UT662M (MEA),/ / U.A.E,manager,LGEGF,less than 3 months,0,0,6e-05,0.131148,hospital & health care,General Hospital,874,False
3,19895,0.5,/ Madison / United States,ID,0.118644,17204,,Enterprise,,,,,sales,3,Quotation or Purchase Consultation,,Digital Signage or Commercial TVs,,/ Madison / United States,none,LGEUS,more than a year,0,0,0.001183,0.04984,retail,,194,False
4,10465,1.0,/ Sao Paulo / Brazil,ID,0.074949,2329,End Customer,Enterprise,2.0,1.0,,1.0,engineering,107,Quotation or Purchase Consultation,led signage,LG LED Bloc,LSAA012,/ Sao Paulo / Brazil,others,LGESP,less than 3 months,1,1,0.003079,0.064566,corporate / office,Engineering,167,False


In [4]:
# Derived feature: per-business-unit win ratio divided by the vertical win
# rate, added under the same column name to both frames.
for frame in (df_train, df_test):
    frame['ver_test'] = frame['ver_win_ratio_per_bu'] / frame['ver_win_rate_x']

In [5]:
# Lower-case the product category and strip spaces so spelling variants of
# the same category compare equal.
for frame in (df_train, df_test):
    lowered = frame['product_category'].str.lower()
    frame['product_category'] = lowered.str.replace(" ", "")

In [6]:
# Product group -> normalised product_category values (lower-cased, spaces
# removed) that belong to it.
product_mapping = {
    'HVAC/ESS': [
        'control', 'ventilation', 'vrf', 'multi-split', 'arcondicionadoresidencial',
        'single-split', 'chiler', 'chiller', 'heating', 'rac',
        'tetooucasseteinverter', 'multiinverter',
    ],
    'Commercial Display': [
        'monitorsignage,monior/monitortv',
        # NOTE(review): this entry ends with a literal "..." and a tab, which
        # looks like a value copied from a truncated table display; it
        # presumably never matches a real category - confirm the full value.
        'monitorsignage,monior/monitortv,vacuumcleaner,...\t',
        'tvsignage', 'monitorsignage,commercialtv,monior/monitortv',
        'interactivedigitalboard', 'digitalsignage', 'signagecaresolutions',
        'smarttvsignage', 'uhdsignage', 'oledsignage', 'ledsignage',
        'videowallsignage', 'videowall', 'interactivesignage',
        'highbrightnesssignage', 'highbrightness', 'specialsignage',
        'standardsignage', 'hoteltv', 'hospitaltv', 'signagecaresolution',
        'lgone:quickseries', 'accessories', 'webos', 'one:quickseries',
        'pro:centric',
    ],
    'IT PRODUCTS': ['monitor', 'laptop', 'projector', 'pc', 'clouddevice', 'medicaldisplay'],
    'Commerical Laundry': ['titan(largecapacity)', 'giant(standardcapacity)'],
    'Compressor & Motor': ['reciprocatingcompressor', 'rotarycompressor', 'scrollcompressor', 'motor'],
    'ADVANCED MATERIALS': ['antimicrobial', 'porcelainenamel', 'specialtyglass'],
    'Robot': ['lgcloiuv-cbot', 'lgcloiservebot(shelftype)', 'lgcloiservebot(drawertype)', 'lgcloiguidebot'],
    'Others': ['etc.', 'others', 'other', 'softwaresolution'],
}

# Reverse index (category value -> product group), built once so each lookup
# is a single dict access instead of a scan over every list. setdefault keeps
# the first group that lists a value, matching a top-to-bottom scan of
# product_mapping. The index is a snapshot: later edits to product_mapping
# are not reflected.
_PRODUCT_GROUP_BY_CATEGORY = {}
for _group, _categories in product_mapping.items():
    for _category in _categories:
        _PRODUCT_GROUP_BY_CATEGORY.setdefault(_category, _group)


def map_product_category(value):
    """Return the product group for a normalised product category.

    Args:
        value: A product_category value (already lower-cased with spaces
            removed), or a missing value such as NaN/None.

    Returns:
        The key of ``product_mapping`` whose list contains ``value``; if no
        list contains it, ``value`` itself is returned unchanged.
    """
    try:
        return _PRODUCT_GROUP_BY_CATEGORY.get(value, value)
    except TypeError:
        # Unhashable input cannot be a known category; pass it through.
        return value

# Add the coarse product group as a new 'Product' column on both frames.
for frame in (df_train, df_test):
    frame['Product'] = frame['product_category'].apply(map_product_category)

## 2. 데이터 전처리

In [7]:
# Free-text categorical columns: lower-case them and remove every character
# that is not an ASCII letter or digit, identically on train and test.
text_columns = [
    'customer_type',
    'customer_job',
    'inquiry_type',
    'customer_position',
    'expected_timeline',
]
for name in text_columns:
    for frame in (df_train, df_test):
        lowered = frame[name].str.lower()
        frame[name] = lowered.replace(r'[^a-zA-Z0-9]', '', regex=True)

In [8]:
# Derived feature: com_reg_ver_win_rate divided by the per-business-unit
# win ratio, added to both frames.
for frame in (df_train, df_test):
    frame['ver_country'] = frame['com_reg_ver_win_rate'] / frame['ver_win_ratio_per_bu']

In [9]:
def _missing_strategic_mask(frame, unit, ver_column):
    """Boolean mask of rows in `unit` and a strategic area whose flag is missing."""
    in_unit = frame['business_unit'] == unit
    in_area = frame['business_area'].isin(['corporate / office', 'hotel & accommodation'])
    return in_unit & in_area & frame[ver_column].isna()


# cond1/cond2: business unit 'ID' rows (train/test) with id_strategic_ver missing.
cond1 = _missing_strategic_mask(df_train, 'ID', 'id_strategic_ver')
cond2 = _missing_strategic_mask(df_test, 'ID', 'id_strategic_ver')
# cond3/cond4: business unit 'IT' rows (train/test) with it_strategic_ver missing.
cond3 = _missing_strategic_mask(df_train, 'IT', 'it_strategic_ver')
cond4 = _missing_strategic_mask(df_test, 'IT', 'it_strategic_ver')

In [10]:
# Fill the strategic flags on both frames: rows selected by the masks get 1,
# every remaining missing value gets 0. The combined flag is 1 when either
# flag is 1; it is computed with a vectorised comparison instead of a
# row-by-row apply, which yields the same 0/1 integers without a Python-level
# loop over the rows.
for frame, id_mask, it_mask in ((df_train, cond1, cond3), (df_test, cond2, cond4)):
    frame.loc[id_mask, 'id_strategic_ver'] = 1
    frame['id_strategic_ver'] = frame['id_strategic_ver'].fillna(0)
    frame.loc[it_mask, 'it_strategic_ver'] = 1
    frame['it_strategic_ver'] = frame['it_strategic_ver'].fillna(0)
    either_flag = (frame['id_strategic_ver'] == 1) | (frame['it_strategic_ver'] == 1)
    frame['idit_strategic_ver'] = either_flag.astype('int64')

In [11]:
# Numeric columns of the training frame are the candidates for imputation.
numeric_columns = df_train.select_dtypes(include='number')

# Numeric columns that have no missing values need no imputation.
columns_without_missing_values = numeric_columns.columns[numeric_columns.isnull().sum() == 0]

# Exclude the strategic flags and the already-complete columns.
columns_to_exclude = ['id_strategic_ver', 'it_strategic_ver','idit_strategic_ver']
columns_to_exclude.extend(columns_without_missing_values)

# errors='ignore' tolerates excluded names that are not numeric columns.
columns_to_impute = numeric_columns.drop(columns=columns_to_exclude, errors='ignore')

imputer = IterativeImputer()

# Fit the imputer on the training columns only and keep the completed matrix.
df_train_imputed = pd.DataFrame(imputer.fit_transform(columns_to_impute), columns=columns_to_impute.columns)

# Only the per-column mean of the completed matrix is kept; it is the single
# constant used to fill each column's missing cells in the fillna step that follows.
replacement_values = df_train_imputed.mean()
print("각 열의 대체값:")
print(dict(zip(columns_to_impute.columns, replacement_values)))

각 열의 대체값:
{'com_reg_ver_win_rate': 0.09165432923188417, 'historical_existing_cnt': 18.403146499021506, 'ver_win_rate_x': 0.0011355375848576702, 'ver_win_ratio_per_bu': 0.055515326376435206, 'ver_test': 2634.475440829076, 'ver_country': 1.6052930268839807}




In [12]:
# Fill the missing cells of the imputed columns with the per-column constants
# derived from the training data, on both train and test.
impute_names = columns_to_impute.columns
for frame in (df_train, df_test):
    frame[impute_names] = frame[impute_names].fillna(replacement_values)

In [13]:
# Columns removed before modelling. A single shared list keeps train and test
# in sync (the test-side list previously repeated 'ver_win_ratio_per_bu').
columns_to_drop = [
    'ver_pro',
    'ver_win_ratio_per_bu',
    'id_strategic_ver',
    'it_strategic_ver',
    'idit_strategic_ver',
    'product_category',
    'com_reg_ver_win_rate',
    "business_area",
    "business_subarea",
    "customer_country",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
]
df_train = df_train.drop(columns_to_drop, axis=1)
df_test = df_test.drop(columns_to_drop, axis=1)

### 2-1. 레이블 인코딩

In [14]:
# 레이블 인코딩할 칼럼들
# Categorical columns to convert to integer codes.
label_columns = [
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    "Product",
]

# Stack train on top of test so a given category receives the same integer
# code in both frames.
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for column in label_columns:
    # factorize assigns codes in order of first appearance; missing values
    # receive the code -1.
    codes, _uniques = pd.factorize(df_all[column])
    df_all[column] = codes

    

In [15]:
# Split the stacked codes back into their train and test parts (train rows
# come first) and write them over the original text columns.
n_train_rows = len(df_train)
encoded_train = df_all.iloc[:n_train_rows]
encoded_test = df_all.iloc[n_train_rows:]
for col in label_columns:
    df_train[col] = encoded_train[col]
    df_test[col] = encoded_test[col]

### 2-2. 학습, 검증 데이터 분리

In [16]:
# Hold out 20% of the training rows for validation (shuffled, fixed seed).
features = df_train.drop("is_converted", axis=1)
target = df_train["is_converted"]
x_train, x_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, shuffle=True, random_state=400
)

## 3. 모델 학습

### 모델 정의 

In [17]:
# Search bounds (low, high) for each LightGBM hyperparameter explored by the
# Bayesian optimiser.
bayesian_params = dict(
    max_depth=(6, 16),
    num_leaves=(24, 64),
    min_child_samples=(10, 200),
    min_child_weight=(1, 50),
    subsample=(0.5, 1.0),
    colsample_bytree=(0.5, 1.0),
    max_bin=(10, 500),
    reg_lambda=(0.001, 10),
    reg_alpha=(0.01, 50),
)

In [18]:

def lgb_f1_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample,
                colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Objective for the Bayesian optimiser: validation F1 of one LightGBM fit.

    The optimiser proposes continuous values; integer-valued hyperparameters
    are rounded and the remaining ones are clamped before the model is built.
    The model is fitted on the module-level ``x_train``/``y_train`` and scored
    on ``x_val``/``y_val``.

    Returns:
        The F1 score of the fitted model's predictions on the validation split.
    """
    candidate = dict(n_estimators=500, learning_rate=0.02)
    # Tree-shape parameters must be integers.
    candidate['max_depth'] = int(round(max_depth))
    candidate['num_leaves'] = int(round(num_leaves))
    candidate['min_child_samples'] = int(round(min_child_samples))
    candidate['min_child_weight'] = int(round(min_child_weight))
    # Sampling fractions are kept inside [0, 1].
    candidate['subsample'] = max(min(subsample, 1), 0)
    candidate['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    # At least 10 bins; regularisation terms are never negative.
    candidate['max_bin'] = max(int(round(max_bin)), 10)
    candidate['reg_lambda'] = max(reg_lambda, 0)
    candidate['reg_alpha'] = max(reg_alpha, 0)

    training_callbacks = [
        lgb.early_stopping(stopping_rounds = 100),
        lgb.log_evaluation(period = 100),
    ]
    lgb_model = LGBMClassifier(**candidate)
    # NOTE(review): the captured training log reports only binary_logloss, so
    # eval_metric='f1' does not appear to be evaluated by LightGBM and early
    # stopping presumably follows log-loss - confirm.
    lgb_model.fit(
        x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        eval_metric='f1',
        callbacks=training_callbacks,
    )

    return f1_score(y_val, lgb_model.predict(x_val))

In [19]:
# Maximise the validation F1 over the search bounds: 5 random starting points
# followed by 25 guided iterations.
lgbBO = BayesianOptimization(f=lgb_f1_eval, pbounds=bayesian_params, random_state=0)
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.126742	valid_1's binary_logloss: 0.13021
[200]	training's binary_logloss: 0.10412	valid_1's binary_logloss: 0.109942
[300]	training's binary_logloss: 0.0955911	valid_1's binary_logloss: 0.102627
[400]	training's binary_logloss: 0.0911809	valid_1's binary_logloss: 0.0992772
[500]	training's binary_logloss: 0.0883285	valid_1's binary_logloss: 0.0971006
Did not meet early stopping. Best iteration is:
[500]	training's binary_logloss: 0.0883285	valid_1's binary_logloss: 0.0971006
| [0m1        [0m | [0m0.7216   [0m | [0m0.7744   [0m | [0m360.4    [0m | [0m12.03    [0m | [0m113.5    [0m | [0m21.76    [0m | [0m49.84    

In [20]:
# Objective value of every optimiser trial, in trial order.
target_list = [result['target'] for result in lgbBO.res]
print(target_list)
# Position (index) of the trial with the largest target value
print('maximum target index:', np.argmax(np.array(target_list)))

[0.7215836526181353, 0.7415307402760352, 0.7576875768757687, 0.7206632653061225, 0.7143786597267403, 0.7467330429371499, 0.7429643527204502, 0.7077326343381389, 0.73989898989899, 0.748768472906404, 0.7346153846153847, 0.7841726618705037, 0.7452711223203026, 0.7628992628992629, 0.7276166456494325, 0.7271581600504096, 0.7093403004572175, 0.7807807807807808, 0.7062706270627063, 0.7576706324358171, 0.6984957488554612, 0.7196382428940568, 0.6995336442371751, 0.746583850931677, 0.7576875768757687, 0.6563275434243176, 0.7885304659498208, 0.7856709168184577, 0.7334183673469388, 0.7087126137841353]
maximum target index: 26


In [21]:
# Look up the trial record (target and params) with the highest target.
best_index = np.argmax(np.array(target_list))
max_dict = lgbBO.res[best_index]
print(max_dict)

{'target': 0.7885304659498208, 'params': {'colsample_bytree': 1.0, 'max_bin': 500.0, 'max_depth': 16.0, 'min_child_samples': 10.0, 'min_child_weight': 1.0, 'num_leaves': 24.0, 'reg_alpha': 0.01, 'reg_lambda': 0.001, 'subsample': 1.0}}


In [22]:
def train_apps_all(df_train):
    """Fit the final LightGBM classifier on an 80/20 split of ``df_train``.

    Args:
        df_train: Preprocessed training frame containing the features and the
            ``is_converted`` target column.

    Returns:
        The fitted ``LGBMClassifier``.
    """
    features = df_train.drop("is_converted", axis=1)
    labels = df_train["is_converted"]
    train_x, valid_x, train_y, valid_y = train_test_split(
        features, labels, test_size=0.2, shuffle=True, random_state=400
    )
    print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)

    # NOTE(review): several values here (num_leaves, subsample, max_bin,
    # min_child_samples) differ from the best trial printed by the optimiser
    # in this notebook - presumably taken from another run; confirm.
    hyperparameters = {
        'nthread': 4,
        'n_estimators': 1000,
        'learning_rate': 0.02,
        'max_depth': 16,
        'num_leaves': 64,
        'colsample_bytree': 1,
        'subsample': 0.716,
        'max_bin': 474,
        'reg_alpha': 0.01,
        'reg_lambda': 0.001,
        'min_child_weight': 1,
        'min_child_samples': 143,
        'silent': -1,
        'verbose': -1,
    }
    clf = LGBMClassifier(**hyperparameters)

    training_callbacks = [
        lgb.early_stopping(stopping_rounds = 100),
        lgb.log_evaluation(period = 100),
    ]
    clf.fit(
        train_x,
        train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric='f1',
        callbacks=training_callbacks,
    )

    return clf

In [23]:
# Train the final classifier on the preprocessed training frame.
clf = train_apps_all(df_train)

train shape: (47439, 18) valid shape: (11860, 18)
Training until validation scores don't improve for 100 rounds




[100]	training's binary_logloss: 0.0966901	valid_1's binary_logloss: 0.106591
[200]	training's binary_logloss: 0.069321	valid_1's binary_logloss: 0.0852778
[300]	training's binary_logloss: 0.0580626	valid_1's binary_logloss: 0.0783134
[400]	training's binary_logloss: 0.0501786	valid_1's binary_logloss: 0.0746303
[500]	training's binary_logloss: 0.044378	valid_1's binary_logloss: 0.0722741
[600]	training's binary_logloss: 0.0392159	valid_1's binary_logloss: 0.0703214
[700]	training's binary_logloss: 0.0348559	valid_1's binary_logloss: 0.0691024
[800]	training's binary_logloss: 0.0310375	valid_1's binary_logloss: 0.0677509
[900]	training's binary_logloss: 0.027855	valid_1's binary_logloss: 0.0667479
[1000]	training's binary_logloss: 0.0251256	valid_1's binary_logloss: 0.0661085
Did not meet early stopping. Best iteration is:
[1000]	training's binary_logloss: 0.0251256	valid_1's binary_logloss: 0.0661085


### 모델 성능 보기

In [24]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: True labels.
        y_pred: Predicted labels for the same rows.
            NOTE(review): the ``None`` default is presumably not usable with
            the metric functions - confirm callers always pass predictions.

    Returns:
        None. The metrics are written to standard output.
    """
    # Label order for the confusion matrix: the True class first.
    label_order = [True, False]

    confusion = confusion_matrix(y_test, y_pred, labels=label_order)
    # (output template, value) pairs in the order they are printed.
    scores = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=label_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=label_order)),
    )

    print("오차행렬:\n", confusion)
    for template, score in scores:
        print(template.format(score))

In [25]:
# Predict on the validation split and print the evaluation metrics.
pred = clf.predict(x_val)
get_clf_eval(y_val, pred)

오차행렬:
 [[  731   216]
 [   82 10831]]

정확도: 0.9749
정밀도: 0.8991
재현율: 0.7719
F1: 0.8307



## 4. 제출하기

### 테스트 데이터 예측

In [26]:
# 예측에 필요한 데이터 분리
# Keep only the feature columns needed for prediction (drop the target
# placeholder and the row id).
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [27]:
test_pred = clf.predict(x_test)
sum(test_pred) # number of rows predicted as True

677

### 제출 파일 작성

In [28]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
# Re-read the submission file (df_test now holds the preprocessed data)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# Save the submission file; this overwrites the file that was just read.
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**