# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
!pip install seaborn
!pip install miceforest
!pip install fancyimpute
!pip install missingno
!pip install hyperopt
!pip install bayesian-optimization

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import matplotlib as mpl

%matplotlib inline
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from hyperopt import hp
from hyperopt import STATUS_OK
from hyperopt import fmin, tpe, Trials
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from fancyimpute import IterativeImputer
from sklearn.experimental import enable_iterative_imputer
import miceforest as mf
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import lightgbm as lgb

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

### 데이터 셋 읽어오기

In [3]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [4]:
df_train.head() # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,product_category,product_subcategory,product_modelname,customer_country.1,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,,purchasing,62,Quotation or purchase consultation,multi-split,,,/Quezon City/Philippines,entry level,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,,media and communication,96,Quotation or purchase consultation,multi-split,,,/PH-00/Philippines,ceo/founder,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,,engineering,56,Product Information,single-split,,,/Kolkata /India,partner,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,,entrepreneurship,44,Quotation or purchase consultation,vrf,,,/Bhubaneswar/India,ceo/founder,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,,consulting,97,Quotation or purchase consultation,multi-split,,,/Hyderabad/India,partner,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


## 2. 데이터 전처리

In [7]:
df_train['ver_country'] = df_train['com_reg_ver_win_rate'] / df_train['ver_win_ratio_per_bu']
df_test['ver_country'] = df_test['com_reg_ver_win_rate'] / df_test['ver_win_ratio_per_bu']

In [5]:
df_train['product_category'] = df_train['product_category'].str.lower().str.replace(" ", "")
df_test['product_category'] = df_test['product_category'].str.lower().str.replace(" ", "")

In [6]:
product_mapping = { 'HVAC/ESS': ['control', 'ventilation', 'vrf', 'multi-split', 'arcondicionadoresidencial','single-split', 'chiler','chiller', 'heating','rac','tetooucasseteinverter','multiinverter'],
                     'Commercial Display': ['interactivedigitalboard','oledsignage', 'ledsignage', 'videowallsignage', 'videowall','interactivesignage', 'highbrightnesssignage','highbrightness', 'specialsignage', 'standardsignage', 'hoteltv', 'hospitaltv', 'softwaresolution', 'signagecaresolution', 'lgone:quickseries','accessories', 'webos', 'one:quickseries', 'pro:centric'],
                      'IT PRODUCTS': ['monitor', 'laptop', 'projector','pc', 'clouddevice', 'medicaldisplay'], 
                      'Commerical Laundry': ['titan(largecapacity)', 'giant(standardcapacity)'],
                      'Compressor & Motor': ['reciprocatingcompressor', 'rotarycompressor', 'scrollcompressor', 'motor'],
                      'ADVANCED MATERIALS': ['antimicrobial', 'porcelainenamel', 'specialtyglass'] ,
                      'Robot': ['lgcloiuv-cbot', 'lgcloiservebot(shelftype)', 'lgcloiservebot(drawertype)', 'lgcloiguidebot'],
                      'Others':['etc.','others','other'] 
                    }

def map_product_category(value):
    for product, values in product_mapping.items():
        if value in values:
            return product
    return value  # 매핑되지 않은 경우 원래 값을 반환

df_train['Product'] = df_train['product_category'].apply(map_product_category)
df_test['Product'] = df_test['product_category'].apply(map_product_category)

In [8]:
conversion_rates = df_train.groupby('customer_idx')['is_converted'].mean()* 100
df_train['customer_idx'] = df_train['customer_idx'].map(conversion_rates)
conversion_rates1 = df_test.groupby('customer_idx')['is_converted'].mean()* 100
df_test['customer_idx'] = df_test['customer_idx'].map(conversion_rates1)

In [9]:
conversion_rates3 = df_train.groupby('lead_owner')['is_converted'].mean()* 100
df_train['lead_owner'] = df_train['lead_owner'].map(conversion_rates3)
conversion_rates4 = df_train.groupby('lead_owner')['is_converted'].mean()* 100
df_train['lead_owner'] = df_train['lead_owner'].map(conversion_rates4)

In [10]:
df_train['customer_type'] = df_train['customer_type'].str.lower().str.replace(" ", "")

In [11]:
customer_type_mapping = { 'End Customer': ['endcustomer','end-user','end-customer','constructioncompany', 'owner/developer', 'medical/healthcarefacility', 'government/publicsector', 'corporate', 'education', 'retail', 'fitness'],
                         'Channel Partner' : ['agent','channelpartner', 'distributor','reseller', 'nsp(usonly)', 'nationalreseller', 'regionalreseller', 'si(systemintegrator)', 'proav/avconsultant', 'var(3po)'],
                          'Specifier/ Influencer': ['specifier/influencer','architect', 'consultant', 'contractor', 'technical/designfirm', 'regionbuilder', 'installer', 'ad&contentsprovider', 'appliedrep'],
                          'Solution Eco-Partner': ['solutioneco-partner','cms/webos/isv', 'mount/metalfabrication','meetingsolution', 'control/processor', 'externalcompute'],
                          'Service Partner': ['servicepartner','authorizedservicecenter', 'authorizedservicedealer'], } 
                          
def map_customer_type_category(value):
    for customer_type, values in customer_type_mapping.items():
        if value in values:
            return customer_type
    return value  # 매핑되지 않은 경우 원래 값을 반환

df_train['customer_type'] = df_train['customer_type'].apply(map_customer_type_category)

In [12]:
df_train['inquiry_type'] = df_train['inquiry_type'].str.lower().str.replace(" ", "")
df_test['inquiry_type'] = df_test['inquiry_type'].str.lower().str.replace(" ", "")

In [13]:
continent_mapping = { 'Europe': ['LGEWA', 'LGEMA', 'LGEWR', 'LGEUK', 'LGEFS', 'LGEES', 'LGEEH', 'LGEJE', 'LGEDG', 'LGEIS', 'LGEMK', 'LGEPL', 'LGESW', 'LGEHS', 'LGEAG', 'LGERO', 'LGECZ', 'LGEPT', 'LGEBN', 'LGESC', 'LGELS', 'LGENO','LGEMF'],
 'CSI' : ['LGEUA', 'LGEAK','LGERU', 'LGERA', 'LGERI', 'LGERM', 'LGEUR', 'LGELV'], 
 'China': ['LGETR', 'LGERD', 'LGEHZ', 'LGEND', 'LGEHK', 'LGETT',  'LGEPN', 'LGECH','LGEQH', 'LGESH', 'LGESY', 'LGETA', 'LGEYT', 'LGEKS', 'LGENP', 'LGEHN', 'LGEQD'], 
 'Asia': ['LGEIL', 'LGSI','LGEAP',  'LGETH', 'LGEVN', 'LGEIN', 'LGESL', 'LGEML', 'LGEJP', 'LGEPH', 'LGEVH','LGEKR'],
 'MIDDLE EAST & America': ['LGEEG', 'LGEAT', 'LGESR', 'LGETK', 'LGESA', 'LGEMC', 'LGEGF', 'LGEME', 'LGEOT', 'LGEEC', 'LGENI', 'LGEAF', 'LGELF','LGESJ'],      
 'North America': ['LGEMX', 'LGEMM', 'LGEAI', 'LGECI', 'LGEUS', 'LGEMU', 'LGEMS', 'LGEMR'],
 'SOUTH & CENTRAL AMERICA' :['LGEAR', 'LGEAZ', 'LGECB', 'LGECL', 'LGEPR', 'LGEPS','LGERS', 'LGESP' ],        
 }

 
df_train['Continent'] = df_train['response_corporate'].map({value: continent for continent, values in continent_mapping.items() for value in values})
df_test['Continent'] = df_test['response_corporate'].map({value: continent for continent, values in continent_mapping.items() for value in values})

In [14]:
numeric_columns = df_train.select_dtypes(include='number')

columns_without_missing_values = numeric_columns.columns[numeric_columns.isnull().sum() == 0]

columns_to_exclude = ['id_strategic_ver', 'it_strategic_ver','idit_strategic_ver']
columns_to_exclude.extend(columns_without_missing_values)

columns_to_impute = numeric_columns.drop(columns=columns_to_exclude, errors='ignore')

imputer = IterativeImputer()

df_train_imputed = pd.DataFrame(imputer.fit_transform(columns_to_impute), columns=columns_to_impute.columns)

replacement_values = df_train_imputed.mean()
print("각 열의 대체값:")
print(dict(zip(columns_to_impute.columns, replacement_values)))

각 열의 대체값:
{'com_reg_ver_win_rate': 0.09149000306480037, 'historical_existing_cnt': 17.94179198425886, 'ver_win_rate_x': 0.0011425450075443645, 'ver_win_ratio_per_bu': 0.05576048085559978, 'ver_country': 1.603636335312523}




In [15]:
df_train[columns_to_impute.columns] = df_train[columns_to_impute.columns].fillna(replacement_values)

In [16]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         58317 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     59299 non-null  float64
 4   customer_idx             59299 non-null  float64
 5   customer_type            15338 non-null  object 
 6   enterprise               59299 non-null  object 
 7   historical_existing_cnt  59299 non-null  float64
 8   id_strategic_ver         3444 non-null   float64
 9   it_strategic_ver         1121 non-null   float64
 10  idit_strategic_ver       4565 non-null   float64
 11  customer_job             40566 non-null  object 
 12  lead_desc_length         59299 non-null  int64  
 13  inquiry_type             58358 non-null  object 
 14  product_category      

In [17]:
df_train = df_train.drop(["product_category","customer_country", "com_reg_ver_win_rate", "id_strategic_ver","it_strategic_ver", "idit_strategic_ver","product_subcategory", "product_modelname", "customer_country.1","response_corporate","business_area" ,"business_subarea","ver_cus","ver_pro"], axis=1)
df_test = df_test.drop(["product_category","customer_country", "com_reg_ver_win_rate", "id_strategic_ver","it_strategic_ver", "idit_strategic_ver","product_subcategory", "product_modelname", "customer_country.1","response_corporate","business_area" ,"business_subarea","ver_cus","ver_pro"], axis=1)

### 레이블 인코딩

In [18]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [19]:
# 레이블 인코딩할 칼럼들
label_columns = [
   #"customer_country",
    #"business_subarea",
   # "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    #"product_category",
   # "product_subcategory",
    #"product_modelname",
   # "customer_country.1",
    "customer_position",
    #"response_corporate",
    "expected_timeline",
    "Continent",
    # "Country"
    "Product"
]


df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

    

다시 학습 데이터와 제출 데이터를 분리합니다.

In [20]:
for col in label_columns:
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train) :][col]

### 2-2. 학습, 검증 데이터 분리

In [21]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,)

## 3. 모델 학습

### 모델 정의 

In [22]:
bayesian_params = {
    'max_depth': (6, 16), 
    'num_leaves': (24, 64), 
    'min_child_samples': (10, 200), 
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha': (0.01, 50) 
}

In [23]:
def lgb_f1_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree,max_bin, reg_lambda, reg_alpha):
    params = {
        "n_estimators":500, "learning_rate":0.02,
        'max_depth': int(round(max_depth)), 
        'num_leaves': int(round(num_leaves)), 
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0), 
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),
        'reg_lambda': max(reg_lambda,0),
        'reg_alpha': max(reg_alpha, 0)
    }
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)], eval_metric= 'f1', callbacks = [lgb.early_stopping(stopping_rounds = 100), lgb.log_evaluation(period = 100)],)
    valid_pred = lgb_model.predict(x_val)
    f1 = f1_score(y_val,valid_pred)
    
    return f1

In [24]:
lgbBO = BayesianOptimization(lgb_f1_eval,bayesian_params , random_state=0)
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.0407918	valid_1's binary_logloss: 0.0416801
[200]	training's binary_logloss: 0.0167769	valid_1's binary_logloss: 0.0177545
[300]	training's binary_logloss: 0.0117689	valid_1's binary_logloss: 0.0126404
[400]	training's binary_logloss: 0.0105096	valid_1's binary_logloss: 0.0113621
[500]	training's binary_logloss: 0.0101214	valid_1's binary_logloss: 0.0109865
Did not meet early stopping. Best iteration is:
[500]	training's binary_logloss: 0.0101214	valid_1's binary_logloss: 0.0109865
| [0m1        [0m | [0m0.974    [0m | [0m0.7744   [0m | [0m360.4    [0m | [0m12.03    [0m | [0m113.5    [0m | [0m21.76    [0m | [0m49

In [25]:
target_list = []
for result in lgbBO.res:
    target = result['target']
    target_list.append(target)
print(target_list)
# 가장 큰 target 값을 가지는 순번(index)를 추출
print('maximum target index:', np.argmax(np.array(target_list)))

[0.9739500265816055, 0.9739500265816055, 0.9745222929936306, 0.975531914893617, 0.974468085106383, 0.9750132908027644, 0.9755058572949946, 0.9739500265816055, 0.9750132908027644, 0.9767195767195767, 0.9756613756613757, 0.9762030671602326, 0.9756871035940804, 0.9744952178533475, 0.9739500265816055, 0.9767441860465116, 0.9751454257006875, 0.9745493107104984, 0.9756613756613757, 0.9746031746031746, 0.9750663129973474, 0.9749866950505588, 0.9755058572949946, 0.9739776951672862, 0.9751454257006875, 0.9751191106405506, 0.9761526232114468, 0.9746300211416491, 0.9740603493912122, 0.9744952178533475]
maximum target index: 15


In [26]:
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
print(max_dict)

{'target': 0.9767441860465116, 'params': {'colsample_bytree': 0.7984694893597473, 'max_bin': 280.8125538867223, 'max_depth': 15.561555833121675, 'min_child_samples': 57.07249223589623, 'min_child_weight': 4.053589307787831, 'num_leaves': 34.01763477561242, 'reg_alpha': 5.379945877221973, 'reg_lambda': 4.094651011509741, 'subsample': 0.5859549044680197}}


In [27]:
def train_apps_all(df_train):
    ftr_app = df_train.drop("is_converted", axis=1)
    target_app = df_train["is_converted"]
    train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.2, shuffle=True, random_state=400)
    print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)
    clf = LGBMClassifier(
                nthread=4,
                n_estimators=1000,
                learning_rate=0.02,
                max_depth = 15,
                num_leaves=34,
                colsample_bytree=0.7984,
                subsample=0.5859,
                max_bin=280,
                reg_alpha=5.3799,
                reg_lambda=4.094,
                min_child_weight=4,
                min_child_samples=57,
                silent=-1,
                verbose=-1,
                )

    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'f1', callbacks = [lgb.early_stopping(stopping_rounds = 100), lgb.log_evaluation(period = 100)])
    
    return clf

### 모델 학습

In [28]:
clf = train_apps_all(df_train)

train shape: (47439, 17) valid shape: (11860, 17)
Training until validation scores don't improve for 100 rounds




[100]	training's binary_logloss: 0.0359942	valid_1's binary_logloss: 0.0366811
[200]	training's binary_logloss: 0.0126568	valid_1's binary_logloss: 0.0135755
[300]	training's binary_logloss: 0.00840412	valid_1's binary_logloss: 0.00962959
[400]	training's binary_logloss: 0.00723403	valid_1's binary_logloss: 0.00862832
[500]	training's binary_logloss: 0.00682124	valid_1's binary_logloss: 0.00835666
[600]	training's binary_logloss: 0.00660564	valid_1's binary_logloss: 0.00825543
[700]	training's binary_logloss: 0.00654141	valid_1's binary_logloss: 0.00821787
[800]	training's binary_logloss: 0.00649856	valid_1's binary_logloss: 0.00820343
[900]	training's binary_logloss: 0.00649367	valid_1's binary_logloss: 0.00820284
Early stopping, best iteration is:
[813]	training's binary_logloss: 0.00649593	valid_1's binary_logloss: 0.00820265


### 모델 성능 보기

In [29]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [30]:
pred = clf.predict(x_val)
get_clf_eval(y_val, pred)

오차행렬:
 [[  924    23]
 [   21 10892]]

정확도: 0.9963
정밀도: 0.9778
재현율: 0.9757
F1: 0.9767



## 4. 제출하기

### 테스트 데이터 예측

In [31]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [32]:
test_pred = clf.predict(x_test)
sum(test_pred) # True로 예측된 개수

1410

### 제출 파일 작성

In [33]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**