# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler

import numpy as np
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

def seed_everything(seed):
    np.random.seed(seed) #numpy를 사용할 경우 고정
seed_everything(42)

### 데이터 셋 읽어오기

In [2]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [5]:
# 결측치가 있는 변수 및 결측률 파악 

mis_val = df_train.isnull().sum()
mis_val_bool = mis_val >= 1
mis_val_df = pd.concat([mis_val, mis_val_bool], axis = 1)
mis_val_df = mis_val_df.rename(columns = {0 : 'mis_val', 1 : 'mis_val_bool'})

mis_val_data = mis_val_df.loc[mis_val_df['mis_val_bool'] == True, :]
mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299
mis_val_data_sorted = mis_val_data.sort_values(by='ratio', ascending=True)
mis_val_data_sorted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299


Unnamed: 0,mis_val,mis_val_bool,ratio
inquiry_type,941,True,0.015869
customer_country.1,982,True,0.01656
customer_country,982,True,0.01656
customer_job,18733,True,0.315908
product_category,19374,True,0.326717
expected_timeline,30863,True,0.520464
business_area,40882,True,0.689421
ver_win_rate_x,40882,True,0.689421
customer_type,43961,True,0.741345
ver_win_ratio_per_bu,43995,True,0.741918


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         58317 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_idx             59299 non-null  int64  
 5   customer_type            15338 non-null  object 
 6   enterprise               59299 non-null  object 
 7   historical_existing_cnt  13756 non-null  float64
 8   id_strategic_ver         3444 non-null   float64
 9   it_strategic_ver         1121 non-null   float64
 10  idit_strategic_ver       4565 non-null   float64
 11  customer_job             40566 non-null  object 
 12  lead_desc_length         59299 non-null  int64  
 13  inquiry_type             58358 non-null  object 
 14  product_category      

# 전처리
customer_country.1 열 삭제

customer_country 결측치 행 삭제

float -> 범주형 데이터로 변환 ['it_strategic_ver','id_strategic_ver','idit_strategic_ver','lead_owner','ver_cus','ver_pro','customer_idx']

inquiry_type,customer_country - 대소문자 통합

이상치 - MinMaxScale

In [7]:
# customer_country , customer_country.1의 값이 일치함.
pre_train = df_train.drop(labels = ['customer_country.1'],axis=1)
pre_test = df_test.drop(labels = ['customer_country.1','id'],axis=1)

# test data customer_country = nan 이 없어서 행삭제 해도 될듯. 삭제양도 많지 않음.
pre_train = pre_train.dropna(subset=['customer_country'])

# 도시/국가 -> 국가
df_train['customer_country'] = df_train['customer_country'].apply(lambda x: x.split('/')[-1])
df_test['customer_country'] = df_test['customer_country'].apply(lambda x: x.split('/')[-1])

범주형 컬럼: ['customer_country', 'business_unit', 'customer_type', 'enterprise', 'customer_job', 'inquiry_type', 'product_category', 'product_subcategory', 'product_modelname', 'customer_position', 'response_corporate', 'expected_timeline', 'business_area', 'business_subarea'] 

수치형 컬럼 : ['is_converted', 'it_strategic_ver', 'bant_submit', 'customer_idx', 'ver_win_rate_x', 'lead_owner', 'com_reg_ver_win_rate', 'lead_desc_length', 'ver_win_ratio_per_bu', 'id_strategic_ver', 'ver_pro', 'historical_existing_cnt', 'idit_strategic_ver', 'ver_cus']


In [8]:
dtypes = pre_train.dtypes

# 각 열의 고유값 개수 확인
unique_value_counts = pre_train.nunique()
object_columns = []

# 각 열에 대해 데이터 타입이 'object'인 경우 non_numeric_columns 리스트에 추가
for col in pre_train.columns:
    if dtypes[col] == 'object':
        object_columns.append(col)

# 수치형 변수가 아닌 컬럼 출력
print("범주형 컬럼:", object_columns,"\n")
print("수치형 컬럼 :", list(set(pre_train.columns)-set(object_columns)))

"""
범주형 데이터 변환 이유 -> array([nan,  1.]) 값으로 구성되어 연속형 데이터가 아니라고 판단
"""
columns_categorized = ['it_strategic_ver','id_strategic_ver','idit_strategic_ver','lead_owner','ver_cus','ver_pro','customer_idx']

for n in columns_categorized:
    pre_train[n] = pre_train[n].fillna(0)
    pre_train[n] = pre_train[n].astype('int').astype('object')

In [11]:
def preprocess_1(col):
    # 대소문자 통일
    pre_train[col] = pre_train[col].str.lower()
    pre_test[col] = pre_test[col].str.lower()
    
    # 공백제거
    mask_train = pre_train[col].notnull()
    mask_test = pre_test[col].notnull()
    pre_train.loc[mask_train, col] = pre_train.loc[mask_train, col].apply(lambda x: x.replace(" ", ""))
    pre_test.loc[mask_test, col] = pre_test.loc[mask_test, col].apply(lambda x: x.replace(" ", ""))    

for n in object_columns:
    preprocess_1(n)

object_columns = object_columns + ['it_strategic_ver','id_strategic_ver','idit_strategic_ver','lead_owner','ver_cus','ver_pro','customer_idx']
numerical_columns = list(set(pre_train.columns)-set(object_columns))

In [13]:
# 각 컬럼별로 이상치 판별 임계값을 계산하는 함수
def find_outlier_thresholds(data, threshold=3):
    thresholds = {}
    min_outliers = {}  # 각 컬럼별 최소 이상치 값을 저장하기 위한 딕셔너리
    
    for column in numerical_columns:
        # 각 컬럼의 Z-Score 계산
        z_scores = (data[column] - data[column].mean()) / data[column].std()
        
        # Z-Score가 임계값보다 큰 데이터를 이상치로 판단
        outliers = data[np.abs(z_scores) > threshold]
        
        # 이상치 판별 임계값 저장
        thresholds[column] = {
            'threshold': threshold,
            'outliers_count': len(outliers),
            'outliers_indices': list(outliers.index)
        }
        
        # 최소 이상치 값 찾기
        if len(outliers) > 0:
            min_outliers[column] = outliers[column].min()
    
    return thresholds, min_outliers

# 예시 데이터 프레임 생성 (스트링 데이터를 가진 'C' 컬럼 추가)

# 이상치 판별 임계값 계산 (임계값은 3으로 설정)
outlier_thresholds, min_outliers = find_outlier_thresholds(pre_train, threshold=3)

# Y_LABEL 값에 따라 이상치 개수와 각 컬럼별 최소 이상치 값을 구분하여 출력
for column, threshold_info in outlier_thresholds.items():
    print(f"Column: {column}")
    print(f"Threshold: {threshold_info['threshold']}")
    
    outliers_true = len([idx for idx in threshold_info['outliers_indices'] if pre_train.loc[idx, 'is_converted'] == True])
    print(f"Number of Outliers : {outliers_true}")
    outliers_false = len([idx for idx in threshold_info['outliers_indices'] if pre_train.loc[idx, 'is_converted'] == False])
    print(f"Number of Outliers : {outliers_false}")
    
    
    # 최소 이상치 값 출력
    if column in min_outliers:
        print(f"Min Outlier Value: {min_outliers[column]}")

    print()


Column: is_converted
Threshold: 3
Number of Outliers : 4715
Number of Outliers : 0
Min Outlier Value: True

Column: bant_submit
Threshold: 3
Number of Outliers : 0
Number of Outliers : 0

Column: ver_win_rate_x
Threshold: 3
Number of Outliers : 0
Number of Outliers : 0

Column: com_reg_ver_win_rate
Threshold: 3
Number of Outliers : 340
Number of Outliers : 493
Min Outlier Value: 0.6153846153846154

Column: lead_desc_length
Threshold: 3
Number of Outliers : 189
Number of Outliers : 1186
Min Outlier Value: 475

Column: ver_win_ratio_per_bu
Threshold: 3
Number of Outliers : 20
Number of Outliers : 82
Min Outlier Value: 0.2272727272727272

Column: historical_existing_cnt
Threshold: 3
Number of Outliers : 22
Number of Outliers : 43
Min Outlier Value: 158.0



In [18]:
features = ['com_reg_ver_win_rate','lead_desc_length','ver_win_ratio_per_bu','historical_existing_cnt']

print(pre_train[features].min())
print(pre_train[features].max())

com_reg_ver_win_rate       0.003788
lead_desc_length           1.000000
ver_win_ratio_per_bu       0.011583
historical_existing_cnt    0.000000
dtype: float64
com_reg_ver_win_rate          1.000000
lead_desc_length           1264.000000
ver_win_ratio_per_bu          0.285714
historical_existing_cnt    1394.000000
dtype: float64


In [20]:
scaler = MinMaxScaler()

pre_train[features] = scaler.fit_transform(pre_train[features]) # fit_transform
pre_test[features] = scaler.transform(pre_test[features]) # transform

In [22]:
pre_train[features].head()

Unnamed: 0,com_reg_ver_win_rate,lead_desc_length,ver_win_ratio_per_bu,historical_existing_cnt
0,0.063118,0.048298,0.055676,
1,0.063118,0.075218,0.055676,0.008608
2,0.085425,0.043547,0.055676,0.1033
3,0.085425,0.034046,0.055676,
4,0.085425,0.07601,0.055676,


## 3. 모델 학습

### 모델 정의 

In [23]:
pre_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58317 entries, 0 to 59298
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              58317 non-null  float64
 1   customer_country         58317 non-null  object 
 2   business_unit            58317 non-null  object 
 3   com_reg_ver_win_rate     13927 non-null  float64
 4   customer_idx             58317 non-null  object 
 5   customer_type            15310 non-null  object 
 6   enterprise               58317 non-null  object 
 7   historical_existing_cnt  13615 non-null  float64
 8   id_strategic_ver         58317 non-null  object 
 9   it_strategic_ver         58317 non-null  object 
 10  idit_strategic_ver       58317 non-null  object 
 11  customer_job             40133 non-null  object 
 12  lead_desc_length         58317 non-null  float64
 13  inquiry_type             57948 non-null  object 
 14  product_category         39

### 모델 학습

In [26]:
# Stratified K-Fold 교차 검증 설정
n_splits = 5  # 원하는 Fold 수
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# 최적의 하이퍼파라미터 설정
best_params = {'depth': 10, 'iterations': 1500, 'learning_rate': 0.01}

# 교차 검증 수행 및 평가 지표 저장
macro_f1_scores = []
probabilities_list = []
best_f1_score = 0

train_x = pre_train.drop(columns=['is_converted'])
train_y = pre_train['is_converted']
train_x = train_x.fillna(0)

X = train_x.values
y = train_y.values

for train_index, valid_index in skf.split(X, y):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    # CatBoost 모델 생성 및 학습
    model = CatBoostClassifier(loss_function='Logloss', cat_features=np.where(pre_train.dtypes == np.object)[0], eval_metric='F1', 
                               random_seed=42, verbose=500, **best_params)
    
    model.fit(X_train, y_train,)

    # 검증 데이터에 대한 예측 및 평가 (Macro F1 Score)
    valid_predictions = model.predict(X_valid)
    valid_predictions = np.array([True if pred == 'True' else False for pred in valid_predictions])
    
    macro_f1 = f1_score(y_valid, valid_predictions, average='macro')
    macro_f1_scores.append(macro_f1)

    if macro_f1 > best_f1_score:
        best_f1_score = macro_f1
        best_classification_model = model

# Macro F1 Score 출력
print("Macro F1 Scores:", macro_f1_scores)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  model = CatBoostClassifier(loss_function='Logloss', cat_features=np.where(pre_train.dtypes == np.object)[0], eval_metric='F1',


0:	learn: 0.2607004	total: 131ms	remaining: 3m 16s
500:	learn: 0.8613396	total: 1m 27s	remaining: 2m 54s
1000:	learn: 0.9010327	total: 3m	remaining: 1m 29s
1499:	learn: 0.9252007	total: 4m 30s	remaining: 0us
[ True  True False ... False False False] 



[ True  True  True ... False False False]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  model = CatBoostClassifier(loss_function='Logloss', cat_features=np.where(pre_train.dtypes == np.object)[0], eval_metric='F1',


0:	learn: 0.4824609	total: 176ms	remaining: 4m 23s
500:	learn: 0.8657067	total: 1m 26s	remaining: 2m 53s
1000:	learn: 0.9033181	total: 2m 57s	remaining: 1m 28s
1499:	learn: 0.9265616	total: 4m 26s	remaining: 0us
[False False False ... False False False] 



[ True  True False ... False False False]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  model = CatBoostClassifier(loss_function='Logloss', cat_features=np.where(pre_train.dtypes == np.object)[0], eval_metric='F1',


0:	learn: 0.4895085	total: 45.1ms	remaining: 1m 7s
500:	learn: 0.8590445	total: 1m 24s	remaining: 2m 48s
1000:	learn: 0.8992826	total: 2m 55s	remaining: 1m 27s
1499:	learn: 0.9214245	total: 4m 25s	remaining: 0us
[ True  True False ... False False False] 



[ True  True  True ... False False False]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  model = CatBoostClassifier(loss_function='Logloss', cat_features=np.where(pre_train.dtypes == np.object)[0], eval_metric='F1',


0:	learn: 0.0000000	total: 70.8ms	remaining: 1m 46s
500:	learn: 0.8666373	total: 1m 26s	remaining: 2m 52s
1000:	learn: 0.9040390	total: 2m 58s	remaining: 1m 28s
1499:	learn: 0.9267880	total: 4m 29s	remaining: 0us
[ True  True False ... False False False] 



[ True  True  True ... False False False]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  model = CatBoostClassifier(loss_function='Logloss', cat_features=np.where(pre_train.dtypes == np.object)[0], eval_metric='F1',


0:	learn: 0.2721919	total: 50.4ms	remaining: 1m 15s
500:	learn: 0.8621753	total: 1m 26s	remaining: 2m 52s
1000:	learn: 0.8968983	total: 2m 56s	remaining: 1m 27s
1499:	learn: 0.9163946	total: 4m 28s	remaining: 0us
[False False  True ... False False False] 



[ True  True  True ... False False False]
Macro F1 Scores: [0.9295240476928819, 0.9222885334012731, 0.9330904972912779, 0.920540002965627, 0.9262522009081642]


In [None]:
df_train['']

## 4. 제출하기

### 테스트 데이터 예측

In [27]:
for n in columns_categorized:
    pre_test[n] = pre_test[n].fillna(0)
    pre_test[n] = pre_test[n].astype('int').astype('object')

In [28]:
# 예측에 필요한 데이터 분리
x_test = pre_test.drop(["is_converted"], axis=1)
x_test.head()

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,product_category,product_subcategory,product_modelname,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner
0,0.0,//brazil,id,0.069725,47466,endcustomer,enterprise,0.03802,0,0,0,consulting,0.001584,,,,,none,lgesp,,1,0,0.001183,0.139558,retail,electronics&telco,278
1,0.25,400nstateoffranklinrdcloudit/johnsoncity/unite...,it,,5405,endcustomer,smb,,0,0,0,,0.017419,quotationorpurchaseconsultation,,,,none,lgeus,,0,0,1.3e-05,,transportation,others,437
2,1.0,//u.a.e,id,,13597,specifier/influencer,smb,,0,0,0,informationtechnology,0.108472,quotationorpurchaseconsultation,hospitaltv,ut662mseries,50ut662m(mea),manager,lgegf,lessthan3months,0,0,6e-05,0.436158,hospital&healthcare,generalhospital,874
3,0.5,/madison/unitedstates,id,0.115293,17204,,enterprise,,0,0,0,sales,0.001584,quotationorpurchaseconsultation,,digitalsignageorcommercialtvs,,none,lgeus,morethanayear,0,0,0.001183,0.139558,retail,,194
4,1.0,/saopaulo/brazil,id,0.071431,2329,endcustomer,enterprise,0.001435,1,0,1,engineering,0.083927,quotationorpurchaseconsultation,ledsignage,lgledbloc,lsaa012,others,lgesp,lessthan3months,1,1,0.003079,0.193276,corporate/office,engineering,167


In [29]:
train_x.head()

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,product_category,product_subcategory,product_modelname,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner
0,1.0,/quezoncity/philippines,as,0.063118,32160,end-customer,enterprise,0.0,0,0,0,purchasing,0.048298,quotationorpurchaseconsultation,multi-split,0,0,entrylevel,lgeph,lessthan3months,1,0,0.003079,0.055676,corporate/office,engineering,0
1,1.0,/ph-00/philippines,as,0.063118,23122,end-customer,enterprise,0.008608,0,0,0,mediaandcommunication,0.075218,quotationorpurchaseconsultation,multi-split,0,0,ceo/founder,lgeph,lessthan3months,1,0,0.003079,0.055676,corporate/office,advertising,1
2,1.0,/kolkata/india,as,0.085425,1755,end-customer,enterprise,0.1033,0,0,0,engineering,0.043547,productinformation,single-split,0,0,partner,lgeil,lessthan3months,1,0,0.003079,0.055676,corporate/office,construction,2
3,1.0,/bhubaneswar/india,as,0.085425,4919,end-customer,enterprise,0.0,0,0,0,entrepreneurship,0.034046,quotationorpurchaseconsultation,vrf,0,0,ceo/founder,lgeil,lessthan3months,1,0,0.003079,0.055676,corporate/office,it/software,3
4,1.0,/hyderabad/india,as,0.085425,17126,specifier/influencer,enterprise,0.0,0,0,0,consulting,0.07601,quotationorpurchaseconsultation,multi-split,0,0,partner,lgeil,lessthan3months,0,0,0.003079,0.055676,corporate/office,0,4


In [30]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              5271 non-null   float64
 1   customer_country         5271 non-null   object 
 2   business_unit            5271 non-null   object 
 3   com_reg_ver_win_rate     1788 non-null   float64
 4   customer_idx             5271 non-null   object 
 5   customer_type            3814 non-null   object 
 6   enterprise               5271 non-null   object 
 7   historical_existing_cnt  1275 non-null   float64
 8   id_strategic_ver         5271 non-null   object 
 9   it_strategic_ver         5271 non-null   object 
 10  idit_strategic_ver       5271 non-null   object 
 11  customer_job             3832 non-null   object 
 12  lead_desc_length         5271 non-null   float64
 13  inquiry_type             3979 non-null   object 
 14  product_category        

In [31]:
test_pred = model.predict(x_test.fillna(0))
print(test_pred)
sum(int(pred == 'True') for pred in test_pred)

['False' 'True' 'False' ... 'False' 'False' 'True']


684

### 제출 파일 작성

In [32]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**