# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

def seed_everything(seed):
    np.random.seed(seed) #numpy를 사용할 경우 고정
seed_everything(42)

### 데이터 셋 읽어오기

In [None]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

### 삭제할 컬럼

In [None]:
drop_columns = [
    'customer_type',
    'ver_win_ratio_per_bu',
    'com_reg_ver_win_rate',
    'historical_existing_cnt',
    'product_subcategory',
    'product_modelname',
    'business_subarea',
    'idit_strategic_ver',
    'id_strategic_ver',
    'it_strategic_ver'
]

pre_train = df_train.drop(labels = drop_columns,axis=1)
pre_test = df_test.drop(labels = drop_columns + ['id'],axis=1)

In [19]:
print(len(df_train.columns))
print(len(pre_train.columns))

29
19


In [20]:
# 1. 분석의 목적과 목적에 맞는 변수 확인
# 데이터의 구조와 변수 목록을 살펴봅니다.
print("데이터 정보:")
print(df_train.info())

# 2. 데이터형 확인 및 데이터 오류/누락 여부 확인
# 각 열의 데이터 유형 및 결측치 확인
print("\n\n\n데이터 유형 및 결측치:")
print(df_train.dtypes)
print(df_train.isnull().sum())

# 3. 데이터 분포 확인
# 데이터의 기술 통계량을 계산하여 분포를 확인합니다.
print("\n\n\n데이터 기술 통계량:")
print(df_train.describe())

데이터 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         58317 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_idx             59299 non-null  int64  
 5   customer_type            15338 non-null  object 
 6   enterprise               59299 non-null  object 
 7   historical_existing_cnt  13756 non-null  float64
 8   id_strategic_ver         3444 non-null   float64
 9   it_strategic_ver         1121 non-null   float64
 10  idit_strategic_ver       4565 non-null   float64
 11  customer_job             40566 non-null  object 
 12  lead_desc_length         59299 non-null  int64  
 13  inquiry_type             58358 non-null  object 
 14  product_catego

In [21]:
# 결측치가 있는 변수 및 결측률 파악 - 결측률이 0.7이상인 column 17개 제거 ?

mis_val = df_train.isnull().sum()
mis_val_bool = mis_val >= 1
mis_val_df = pd.concat([mis_val, mis_val_bool], axis = 1)
mis_val_df = mis_val_df.rename(columns = {0 : 'mis_val', 1 : 'mis_val_bool'})

mis_val_data = mis_val_df.loc[mis_val_df['mis_val_bool'] == True, :]
mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299
mis_val_data_sorted = mis_val_data.sort_values(by='ratio', ascending=True)
mis_val_data_sorted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299


Unnamed: 0,mis_val,mis_val_bool,ratio
inquiry_type,941,True,0.015869
customer_country.1,982,True,0.01656
customer_country,982,True,0.01656
customer_job,18733,True,0.315908
product_category,19374,True,0.326717
expected_timeline,30863,True,0.520464
business_area,40882,True,0.689421
ver_win_rate_x,40882,True,0.689421
customer_type,43961,True,0.741345
ver_win_ratio_per_bu,43995,True,0.741918


In [22]:
df_train[['customer_country','customer_country.1']].head()

Unnamed: 0,customer_country,customer_country.1
0,/Quezon City/Philippines,/Quezon City/Philippines
1,/PH-00/Philippines,/PH-00/Philippines
2,/Kolkata /India,/Kolkata /India
3,/Bhubaneswar/India,/Bhubaneswar/India
4,/Hyderabad/India,/Hyderabad/India


In [23]:
# customer_country와 customer_country.1이 동시에 결측값을 가지는 행의 개수 
df_train[df_train['customer_country.1'].isnull() & df_train['customer_country'].isnull()].shape[0]

982

In [24]:
pre_train = pre_train.dropna(subset=['customer_country'])

In [25]:
mis_val = pre_train.isnull().sum()
mis_val_bool = mis_val >= 1
mis_val_df = pd.concat([mis_val, mis_val_bool], axis = 1)
mis_val_df = mis_val_df.rename(columns = {0 : 'mis_val', 1 : 'mis_val_bool'})

mis_val_data = mis_val_df.loc[mis_val_df['mis_val_bool'] == True, :]
mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299
mis_val_data_sorted = mis_val_data.sort_values(by='ratio', ascending=True)
mis_val_data_sorted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299


Unnamed: 0,mis_val,mis_val_bool,ratio
inquiry_type,369,True,0.006223
customer_job,18184,True,0.306649
product_category,18920,True,0.319061
expected_timeline,29991,True,0.505759
ver_win_rate_x,40704,True,0.68642
business_area,40704,True,0.68642


In [26]:
pre_train['customer_country'] = pre_train['customer_country'].apply(lambda x: x.split('/')[-1])
df_test['customer_country'] = df_test['customer_country'].apply(lambda x: x.split('/')[-1])

In [27]:
pre_train.head()

Unnamed: 0,bant_submit,customer_country,business_unit,customer_idx,enterprise,customer_job,lead_desc_length,inquiry_type,product_category,customer_country.1,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,business_area,lead_owner,is_converted
0,1.0,Philippines,AS,32160,Enterprise,purchasing,62,Quotation or purchase consultation,multi-split,/Quezon City/Philippines,entry level,LGEPH,less than 3 months,1,0,0.003079,corporate / office,0,True
1,1.0,Philippines,AS,23122,Enterprise,media and communication,96,Quotation or purchase consultation,multi-split,/PH-00/Philippines,ceo/founder,LGEPH,less than 3 months,1,0,0.003079,corporate / office,1,True
2,1.0,India,AS,1755,Enterprise,engineering,56,Product Information,single-split,/Kolkata /India,partner,LGEIL,less than 3 months,1,0,0.003079,corporate / office,2,True
3,1.0,India,AS,4919,Enterprise,entrepreneurship,44,Quotation or purchase consultation,vrf,/Bhubaneswar/India,ceo/founder,LGEIL,less than 3 months,1,0,0.003079,corporate / office,3,True
4,1.0,India,AS,17126,Enterprise,consulting,97,Quotation or purchase consultation,multi-split,/Hyderabad/India,partner,LGEIL,less than 3 months,0,0,0.003079,corporate / office,4,True


## 2. 데이터 전처리

### 레이블 인코딩

In [28]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [29]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

label_columns = list(set(label_columns)-set(drop_columns))

df_all = pd.concat([pre_train[label_columns], pre_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [30]:
df_all.head()

Unnamed: 0,customer_job,product_category,enterprise,response_corporate,expected_timeline,business_unit,business_area,customer_country,inquiry_type,customer_position,customer_country.1
0,416,148,0,33,245,0,0,2812,25,44,9070
1,299,148,0,33,245,0,0,2812,25,18,8406
2,156,201,0,21,245,0,0,2713,20,75,6535
3,162,272,0,21,245,0,0,2713,25,18,3388
4,82,148,0,21,245,0,0,2713,25,75,5799


다시 학습 데이터와 제출 데이터를 분리합니다.

In [31]:
for col in label_columns:  
    pre_train[col] = df_all.iloc[: len(pre_train)][col]
    pre_test[col] = df_all.iloc[len(pre_train) :][col]

In [32]:
t = pre_train.copy()
corr = t.corr()
corr.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,bant_submit,customer_country,business_unit,customer_idx,enterprise,customer_job,lead_desc_length,inquiry_type,product_category,customer_country.1,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,business_area,lead_owner,is_converted
bant_submit,1.0,0.029924,0.093514,-0.111744,0.049249,-0.163744,0.129802,0.006216,-0.112932,0.043622,-0.368547,-0.178652,-0.441636,0.10187,0.12958,0.100099,-0.01324,-0.12872,0.004953
customer_country,0.029924,1.0,-0.024723,-0.031221,-0.052297,-0.078696,-0.023916,0.089985,0.003265,-0.064245,-0.101968,-0.165758,0.039061,0.028974,0.03479,-0.072832,-0.007972,0.06426,0.041692
business_unit,0.093514,-0.024723,1.0,-0.079789,0.135267,-0.043978,-0.018265,-0.339731,-0.236213,-0.049652,0.007117,-0.078395,-0.065927,0.036166,0.138858,-0.028205,-0.085159,0.386873,0.072735
customer_idx,-0.111744,-0.031221,-0.079789,1.0,-0.16177,0.109655,-0.04763,0.007948,-0.045086,-0.031657,0.063134,-0.055118,0.017813,-0.038895,-0.04997,-0.018197,0.018889,-0.039002,-0.057532
enterprise,0.049249,-0.052297,0.135267,-0.16177,1.0,-0.025576,0.197756,-0.074995,0.068323,0.083174,-0.024997,0.151881,-0.018829,0.084181,0.123083,0.095935,-0.077526,0.12606,0.125135
customer_job,-0.163744,-0.078696,-0.043978,0.109655,-0.025576,1.0,-0.025114,0.015963,-0.002785,0.045567,0.142038,0.085404,0.044628,-0.062278,-0.057354,-0.025459,0.040447,-0.036285,-0.018811
lead_desc_length,0.129802,-0.023916,-0.018265,-0.04763,0.197756,-0.025114,1.0,-0.035574,0.014451,-0.002319,-0.090144,0.035408,-0.03214,0.095507,0.114769,0.018479,-0.003521,0.037923,0.115217
inquiry_type,0.006216,0.089985,-0.339731,0.007948,-0.074995,0.015963,-0.035574,1.0,0.126731,0.054653,-0.028454,0.021065,-0.023195,-0.080991,-0.054334,-8.3e-05,0.032558,-0.172774,-0.034227
product_category,-0.112932,0.003265,-0.236213,-0.045086,0.068323,-0.002785,0.014451,0.126731,1.0,0.010604,0.051294,0.105224,0.053052,0.015065,0.008366,0.07561,-0.007955,-0.017701,0.023117
customer_country.1,0.043622,-0.064245,-0.049652,-0.031657,0.083174,0.045567,-0.002319,0.054653,0.010604,1.0,-9.5e-05,0.257276,-0.01428,0.004848,-0.053754,-0.104346,-0.021505,0.053473,0.031412


In [33]:
pre_train = pre_train.drop(labels = ['customer_country.1'],axis=1)
pre_test = pre_test.drop(labels = ['customer_country.1'],axis=1)

In [34]:
print(pre_train.columns)
print("\n\n\n")
print(pre_test.columns)

Index(['bant_submit', 'customer_country', 'business_unit', 'customer_idx',
       'enterprise', 'customer_job', 'lead_desc_length', 'inquiry_type',
       'product_category', 'customer_position', 'response_corporate',
       'expected_timeline', 'ver_cus', 'ver_pro', 'ver_win_rate_x',
       'business_area', 'lead_owner', 'is_converted'],
      dtype='object')




Index(['bant_submit', 'customer_country', 'business_unit', 'customer_idx',
       'enterprise', 'customer_job', 'lead_desc_length', 'inquiry_type',
       'product_category', 'customer_position', 'response_corporate',
       'expected_timeline', 'ver_cus', 'ver_pro', 'ver_win_rate_x',
       'business_area', 'lead_owner', 'is_converted'],
      dtype='object')


### 2-2. 학습, 검증 데이터 분리

In [35]:
x_train, x_val, y_train, y_val = train_test_split(
    pre_train.drop("is_converted", axis=1),
    pre_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

## 3. 모델 학습

### 모델 정의 

In [37]:
model = DecisionTreeClassifier(random_state=42)

### 모델 학습

In [38]:
model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [39]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [40]:
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[  745   211]
 [  235 10473]]

정확도: 0.9618
정밀도: 0.7602
재현율: 0.7793
F1: 0.7696


## 4. 제출하기

### 테스트 데이터 예측

In [41]:
# 예측에 필요한 데이터 분리
x_test = pre_test.drop(["is_converted"], axis=1)

In [42]:
test_pred = model.predict(x_test.fillna(0))
sum(test_pred) # True로 예측된 개수

1050

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**