# 영업 성공 여부 분류 경진대회

# 할 것

결측치 처리
파생변수 생성

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

def seed_everything(seed):
    np.random.seed(seed) #numpy를 사용할 경우 고정
seed_everything(42)

### 데이터 셋 읽어오기

In [2]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [3]:
df_train.head()

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,product_category,product_subcategory,product_modelname,customer_country.1,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,,purchasing,62,Quotation or purchase consultation,multi-split,,,/Quezon City/Philippines,entry level,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,,media and communication,96,Quotation or purchase consultation,multi-split,,,/PH-00/Philippines,ceo/founder,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,,engineering,56,Product Information,single-split,,,/Kolkata /India,partner,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,,entrepreneurship,44,Quotation or purchase consultation,vrf,,,/Bhubaneswar/India,ceo/founder,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,,consulting,97,Quotation or purchase consultation,multi-split,,,/Hyderabad/India,partner,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


In [4]:
df_test.columns

Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

In [5]:
# 결측치가 있는 변수 및 결측률 파악 

mis_val = df_train.isnull().sum()
mis_val_bool = mis_val >= 1
mis_val_df = pd.concat([mis_val, mis_val_bool], axis = 1)
mis_val_df = mis_val_df.rename(columns = {0 : 'mis_val', 1 : 'mis_val_bool'})

mis_val_data = mis_val_df.loc[mis_val_df['mis_val_bool'] == True, :]
mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299
mis_val_data_sorted = mis_val_data.sort_values(by='ratio', ascending=True)
mis_val_data_sorted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299


Unnamed: 0,mis_val,mis_val_bool,ratio
inquiry_type,941,True,0.015869
customer_country.1,982,True,0.01656
customer_country,982,True,0.01656
customer_job,18733,True,0.315908
product_category,19374,True,0.326717
expected_timeline,30863,True,0.520464
business_area,40882,True,0.689421
ver_win_rate_x,40882,True,0.689421
customer_type,43961,True,0.741345
ver_win_ratio_per_bu,43995,True,0.741918


In [6]:
# idit_strategic_ver , it_strategic_ver , id_strategic_ver 세 columns 값들이 array([nan,  1.])
# historical_existing_cnt nan = 0

#inquiry_type - 대소문자 통합 -> 전부 소문자로

# print(df_train['customer_type'].str.lower().unique())
# print(df_train['customer_type'].unique())

In [7]:
# customer_country , customer_country.1의 값이 일치함.
df_train = df_train.drop(labels = ['customer_country.1'],axis=1)
df_test = df_test.drop(labels = ['customer_country.1','id'],axis=1)

dtypes = df_train.dtypes

# 각 열의 고유값 개수 확인
unique_value_counts = df_train.nunique()

# 수치형 변수가 아닌 컬럼을 저장할 리스트 초기화
object_columns = []

# 각 열에 대해 데이터 타입이 'object'이거나 고유값 개수가 일정 이하인 경우를 확인하여 non_numeric_columns 리스트에 추가
for col in df_train.columns:
    if dtypes[col] == 'object':
        object_columns.append(col)

# 수치형 변수가 아닌 컬럼 출력
print("수치형 변수가 아닌 컬럼:", object_columns)

수치형 변수가 아닌 컬럼: ['customer_country', 'business_unit', 'customer_type', 'enterprise', 'customer_job', 'inquiry_type', 'product_category', 'product_subcategory', 'product_modelname', 'customer_position', 'response_corporate', 'expected_timeline', 'business_area', 'business_subarea']


### 전처리
inquiry_type,customer_country - 대소문자 통합
customer_country 결측치 행 삭제

In [8]:
def preprocess_1(col):
    df_train[col] = df_train[col].str.lower()
    df_test[col] = df_test[col].str.lower()
    
    mask_train = df_train[col].notnull()
    mask_test = df_test[col].notnull()
    df_train.loc[mask_train, col] = df_train.loc[mask_train, col].apply(lambda x: x.replace(" ", ""))
    df_test.loc[mask_test, col] = df_test.loc[mask_test, col].apply(lambda x: x.replace(" ", ""))    

In [9]:
for n in object_columns:
    preprocess_1(n)
#test data에 customer_country = nan 이 없어서 행삭제 해도 될듯. 삭제양도 많지 않음.
df_train = df_train.dropna(subset=['customer_country'])

# 보류
# df_train['customer_country'] = df_train['customer_country'].apply(lambda x: x.split('/')[-1])
# df_test['customer_country'] = df_test['customer_country'].apply(lambda x: x.split('/')[-1])

In [10]:
print(df_train['inquiry_type'].unique())

['quotationorpurchaseconsultation' 'productinformation' 'other'
 'usageortechnicalconsultation' 'trainings' 'services' 'salesinquiry'
 'etc.' 'technicalsupport' 'technicalconsultation' 'requestforpartnership'
 nan 'sales' 'technical' 'other_' 'quotation_or_purchase_consultation'
 'requestademo' 'requestfordistributorship'
 'requestforquotationorpurchase' 'requestfortechnicalconsulting'
 '(selectid_needs)' 'aio' 'needs' 'purchase' 'technical_consultation'
 'customersuggestions' 'eventinquiry' 'others' 'oem/odmrequest'
 'hospitaltv' 'iwanttoknowthedetailsaboutit' 'educationalequipments'
 'digitalplatform' 'tvinteractive' 'teach' 'displaytextbookandphotos'
 'quotation_' 'purchaseorquotation' 'displayproduct' 'firstinfoandpricing'
 'estoybuscandoparaecuadoresteproductolgmagnitmicroled,paraunclientede138pulgadas,conenviómarítimo.'
 'holamepuedencotizar19pantallasinteractivasde100pulgadasentregadasenguayaquil-ecuador.'
 'probeamprecio' 'hoteltvproducts' 'pantallasinteractivasparaclinicas'
 '

### 레이블 인코딩

In [11]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [12]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# label_columns = list(set(label_columns)-set(drop_columns))

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [13]:
df_all.head()

Unnamed: 0,customer_country,business_subarea,business_area,business_unit,customer_type,enterprise,customer_job,inquiry_type,product_category,product_subcategory,product_modelname,customer_position,response_corporate,expected_timeline
0,6870,27,0,0,9,0,414,31,142,140,654,43,33,246
1,6502,0,0,0,9,0,301,31,142,140,654,17,33,246
2,4483,15,0,0,9,0,156,26,198,140,654,73,21,246
3,1449,46,0,0,9,0,162,31,271,140,654,17,21,246
4,3785,58,0,0,27,0,82,31,142,140,654,73,21,246


다시 학습 데이터와 제출 데이터를 분리합니다.

In [14]:
for col in label_columns:  
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train):][col]

In [15]:
t = df_train.copy()
corr = t.corr()
corr.style.background_gradient(cmap = 'coolwarm')

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,product_category,product_subcategory,product_modelname,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
bant_submit,1.0,0.014132,0.093514,-0.4019,-0.111744,0.044764,0.049249,-0.202554,,,,-0.162599,0.129802,-0.053672,-0.110363,0.041586,-0.102689,-0.366774,-0.178652,-0.438212,0.10187,0.12958,0.100099,-0.017558,-0.01324,-0.086293,-0.12872,0.004953
customer_country,0.014132,1.0,-0.08537,0.361025,-0.022651,-0.130596,0.078896,0.095438,,,,0.052881,0.004748,0.029911,0.016525,-0.091512,0.125827,0.005792,0.276812,0.001464,0.00548,-0.053779,-0.099545,-0.048863,-0.022945,-0.00067,0.047428,0.027623
business_unit,0.093514,-0.08537,1.0,0.457015,-0.079789,-0.091532,0.135267,-0.222576,,,,-0.043355,-0.018265,-0.297078,-0.227605,-0.035911,-0.26356,0.007644,-0.078395,-0.065033,0.036166,0.138858,-0.028205,0.516659,-0.085159,-0.018213,0.386873,0.072735
com_reg_ver_win_rate,-0.4019,0.361025,0.457015,1.0,-0.001267,-0.272203,0.174313,-0.085987,,,,0.140052,-0.104642,-0.064038,-0.022478,-0.025378,0.079409,0.179765,0.329749,0.103701,-0.089233,-0.080051,-0.263012,0.44157,-0.063165,0.000988,0.428008,0.345959
customer_idx,-0.111744,-0.022651,-0.079789,-0.001267,1.0,-0.004735,-0.16177,0.13122,,,,0.108913,-0.04763,0.003826,-0.048245,0.008388,0.060016,0.063256,-0.055118,0.017617,-0.038895,-0.04997,-0.018197,0.006204,0.018889,0.031539,-0.039002,-0.057532
customer_type,0.044764,-0.130596,-0.091532,-0.272203,-0.004735,1.0,-0.136302,-0.063984,,,,-0.029323,-0.120434,0.104915,-0.003426,-0.012211,0.045726,-0.03664,-0.166604,-0.023068,-0.345735,-0.08271,0.004497,-0.122615,0.04976,0.147593,-0.04031,-0.081691
enterprise,0.049249,0.078896,0.135267,0.174313,-0.16177,-0.136302,1.0,-0.082836,,,,-0.024549,0.197756,-0.039554,0.071589,0.054888,-0.127969,-0.02629,0.151881,-0.018429,0.084181,0.123083,0.095935,0.129902,-0.077526,-0.059667,0.12606,0.125135
historical_existing_cnt,-0.202554,0.095438,-0.222576,-0.085987,0.13122,-0.063984,-0.082836,1.0,,,,0.082041,0.025895,0.060556,0.125732,-0.001116,0.03271,0.092657,0.122554,0.084742,-0.014562,0.022316,0.082987,-0.008509,0.027247,0.009627,0.191709,-0.005119
id_strategic_ver,,,,,,,,,,,,,,,,,,,,,,,,,,,,
it_strategic_ver,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [16]:
print(df_train.columns)
print('\n\n\n')
print(df_test.columns)

Index(['bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_position', 'response_corporate',
       'expected_timeline', 'ver_cus', 'ver_pro', 'ver_win_rate_x',
       'ver_win_ratio_per_bu', 'business_area', 'business_subarea',
       'lead_owner', 'is_converted'],
      dtype='object')




Index(['bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_positio

### 2-2. 학습, 검증 데이터 분리

In [17]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

## 3. 모델 학습

### 모델 정의 

In [18]:
model = DecisionTreeClassifier(random_state=42)

### 모델 학습

In [19]:
model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [20]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [21]:
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[  765   191]
 [  197 10511]]

정확도: 0.9667
정밀도: 0.7952
재현율: 0.8002
F1: 0.7977


## 4. 제출하기

### 테스트 데이터 예측

In [46]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted"], axis=1)

In [47]:
test_pred = model.predict(x_test.fillna(0))
sum(test_pred) # True로 예측된 개수

889

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**