# 영업 성공 여부 분류 경진대회

# 할 것

결측치 처리
파생변수 생성

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

def seed_everything(seed):
    np.random.seed(seed) #numpy를 사용할 경우 고정
seed_everything(42)

### 데이터 셋 읽어오기

In [2]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [3]:
df_train.head()

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,product_category,product_subcategory,product_modelname,customer_country.1,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,,purchasing,62,Quotation or purchase consultation,multi-split,,,/Quezon City/Philippines,entry level,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,,media and communication,96,Quotation or purchase consultation,multi-split,,,/PH-00/Philippines,ceo/founder,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,,engineering,56,Product Information,single-split,,,/Kolkata /India,partner,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,,entrepreneurship,44,Quotation or purchase consultation,vrf,,,/Bhubaneswar/India,ceo/founder,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,,consulting,97,Quotation or purchase consultation,multi-split,,,/Hyderabad/India,partner,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


In [4]:
# 결측치가 있는 변수 및 결측률 파악 

mis_val = df_train.isnull().sum()
mis_val_bool = mis_val >= 1
mis_val_df = pd.concat([mis_val, mis_val_bool], axis = 1)
mis_val_df = mis_val_df.rename(columns = {0 : 'mis_val', 1 : 'mis_val_bool'})

mis_val_data = mis_val_df.loc[mis_val_df['mis_val_bool'] == True, :]
mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299
mis_val_data_sorted = mis_val_data.sort_values(by='ratio', ascending=True)
mis_val_data_sorted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299


Unnamed: 0,mis_val,mis_val_bool,ratio
inquiry_type,941,True,0.015869
customer_country.1,982,True,0.01656
customer_country,982,True,0.01656
customer_job,18733,True,0.315908
product_category,19374,True,0.326717
expected_timeline,30863,True,0.520464
business_area,40882,True,0.689421
ver_win_rate_x,40882,True,0.689421
customer_type,43961,True,0.741345
ver_win_ratio_per_bu,43995,True,0.741918


In [6]:
# idit_strategic_ver , it_strategic_ver , id_strategic_ver 세 columns 값들이 array([nan,  1.])
# historical_existing_cnt nan = 0

#inquiry_type - 대소문자 통합 -> 전부 소문자로

df_train['inquiry_type'].unique()

array(['Quotation or purchase consultation', 'Product Information',
       'Quotation or Purchase Consultation', 'Other',
       'Usage or technical consultation', 'Trainings', 'Services',
       'Sales Inquiry', 'Etc.', 'Technical Support',
       'Usage or Technical Consultation', 'Technical Consultation',
       'Request for Partnership', nan, 'sales', 'technical',
       'usage or technical consultation',
       'usage_or_technical_consultation', 'other',
       'quotation_or_purchase_consultation', 'other_', 'Request a Demo',
       'Request for Distributorship', 'Request for quotation or purchase',
       'Request for technical consulting', '(Select ID_Needs)',
       'One Quick:Flex', 'AIO', 'Needs', 'Purchase',
       'technical_consultation', 'Customer Suggestions', 'Event Inquiry',
       'Others', 'OEM/ODM Request', 'Hospital TV', 'others',
       'i want to know the details about it', 'EDUCATIONAL EQUIPMENTS',
       'Digital platform', 'TV interactive', 'teach',
       'Di

### inquiry_type - 대소문자 통합

In [10]:
df_train['inquiry_type'] = df_train['inquiry_type'].str.lower()

In [None]:
# 1. 분석의 목적과 목적에 맞는 변수 확인
# 데이터의 구조와 변수 목록을 살펴봅니다.
print("데이터 정보:")
print(df_train.info())

# 2. 데이터형 확인 및 데이터 오류/누락 여부 확인
# 각 열의 데이터 유형 및 결측치 확인
print("\n\n\n데이터 유형 및 결측치:")
print(df_train.dtypes)
print(df_train.isnull().sum())

# 3. 데이터 분포 확인
# 데이터의 기술 통계량을 계산하여 분포를 확인합니다.
print("\n\n\n데이터 기술 통계량:")
print(df_train.describe())

In [None]:
df_train[['customer_country','customer_country.1']].head()

In [None]:
# customer_country와 customer_country.1이 동시에 결측값을 가지는 행의 개수 
df_train[df_train['customer_country.1'].isnull() & df_train['customer_country'].isnull()].shape[0]

In [None]:
pre_train = pre_train.dropna(subset=['customer_country'])

In [None]:
mis_val = pre_train.isnull().sum()
mis_val_bool = mis_val >= 1
mis_val_df = pd.concat([mis_val, mis_val_bool], axis = 1)
mis_val_df = mis_val_df.rename(columns = {0 : 'mis_val', 1 : 'mis_val_bool'})

mis_val_data = mis_val_df.loc[mis_val_df['mis_val_bool'] == True, :]
mis_val_data['ratio'] = mis_val_data['mis_val'] / 59299
mis_val_data_sorted = mis_val_data.sort_values(by='ratio', ascending=True)
mis_val_data_sorted

In [None]:
pre_train['customer_country'] = pre_train['customer_country'].apply(lambda x: x.split('/')[-1])
df_test['customer_country'] = df_test['customer_country'].apply(lambda x: x.split('/')[-1])

In [None]:
pre_train.head()

## 2. 데이터 전처리

### 레이블 인코딩

In [None]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [None]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

label_columns = list(set(label_columns)-set(drop_columns))

df_all = pd.concat([pre_train[label_columns], pre_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [None]:
df_all.head()

다시 학습 데이터와 제출 데이터를 분리합니다.

In [None]:
for col in label_columns:  
    pre_train[col] = df_all.iloc[: len(pre_train)][col]
    pre_test[col] = df_all.iloc[len(pre_train) :][col]

In [None]:
t = pre_train.copy()
corr = t.corr()
corr.style.background_gradient(cmap = 'coolwarm')

In [None]:
pre_train = pre_train.drop(labels = ['customer_country.1'],axis=1)
pre_test = pre_test.drop(labels = ['customer_country.1'],axis=1)

In [None]:
print(pre_train.columns)
print("\n\n\n")
print(pre_test.columns)

### 2-2. 학습, 검증 데이터 분리

In [None]:
x_train, x_val, y_train, y_val = train_test_split(
    pre_train.drop("is_converted", axis=1),
    pre_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

## 3. 모델 학습

### 모델 정의 

In [None]:
model = DecisionTreeClassifier(random_state=42)

### 모델 학습

In [None]:
model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [None]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [None]:
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

## 4. 제출하기

### 테스트 데이터 예측

In [None]:
# 예측에 필요한 데이터 분리
x_test = pre_test.drop(["is_converted"], axis=1)

In [None]:
test_pred = model.predict(x_test.fillna(0))
sum(test_pred) # True로 예측된 개수

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**