## 모듈 import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib

# Modeling
from pycaret.classification  import *
from sklearn.tree import DecisionTreeClassifier

# Warnings
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")

# Scaling
from sklearn.preprocessing import PowerTransformer

# Label Encoding
from sklearn.preprocessing import LabelEncoder

# Report the versions of the key libraries and of the interpreter running this kernel.
# `pycaret` is imported explicitly here: without it the name is only available if the
# star-import from pycaret.classification happens to re-export it.
import pycaret

print('Pandas : %s'%(pd.__version__))
print('Numpy : %s'%(np.__version__))
print('matplotlib : %s'%(matplotlib.__version__))
print('pycaret : %s'%(pycaret.__version__))
# Use the kernel's own interpreter version; `!python --version` would report whichever
# `python` executable is first on PATH, which may differ from the running kernel.
print('Python %s' % sys.version.split()[0])

Pandas : 1.3.2
Numpy : 1.19.5
matplotlib : 3.3.4
pycaret : 2.3.3
Python 3.8.8


사용한 버전  
Pandas : 1.3.2  
Numpy : 1.19.5  
matplotlib : 3.3.4  
pycaret : 2.3.3  
Python 3.8.8

## data 불러오기

In [2]:
# Directory that holds the competition CSV files.
data_path = './daplatformers2021/'

# Read, in this order, the training set, the test set and the sample-submission
# template from that directory.
train, test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('trainset.csv', 'testset_final.csv', 'sample_submission_final.csv')
)

In [3]:
# Columns removed from the training frame before it is stacked with the test frame.
# NOTE(review): presumably these columns do not exist in the test set — confirm.
dropped_train_cols = ['매니저최초가입일', '매니저최초서비스일', '매니저주소']
train = train.drop(dropped_train_cols, axis=1)

### 전처리

In [4]:
# Stack train on top of test with a fresh 0..n-1 index so both are preprocessed together.
data = pd.concat([train, test], ignore_index=True)
data.head(3)

Unnamed: 0,SEQ,접수일,접수시각,장기서비스여부,최초서비스일,전체회차,현재회차,서비스일자,서비스시작시간,서비스종료시간,기존고객여부,결재형태,서비스주소,주거형태,평수,고객가입일,반려동물,부재중여부,우선청소,쿠폰사용여부,매니저생년월일,매니저성별,매니저사용휴대폰,매니저이동방법,근무가능지역,CS교육이수여부,청소교육이수여부,부재중서비스가능여부,추천인여부,매칭성공여부
0,T06420,2019-07-09,,1,2019-07-15,3,3,2019-07-29,9:00:00,13:00:00,1,무통장입금,충남 천안시,일반주택,,2019-04-22,,,,0,1956-04-19,0,안드로이드,대중교통,서울,0.0,0.0,1,0,0.0
1,T15430,2019-07-12,,1,2019-07-15,4,3,2019-08-20,10:00:00,16:00:00,1,무통장입금,충남 천안시,일반주택,,2019-05-21,없음,,,0,1976-06-10,0,안드로이드,대중교통,천안/아산,1.0,1.0,1,0,0.0
2,T23790,2020-11-18,,1,2020-11-23,10,6,2020-12-11,9:00:00,13:00:00,1,신용카드,충남 천안시,일반주택,40평대이상,2020-10-14,없음,0.0,,0,1970-04-05,0,안드로이드,대중교통,천안/아산,0.0,0.0,0,0,0.0


- 결측치 채움

In [5]:
# Flag columns: a missing value is filled with 0.
for flag_col in ['청소교육이수여부', 'CS교육이수여부', '부재중여부']:
    data[flag_col] = data[flag_col].fillna(0)

# Columns filled with their most frequent value (mode).
# Original note: 고객가입일 and 결재형태 started raising errors on the test set after the
# data was updated, which is why they are imputed here.
for mode_col in ['매니저이동방법', '매니저사용휴대폰', '고객가입일', '결재형태', '접수시각']:
    data[mode_col] = data[mode_col].fillna(data[mode_col].mode()[0])

# Missing pet / priority-cleaning entries are judged to mean "none" ('없음').
for none_col in ['반려동물', '우선청소']:
    data[none_col] = data[none_col].fillna('없음')

# Floor area: per the original note the mean was checked to be about 31,
# so missing values go to the '30평대' bucket.
data['평수'] = data['평수'].fillna('30평대')

In [6]:
# Keep only the first four characters of the manager's birth date (the year part of
# a 'YYYY-MM-DD' string). Original note: this aligns the train and test structure.
data['매니저생년월일'] = data['매니저생년월일'].astype('str').str[:4]

- Make features

In [7]:
# --- Manager age features ---
# Age relative to 2021, from the 4-character birth year produced earlier.
data['매니저나이'] = 2021 - data['매니저생년월일'].astype('int64')
# Decade bucket built from the first digit of the age, e.g. 45 -> '40대'.
data['매니저나이대'] = data['매니저나이'].astype(str).str[0] + '0대'

# --- Service duration ---
# Duration is end minus start. The previous code subtracted the end time from the
# start time, which produced negative durations.
service_start = pd.to_datetime(data['서비스시작시간'], format="%H:%M:%S")
service_end = pd.to_datetime(data['서비스종료시간'], format="%H:%M:%S")
# Stored as a string so it is handled as a categorical value downstream.
data['서비스시간'] = (service_end - service_start).astype('str')

# --- Months since the customer signed up ---
# 고객가입일 is a 'YYYY-MM-DD' string: characters 0-3 are the year, 5-6 the month.
signup_year = data['고객가입일'].str[0:4].astype('int64')
signup_month = data['고객가입일'].str[5:7].astype('int64')
# Months from the signup month to December 2021. This is the simplified form of the
# original expression (2021-(year+1))*12 + 12-month+1 + 11 and yields the same values.
months_since_signup = (2021 - signup_year) * 12 + 12 - signup_month
# Computed once and stored twice: as a string (categorical) and as an integer (numeric).
data['고객가입개월수_str'] = months_since_signup.astype(str)
data['고객가입개월수_int'] = months_since_signup

# Interaction term: manager age multiplied by customer tenure in months.
data['나이*개월수'] = data['매니저나이'] * data['고객가입개월수_int']

In [8]:
# 저장해놓기
data_SEQ = data.SEQ
# 인코딩을 위하여 제거 / 이후 나중에 병합함
y_data = data['매칭성공여부']
data = data.drop(columns = ['SEQ','매칭성공여부'])

- Encoding & Scaling

In [9]:
# Flag columns that are cast to strings by hand so they are treated as categorical
# rather than numeric.
flag_columns = ['장기서비스여부', '기존고객여부', '부재중여부', '쿠폰사용여부', '매니저성별',
                'CS교육이수여부', '청소교육이수여부', '부재중서비스가능여부', '추천인여부']
data[flag_columns] = data[flag_columns].astype('str')

# Partition the columns by dtype: object columns are categorical, the rest numeric.
cat_features = data.select_dtypes(include=['object']).columns.to_list()
num_features = data.select_dtypes(exclude='object').columns.to_list()

# Scale the numeric columns with a standardising power transform.
scaler = PowerTransformer(standardize=True)
data[num_features] = scaler.fit_transform(data[num_features])

# Label-encode each categorical column with its own, freshly fitted encoder.
for cat_col in cat_features:
    encoder = LabelEncoder()
    data[cat_col] = encoder.fit_transform(data[cat_col])

In [10]:
# Features kept for modelling; every other column is discarded here.
selected_features = [
    '매니저생년월일', '서비스주소', '고객가입일', '부재중서비스가능여부', '매니저이동방법',
    '근무가능지역', '청소교육이수여부', '매니저나이대', '서비스종료시간', '전체회차',
    '접수시각', '나이*개월수', '고객가입개월수_str',
]
data = data[selected_features]

In [11]:
# Put the identifier (first column) and the target (last column) that were set aside
# earlier back around the selected features.
data = pd.concat([data_SEQ, data, y_data], axis=1)
data.head(3)

Unnamed: 0,SEQ,매니저생년월일,서비스주소,고객가입일,부재중서비스가능여부,매니저이동방법,근무가능지역,청소교육이수여부,매니저나이대,서비스종료시간,전체회차,접수시각,나이*개월수,고객가입개월수_str,매칭성공여부
0,T06420,3,48,6,1,0,3,0,4,3,-0.65041,379,2.015637,24,0.0
1,T15430,23,48,18,1,0,6,1,2,6,-0.131658,379,0.569809,23,0.0
2,T23790,17,48,202,0,0,6,0,3,3,1.998486,379,-0.876181,4,0.0


- Data split

In [12]:
# Split the combined frame back into its train and test parts.
# `data` was built as concat([train, test]) with a reset index, so its first len(train)
# rows are the training rows. Deriving the boundary from `train` replaces the hard-coded
# row count (23009), which would silently mis-split if the input files changed.
n_train = len(train)
train = data.iloc[:n_train, :]
test = data.iloc[n_train:, :]
# Drop the target column from the test part and renumber its rows from 0.
test = test.drop(columns = ['매칭성공여부']).reset_index(drop=True)

## Modeling with pycaret

- Model setting

In [14]:
# Original note: run this cell when no (feature) selection step is used.
# Removes the row identifier so it is not passed to the model as a feature.
train = train.drop('SEQ', axis=1)

In [15]:
# Options for the PyCaret experiment, gathered in one place.
setup_options = dict(
    target = '매칭성공여부',   # column to predict
    preprocess = False,        # the frame was already encoded/scaled above
    train_size = 0.999,        # fraction of rows assigned to PyCaret's training split
    fold_shuffle = True,
    session_id = 42,           # fixed seed for reproducibility
    use_gpu = False,
    silent = True,
)
cl = setup(train, **setup_options)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,매칭성공여부
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(23009, 14)"
5,Missing Values,False
6,Numeric Features,13
7,Categorical Features,0
8,Transformed Train Set,"(22985, 13)"
9,Transformed Test Set,"(24, 13)"


In [16]:
# Evaluate the candidate list (only KNN here) with 5-fold CV, ranked by AUC,
# and keep the single best model.
top = compare_models(
    include=['knn'],
    fold=5,
    sort='AUC',
    n_select=1,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9319,0.9069,0.5796,0.665,0.6189,0.5818,0.5836,0.614


- Create Model

In [17]:
# Baseline KNN; per the original note, tuning improves its performance.
baseline_knn = create_model('knn')

# Hyperparameter search: 30 iterations, 5-fold CV, optimising AUC.
# choose_better=True is passed so the better of baseline and tuned model is returned.
knn = tune_model(
    baseline_knn,
    fold = 5,
    n_iter = 30,
    optimize = 'AUC',
    choose_better = True,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9759,0.967,0.795,0.9432,0.8628,0.8497,0.8534
1,0.97,0.9447,0.7426,0.9288,0.8253,0.8091,0.8152
2,0.9739,0.9536,0.7659,0.952,0.8489,0.8348,0.8407
3,0.9741,0.9644,0.7864,0.9326,0.8533,0.8392,0.8429
4,0.9724,0.95,0.7705,0.9288,0.8422,0.8272,0.8316
Mean,0.9732,0.956,0.7721,0.9371,0.8465,0.832,0.8367
SD,0.002,0.0085,0.0181,0.0091,0.0125,0.0135,0.0128


- Fit & predict

In [18]:
# Use the tuned KNN as the final model and score the prepared test frame.
model = knn
pred_holdout = predict_model(estimator=model, data=test)

In [19]:
# PyCaret's 'Score' is the probability of the predicted 'Label', so convert it into the
# format the submission expects: keep Score where the label is '1.0', otherwise 1 - Score.
scores = pred_holdout['Score']
predicted_positive = pred_holdout['Label'] == '1.0'
pred = pd.Series(np.where(predicted_positive, scores, 1 - scores))

In [20]:
# Write the predicted probabilities into the submission template's 'pred' column.
submission = submission.assign(pred=pred)

### Make submission 

In [21]:
# Output file name for the submission.
fname = 'knn_0.84066.csv'

# Save without the index column, then confirm on stdout.
submission.to_csv(fname, index=False)
ready_message = "'{}' is ready to submit.".format(fname)
print(ready_message)

'knn_0.84066.csv' is ready to submit.


# END