#### 작업형2 기출 유형(심화) 여행 보험 패키지 상품을 구매할 확률 값을 구하시오
 - 예측할 값(y): TravelInsurance (여행보험 패키지를 구매 했는지 여부 0:구매안함, 1:구매)
 - 평가: roc-auc 평가지표
 - data: t2-1-train.csv, t2-1-test.csv

In [153]:
import os
from os.path import join

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

## 1. 데이터 불러오기


In [154]:
train = join('data', 't2-1-train.csv')
test = join('data', 't2-1-test.csv')
submission = join('data', 't2-1-sample_submission.csv')

In [155]:
train = pd.read_csv(train)
test = pd.read_csv(test)
sub = pd.read_csv(submission)

In [156]:
train.head(5)

Unnamed: 0,id,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,10000,28,Private Sector/Self Employed,Yes,1250000.0,6,1,No,No,0
1,10001,31,Private Sector/Self Employed,Yes,1250000.0,7,1,No,No,0
2,10002,29,Private Sector/Self Employed,Yes,1200000.0,7,0,No,No,1
3,10003,33,Government Sector,Yes,650000.0,6,1,No,No,1
4,10004,28,Private Sector/Self Employed,Yes,800000.0,6,0,No,Yes,1


In [157]:
test.head(5)

Unnamed: 0,id,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad
0,0,32,Government Sector,Yes,650000.0,5,0,No,No
1,1,27,Government Sector,Yes,500000.0,5,0,No,No
2,2,32,Government Sector,Yes,350000.0,2,0,No,No
3,3,26,Private Sector/Self Employed,Yes,1400000.0,6,0,No,Yes
4,4,27,Private Sector/Self Employed,No,350000.0,3,0,No,No


### 1.1 데이터 기초통계량 및 결측치 확인

In [158]:
train.describe(include='object')

Unnamed: 0,Employment Type,GraduateOrNot,FrequentFlyer,EverTravelledAbroad
count,1490,1490,1490,1490
unique,2,2,2,2
top,Private Sector/Self Employed,Yes,No,No
freq,1056,1270,1175,1209


In [159]:
test.describe(include='object')
# train과 test의 Employment Type 개수가 다름

Unnamed: 0,Employment Type,GraduateOrNot,FrequentFlyer,EverTravelledAbroad
count,497,497,497,497
unique,3,2,2,2
top,Private Sector/Self Employed,Yes,No,No
freq,360,422,395,398


In [160]:
train['Employment Type'].value_counts(), test['Employment Type'].value_counts()

(Private Sector/Self Employed    1056
 Government Sector                434
 Name: Employment Type, dtype: int64,
 Private Sector/Self Employed    360
 Government Sector               134
 Casual employment                 3
 Name: Employment Type, dtype: int64)

In [161]:
train.describe()

Unnamed: 0,id,Age,AnnualIncome,FamilyMembers,ChronicDiseases,TravelInsurance
count,1490.0,1490.0,1486.0,1490.0,1490.0,1490.0
mean,10744.5,29.6,931123.8,4.755705,0.280537,0.352349
std,430.270264,2.887829,376487.4,1.603613,0.449412,0.477862
min,10000.0,25.0,300000.0,2.0,0.0,0.0
25%,10372.25,28.0,600000.0,4.0,0.0,0.0
50%,10744.5,29.0,900000.0,5.0,0.0,0.0
75%,11116.75,32.0,1250000.0,6.0,1.0,1.0
max,11489.0,35.0,1800000.0,9.0,1.0,1.0


In [162]:
test.describe()

Unnamed: 0,id,Age,AnnualIncome,FamilyMembers,ChronicDiseases
count,497.0,497.0,494.0,497.0,497.0
mean,248.0,29.800805,939372.5,4.744467,0.269618
std,143.615807,2.986286,379298.8,1.629211,0.444208
min,0.0,25.0,300000.0,2.0,0.0
25%,124.0,28.0,600000.0,4.0,0.0
50%,248.0,29.0,900000.0,5.0,0.0
75%,372.0,33.0,1250000.0,6.0,1.0
max,496.0,35.0,1750000.0,9.0,1.0


In [163]:
train.isnull().sum()
# AnnualIncome 결측치 4개

id                     0
Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           4
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
TravelInsurance        0
dtype: int64

In [164]:
test.isnull().sum()
# AnnualIncome 결측치 3개

id                     0
Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           3
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
dtype: int64

In [165]:
# AnnualIncome의 평균으로 결측값 대체
train['AnnualIncome'] = train['AnnualIncome'].fillna(train['AnnualIncome'].mean())
test['AnnualIncome'] = test['AnnualIncome'].fillna(test['AnnualIncome'].mean())

In [166]:
train.isnull().sum(), test.isnull().sum() 
# 결측치 제거 완료

(id                     0
 Age                    0
 Employment Type        0
 GraduateOrNot          0
 AnnualIncome           0
 FamilyMembers          0
 ChronicDiseases        0
 FrequentFlyer          0
 EverTravelledAbroad    0
 TravelInsurance        0
 dtype: int64,
 id                     0
 Age                    0
 Employment Type        0
 GraduateOrNot          0
 AnnualIncome           0
 FamilyMembers          0
 ChronicDiseases        0
 FrequentFlyer          0
 EverTravelledAbroad    0
 dtype: int64)

###  더미 만들기

In [167]:
# train = pd.get_dummies(train)
# test = pd.get_dummies(test)

In [168]:
x_train = train.drop(columns=['TravelInsurance'])
y_train = train['TravelInsurance']

In [169]:
all_df = pd.concat([x_train, test])
all_df = pd.get_dummies(all_df)
x_train = all_df[:x_train.shape[0]]
test = all_df[x_train.shape[0]:]


In [170]:
x_train

Unnamed: 0,id,Age,AnnualIncome,FamilyMembers,ChronicDiseases,Employment Type_Casual employment,Employment Type_Government Sector,Employment Type_Private Sector/Self Employed,GraduateOrNot_No,GraduateOrNot_Yes,FrequentFlyer_No,FrequentFlyer_Yes,EverTravelledAbroad_No,EverTravelledAbroad_Yes
0,10000,28,1250000.0,6,1,0,0,1,0,1,1,0,1,0
1,10001,31,1250000.0,7,1,0,0,1,0,1,1,0,1,0
2,10002,29,1200000.0,7,0,0,0,1,0,1,1,0,1,0
3,10003,33,650000.0,6,1,0,1,0,0,1,1,0,1,0
4,10004,28,800000.0,6,0,0,0,1,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,11485,28,800000.0,4,0,0,0,1,0,1,1,0,1,0
1486,11486,34,1000000.0,9,0,0,0,1,0,1,1,0,1,0
1487,11487,26,450000.0,5,1,0,0,1,0,1,1,0,1,0
1488,11488,25,1150000.0,3,1,0,0,1,1,0,1,0,0,1


In [171]:
scaler = MinMaxScaler()

x_train['AnnualIncome'] = scaler.fit_transform(x_train[['AnnualIncome']])
test['AnnualIncome'] = scaler.fit_transform(test[['AnnualIncome']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['AnnualIncome'] = scaler.fit_transform(x_train[['AnnualIncome']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['AnnualIncome'] = scaler.fit_transform(test[['AnnualIncome']])


In [172]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1043, 14), (447, 14), (1043,), (447,))

In [176]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
pred = model.predict_proba(X_val)

In [181]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, pred[:,1])

0.7944312274209181

In [182]:
pred = model.predict_proba(test)

In [185]:
sub['TravelInsurance'] = pred[:,1]

In [186]:
sub

Unnamed: 0,id,TravelInsurance
0,0,0.35
1,1,0.42
2,2,0.19
3,3,0.99
4,4,0.25
...,...,...
492,492,0.21
493,493,0.82
494,494,0.22
495,495,0.99


In [None]:
# sub.to_csv('hi.csv')