데이터 : data_atype.zip (수업노트에서 다운로드)

## 머신러닝
- 문제정의, 라이브러리/데이터 불러오기
- 탐색적 데이터 분석 (EDA)
- 데이터 전처리
- 피처엔지니어링
- (Train/Validation 나누기)
- 모델 선택/훈련/평가/최적화
- 예측
- (csv 생성)

## 1. 베이스라인
- 문제정의, 라이브러리 및 데이터 불러오기
- 데이터 전처리 (단순 일괄 처리)
- 모델 선택, 훈련
- 평가

## 2. 베이스라인
- 훈련/검증용 데이터 분리
- 모델 선택, 훈련
    - 의사결정나무
    - 랜덤포레스트
    - XGBoost
- 평가

## 문제1
- "<= 50K -> 0"
- "> 50K -> 1"
- 평가: 정확도

In [1]:
# Load the library and read the train/test features and the train labels
import pandas as pd

X_train, X_test, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_atype_y/X_train.csv',
        'data_atype_y/X_test.csv',
        'data_atype_y/y_train.csv',
    )
)

In [2]:
# Data sizes: one (rows, columns) tuple per line
print(X_train.shape, X_test.shape, y_train.shape, sep='\n')

(29304, 15)
(3257, 15)
(29304, 2)


In [3]:
# Data sample: first three rows of the training features, then of the labels
display(X_train.head(3))
y_train.head(3)

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,3331,34.0,State-gov,177331,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,Black,Male,4386,0,40.0,United-States
1,19749,58.0,Private,290661,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40.0,United-States
2,1157,48.0,Private,125933,Some-college,10,Widowed,Exec-managerial,Unmarried,Black,Female,0,1669,38.0,United-States


Unnamed: 0,id,income
0,3331,>50K
1,19749,<=50K
2,1157,<=50K


In [4]:
# Number of target rows
len(y_train)

29304

In [5]:
# Check column dtypes and non-null counts
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29304 entries, 0 to 29303
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              29304 non-null  int64  
 1   age             29292 non-null  float64
 2   workclass       27642 non-null  object 
 3   fnlwgt          29304 non-null  int64  
 4   education       29304 non-null  object 
 5   education.num   29304 non-null  int64  
 6   marital.status  29304 non-null  object 
 7   occupation      27636 non-null  object 
 8   relationship    29304 non-null  object 
 9   race            29304 non-null  object 
 10  sex             29304 non-null  object 
 11  capital.gain    29304 non-null  int64  
 12  capital.loss    29304 non-null  int64  
 13  hours.per.week  29291 non-null  float64
 14  native.country  28767 non-null  object 
dtypes: float64(2), int64(5), object(8)
memory usage: 3.4+ MB


In [6]:
# Numeric feature columns (the only columns the models below are trained on)
cols = [
    'age', 'fnlwgt', 'education.num',
    'capital.gain', 'capital.loss', 'hours.per.week',
]

In [7]:
# Summary statistics of the numeric columns
X_train[cols].describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,29292.0,29304.0,29304.0,29304.0,29304.0,29291.0
mean,38.553223,189748.8,10.080842,1093.858722,86.744506,40.434229
std,13.628811,105525.0,2.570824,7477.43564,401.518928,12.324036
min,-38.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117789.0,9.0,0.0,0.0,40.0
50%,37.0,178376.5,10.0,0.0,0.0,40.0
75%,48.0,237068.2,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [8]:
# Missing values per numeric column (training set)
X_train[cols].isna().sum()

age               12
fnlwgt             0
education.num      0
capital.gain       0
capital.loss       0
hours.per.week    13
dtype: int64

In [9]:
# Simple missing-value handling: fill with the column mean.
# The mean is computed on the training set only and reused for the test set,
# so both sets are transformed identically and no test-set statistics are
# used during preprocessing.
for col in ('age', 'hours.per.week'):
    train_mean = X_train[col].mean()
    X_train[col] = X_train[col].fillna(train_mean)
    X_test[col] = X_test[col].fillna(train_mean)

In [10]:
# Verify that no missing values remain in the training set
X_train[cols].isna().sum()

age               0
fnlwgt            0
education.num     0
capital.gain      0
capital.loss      0
hours.per.week    0
dtype: int64

In [11]:
# Verify that no missing values remain in the test set
X_test[cols].isna().sum()

age               0
fnlwgt            0
education.num     0
capital.gain      0
capital.loss      0
hours.per.week    0
dtype: int64

In [12]:
# 베이스 라인에서는 그외 전처리 및 피처 엔지니어링 생략

In [13]:
# Encode the target as integers:
#   '>50K' -> 1, every other value (i.e. '<=50K') -> 0
y = y_train['income'].eq('>50K').astype(int)

In [14]:
# Inspect the encoded target
y

0        1
1        0
2        0
3        1
4        0
        ..
29299    0
29300    0
29301    0
29302    0
29303    0
Name: income, Length: 29304, dtype: int64

## 머신러닝 모델

In [15]:
# Random forest baseline, trained on the numeric columns only.
# random_state is fixed so that repeated runs give the same model and score.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train[cols], y)
pred = rf.predict(X_test[cols])  # hard 0/1 class labels for the test set


In [16]:
# Number of rows in the test set
X_test.shape[0]

3257

In [17]:
# Load the test labels and display them
y_test = pd.read_csv('data_atype_y/y_test.csv')
y_test

Unnamed: 0,id,income
0,11574,<=50K
1,15847,<=50K
2,17655,<=50K
3,19790,>50K
4,31812,<=50K
...,...,...
3252,20589,<=50K
3253,5668,<=50K
3254,27652,<=50K
3255,14735,<=50K


In [18]:
# Put the test ids next to the predicted labels and save them as a csv file
df = pd.DataFrame({'id': X_test['id'], 'income': pred})
df.to_csv('4_피처엔지니어링_1.csv', index=False)

In [19]:
# Inspect the raw training labels (y_train)
y_train

Unnamed: 0,id,income
0,3331,>50K
1,19749,<=50K
2,1157,<=50K
3,693,>50K
4,12522,<=50K
...,...,...
29299,15999,<=50K
29300,21604,<=50K
29301,26839,<=50K
29302,16681,<=50K


In [21]:
# Evaluation with accuracy (test takers cannot see this part)
from sklearn.metrics import accuracy_score
ans = y_test['income'].eq('>50K').astype(int)
accuracy_score(ans, pred)

0.8105618667485416

In [22]:
# Inspect the test labels
y_test

Unnamed: 0,id,income
0,11574,<=50K
1,15847,<=50K
2,17655,<=50K
3,19790,>50K
4,31812,<=50K
...,...,...
3252,20589,<=50K
3253,5668,<=50K
3254,27652,<=50K
3255,14735,<=50K


## 문제2
- "<= 50K -> 0"
- "> 50K -> 1"
- 평가: roc_auc (예측해야 할 값: 확률)

## 검증용 데이터 분리

In [23]:
# Split the training data into a training part and a validation part (10%)
from sklearn.model_selection import train_test_split
y = y_train['income'].eq('>50K').astype(int)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y,
    test_size=0.1,
    random_state=42,
)

In [24]:
# Sizes of the four pieces produced by the split
tuple(part.shape for part in (X_tr, X_val, y_tr, y_val))

((26373, 15), (2931, 15), (26373,), (2931,))

In [26]:
# Decision tree
# random_state is fixed so that repeated runs give the same tree and score.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_tr[cols], y_tr)
# predict_proba returns one column per class; column 1 is used for scoring
pred = dt.predict_proba(X_val[cols])

In [29]:
# ROC AUC on the validation split, scored with the probabilities in column 1 of pred
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, pred[:, 1])

0.6893131820554703

In [30]:
# Random forest
# random_state is fixed so that repeated runs give the same model and score.
# NOTE: the fitted model is stored in `dt`, the same name the decision tree
# cell uses, so `dt` refers to the random forest after this cell runs.
from sklearn.ensemble import RandomForestClassifier
dt = RandomForestClassifier(random_state=42)
dt.fit(X_tr[cols], y_tr)
pred = dt.predict_proba(X_val[cols])

In [31]:
# ROC AUC on the validation split, scored with the probabilities in column 1 of pred
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, pred[:, 1])

0.8203653244525179

In [36]:
# XGBoost
# NOTE: the fitted model is stored in `dt`, the same name the earlier
# classifier cells use, so `dt` refers to the XGBoost model from here on.
from xgboost import XGBClassifier
dt = XGBClassifier()
dt.fit(X_tr[cols], y_tr)
pred = dt.predict_proba(X_val[cols])

In [33]:
# ROC AUC on the validation split, scored with the probabilities in column 1 of pred
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, pred[:, 1])

0.8649952219518607

In [37]:
# Predict on the test data and write the submission csv.
# `dt` is whichever model was fitted last before this cell runs; `pred` is
# overwritten here with the test-set probabilities.
from sklearn.metrics import roc_auc_score
pred = dt.predict_proba(X_test[cols])
submit = pd.DataFrame(
    {
        'id': X_test['id'],
        # column 1 of predict_proba: probability of the class encoded as 1
        'income': pred[:, 1],
    }
)
submit.to_csv("22222.csv", index=False)

# 평가
- 수험자는 알 수 없는 영역임

In [39]:
# Score the test-set probabilities against the true test labels
# (test takers cannot see this part).
from sklearn.metrics import roc_auc_score
y_test = pd.read_csv("data_atype_y/y_test.csv")
# Encode the labels exactly as the training target was encoded
# ('>50K' -> 1, anything else -> 0) so an unexpected or missing label
# is never silently counted as the positive class.
ans = (y_test['income'] == '>50K').astype(int)
roc_auc_score(ans, pred[:,1])

0.8780026455026455