# 머신러닝
- 문제정의, 라이브러리/데이터 불러오기
- 탐색적 데이터 분석 (EDA)
- 데이터 전처리
- 피처엔지니어링
- (Train/Validation 나누기)
- 모델 선택/훈련/평가/최적화
- 예측
- (csv 생성)

In [29]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
     ---------------------------------------- 99.8/99.8 MB 6.6 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3




1. 베이스라인
- 문제정의, 라이브러리 및 데이터 불러오기
- 데이터 전처리 (단순 일괄 처리)
- 모델 선택, 훈련
- 평가

2. 베이스라인
- 훈련/검증용 데이터 분리
- 모델 선택, 훈련
    - 의사결정나무
    - 랜덤 포레스트
    - XGBoost
- 평가

문제 1
# <= 50K -> 0
# > 50K -> 1
# 평가 : 정확도

In [1]:
# Load pandas and read the four CSV inputs: train features, test features,
# train labels and test labels.
import pandas as pd

# NOTE(review): y_test.csv is read from the working directory while the other
# three files come from data_atype_y/ — confirm that this is intentional.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data_atype_y/X_train.csv",
        "data_atype_y/X_test.csv",
        "data_atype_y/y_train.csv",
        "y_test.csv",
    )
)

In [2]:
# Data size: (rows, columns) of the train features, test features and train labels
X_train.shape, X_test.shape, y_train.shape

((29304, 15), (3257, 15), (29304, 2))

In [3]:
# Data sample: show the first rows of the training features
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,3331,34.0,State-gov,177331,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,Black,Male,4386,0,40.0,United-States
1,19749,58.0,Private,290661,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40.0,United-States
2,1157,48.0,Private,125933,Some-college,10,Widowed,Exec-managerial,Unmarried,Black,Female,0,1669,38.0,United-States
3,693,58.0,Private,100313,Some-college,10,Married-civ-spouse,Protective-serv,Husband,White,Male,0,1902,40.0,United-States
4,12522,41.0,Private,195661,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,54.0,United-States


In [4]:
# Count how many training rows fall into each class of the 'income' target
y_train['income'].value_counts()

<=50K    22263
>50K      7041
Name: income, dtype: int64

In [5]:
# Show the first rows of the training labels
y_train.head()

Unnamed: 0,id,income
0,3331,>50K
1,19749,<=50K
2,1157,<=50K
3,693,>50K
4,12522,<=50K


In [6]:
# Check each column's dtype and non-null count
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29304 entries, 0 to 29303
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              29304 non-null  int64  
 1   age             29292 non-null  float64
 2   workclass       27642 non-null  object 
 3   fnlwgt          29304 non-null  int64  
 4   education       29304 non-null  object 
 5   education.num   29304 non-null  int64  
 6   marital.status  29304 non-null  object 
 7   occupation      27636 non-null  object 
 8   relationship    29304 non-null  object 
 9   race            29304 non-null  object 
 10  sex             29304 non-null  object 
 11  capital.gain    29304 non-null  int64  
 12  capital.loss    29304 non-null  int64  
 13  hours.per.week  29291 non-null  float64
 14  native.country  28767 non-null  object 
dtypes: float64(2), int64(5), object(8)
memory usage: 3.4+ MB


In [7]:
# Names of the numeric feature columns; later cells select them with
# X_train[cols] / X_test[cols].
cols = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]

In [8]:
# Summary statistics of the numeric columns
# NOTE(review): the recorded output shows min age = -38.0, which looks like an
# invalid value — confirm and clean it if needed.
X_train[cols].describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,29292.0,29304.0,29304.0,29304.0,29304.0,29291.0
mean,38.553223,189748.8,10.080842,1093.858722,86.744506,40.434229
std,13.628811,105525.0,2.570824,7477.43564,401.518928,12.324036
min,-38.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117789.0,9.0,0.0,0.0,40.0
50%,37.0,178376.5,10.0,0.0,0.0,40.0
75%,48.0,237068.2,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [9]:
# Number of missing values in each numeric column
X_train[cols].isnull().sum()

age               12
fnlwgt             0
education.num      0
capital.gain       0
capital.loss       0
hours.per.week    13
dtype: int64

In [10]:
# Simple missing-value handling: replace every NaN in both feature frames with 0.
# NOTE(review): this also fills the object-typed columns, and a 0 in 'age' or
# 'hours.per.week' is only a placeholder, not a real observation.
X_train, X_test = (frame.fillna(0) for frame in (X_train, X_test))

In [11]:
# Confirm that the numeric training columns have no missing values left
X_train[cols].isnull().sum()

age               0
fnlwgt            0
education.num     0
capital.gain      0
capital.loss      0
hours.per.week    0
dtype: int64

In [12]:
# Confirm that the numeric test columns have no missing values left either
X_test[cols].isnull().sum()

age               0
fnlwgt            0
education.num     0
capital.gain      0
capital.loss      0
hours.per.week    0
dtype: int64

In [13]:
# Encode the target as integers: 1 when income is '>50K', otherwise 0,
# then show the first three encoded values.
y = y_train['income'].eq('>50K').astype(int)
y.iloc[:3]

0    1
1    0
2    0
Name: income, dtype: int32

In [14]:
# Display the numeric feature subset that is passed to the model below
X_train[cols]

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,34.0,177331,10,4386,0,40.0
1,58.0,290661,9,0,0,40.0
2,48.0,125933,10,0,1669,38.0
3,58.0,100313,10,0,1902,40.0
4,41.0,195661,10,0,0,54.0
...,...,...,...,...,...,...
29299,28.0,47168,6,0,0,40.0
29300,44.0,231793,16,0,0,38.0
29301,41.0,201435,9,0,0,40.0
29302,43.0,137722,9,0,0,40.0


# 머신러닝 모델

In [15]:
# 랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Train a random forest on the numeric features and predict the class (0/1)
# for every test row.
# random_state is fixed (the same seed the train/validation split uses later
# in this notebook) so that re-running the cell reproduces the same
# predictions and therefore the same submission file.
rf = RandomForestClassifier(random_state=2022)
rf.fit(X_train[cols], y)
pred = rf.predict(X_test[cols])

In [17]:
# Preview the first 10 predicted labels (the CSV file itself is written in a later cell)
pred[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [18]:
# Number of rows in the test set
len(X_test)

3257

In [19]:
# Build the submission table (test ids next to the predicted labels) and
# save it as CSV without the index column.
submit = pd.DataFrame({'id': X_test['id'], 'income': pred})
submit.to_csv("00000.csv", index=False)

In [20]:
# Evaluation with accuracy (the part an exam taker cannot see)
from sklearn.metrics import accuracy_score

# Encode the true test labels the same way as the training target.
ans = y_test['income'].eq('>50K').astype(int)
accuracy_score(y_true=ans, y_pred=pred)

0.8117899907890697

# 문제 2
# <= 50K -> 0
# > 50K -> 1
# 평가 : roc_auc (예측해야 할 값 : >50K일 확률)

# 검증용 데이터 분리

In [21]:
# Split the training data into a learning part and a validation part
from sklearn.model_selection import train_test_split

# Re-encode the target: 1 when income is '>50K', otherwise 0.
y = y_train['income'].eq('>50K').astype(int)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y,
    test_size=0.1,      # hold out 10% of the rows for validation
    random_state=2022,  # fixed seed so the split is reproducible
)

In [22]:
# Shapes of the train/validation features and labels after the split
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((26373, 15), (2931, 15), (26373,), (2931,))

In [34]:
# Decision tree: fit on the training split and score the validation split
# with ROC-AUC, using the predicted probability of class 1.
# roc_auc_score is imported here because this is the first cell that uses it;
# without the import, a top-to-bottom run of the notebook fails with NameError.
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=2022)  # fixed seed for a reproducible score
dt.fit(X_tr[cols], y_tr)
pred = dt.predict_proba(X_val[cols])
roc_auc_score(y_val, pred[:, 1])

0.7010976176707373

In [35]:
# Random forest: fit on the training split and score the validation split
# with ROC-AUC, using the predicted probability of class 1.
# roc_auc_score is imported in this cell so that it runs on its own and in
# top-to-bottom order (the name is otherwise only imported in a later cell).
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier(random_state=2022)  # fixed seed for a reproducible score
rf.fit(X_tr[cols], y_tr)
pred = rf.predict_proba(X_val[cols])
roc_auc_score(y_val, pred[:, 1])

0.8463908511702504

In [31]:
# XGBoost: fit on the training split and show the predicted probability of
# class 1 for each validation row.
from xgboost import XGBClassifier

xgb = XGBClassifier().fit(X_tr[cols], y_tr)  # fit() returns the fitted estimator
pred = xgb.predict_proba(X_val[cols])
pred[:, 1]

array([0.03814121, 0.00552378, 0.47223645, ..., 0.15650894, 0.03893219,
       0.99036294], dtype=float32)

In [37]:
# Report the XGBoost validation ROC-AUC, then predict on the test data and
# write the submission CSV.
from sklearn.metrics import roc_auc_score

# Score the XGBoost model explicitly instead of reusing whatever `pred` the
# last executed cell left behind, and print the value: a bare expression in
# the middle of a cell is evaluated but never displayed.
val_proba = xgb.predict_proba(X_val[cols])[:, 1]
print("XGBoost validation ROC-AUC:", roc_auc_score(y_val, val_proba))

# The metric is ROC-AUC, so the submission holds the probability of class 1
# (income '>50K') rather than a hard 0/1 label.
pred = xgb.predict_proba(X_test[cols])
submit = pd.DataFrame({'id': X_test['id'], 'income': pred[:, 1]})
submit.to_csv("2222.csv", index=False)