## 지도학습 - 분류형
- target 기준 이항, 다항으로 나눔

In [1]:
import pandas as pd
# Load the Titanic training set and preview its first two rows.
df_TFD = pd.read_csv("TitanicFromDisaster_train.csv")
df_TFD.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [2]:
# List the column labels of the loaded frame.
df_TFD.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## 데이터
- 목표변수(target) : 'Survived'
- 설명변수 (feature) : 'Pclass','Age' ('Fare'는 아래 코드에서 사용하지 않음)
- 모델링 하기 전에 범주형 데이터의 값이 문자일 경우 숫자로 변환을 해줘야 한다.

In [3]:
# Distinct values of the target column; the output below shows a binary 0/1 label.
df_TFD['Survived'].unique()

array([0, 1])

In [4]:
# Keep only the target and the two candidate features, dropping rows with any missing value.
df_TFD_extract = df_TFD[['Survived', 'Pclass', 'Age']].dropna()
# Confirm that no missing values remain in any column.
df_TFD_extract.isna().sum()

Survived    0
Pclass      0
Age         0
dtype: int64

In [5]:
# Preview the first two rows of the extracted frame.
df_TFD_extract.head(2)

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0


## 특성공학(Feature Engineering)
- 수치 평준화 : 수치형과 범주형 각각 적용
- 개수 균형화 : target 수량 균형화

### Scaling 수치형에 적용
- Standard Scaling : 평균 0, 표준편차 1
- Min-Max Scaling : 0-1 사이
- Robust Scaling : 이상치가 많은 데이터 셋(중앙값 기준)

In [6]:
from sklearn.preprocessing import MinMaxScaler
# Fit a min-max scaler on Age. A Series is 1-D, so it is reshaped into a single
# column; a DataFrame could be passed as-is.
minMaxScaler = MinMaxScaler()
minMaxScaler.fit(df_TFD_extract['Age'].to_numpy().reshape(-1, 1))

In [7]:
# Store Age rescaled by the fitted scaler as a new column.
df_TFD_extract['Age_scaler'] = minMaxScaler.transform(
    df_TFD_extract['Age'].to_numpy().reshape(-1, 1)
)
# Summary statistics: after min-max scaling, min and max should be 0 and 1.
df_TFD_extract['Age_scaler'].describe()

count    714.000000
mean       0.367921
std        0.182540
min        0.000000
25%        0.247612
50%        0.346569
75%        0.472229
max        1.000000
Name: Age_scaler, dtype: float64

### Onehot encoding : 범주형에 적용

In [8]:
# Distinct passenger classes present in the extracted data (categorical feature to encode).
df_TFD_extract['Pclass'].unique()

array([3, 1, 2])

In [9]:
from sklearn.preprocessing import OneHotEncoder
# Fit a one-hot encoder on the Pclass column, passed as a one-column DataFrame.
oneHotEncoder_pclass = OneHotEncoder()
oneHotEncoder_pclass.fit(df_TFD_extract.loc[:, ['Pclass']])

In [10]:
# Categories learned by the encoder during fit.
oneHotEncoder_pclass.categories_

[array([1, 2, 3])]

In [11]:
# Names of the columns the encoder will produce, one per learned category.
oneHotEncoder_pclass.get_feature_names_out()

array(['Pclass_1', 'Pclass_2', 'Pclass_3'], dtype=object)

In [12]:
# Encode Pclass for every row and convert the encoder output to a dense array.
encoder_pclass = oneHotEncoder_pclass.transform(
    df_TFD_extract.loc[:, ['Pclass']]
).toarray()
encoder_pclass.shape

(714, 3)

In [13]:
# Wrap the encoded array in a DataFrame, labelling columns with the encoder's feature names.
df_encoder_pclass = pd.DataFrame(
    encoder_pclass,
    columns=oneHotEncoder_pclass.get_feature_names_out(),
)
df_encoder_pclass.head(2)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0.0,0.0,1.0
1,1.0,0.0,0.0


In [14]:
# Concatenate column-wise (axis=1). Both indexes are reset first so that rows
# are aligned by position rather than by their original index labels.
df_TFD_extract = pd.concat(
    [
        df_TFD_extract.reset_index(drop=True),
        df_encoder_pclass.reset_index(drop=True),
    ],
    axis=1,
)
df_TFD_extract.shape

(714, 7)

### 정형화 단계 - target과 feature 분리

In [15]:
# Separate the label from the model inputs. The raw Age and Pclass columns are
# dropped, leaving the scaled Age and the one-hot Pclass columns as features.
target_train = df_TFD_extract['Survived']
feature_train = df_TFD_extract.drop(['Survived', 'Age', 'Pclass'], axis=1)
target_train.shape, feature_train.shape

((714,), (714, 4))

In [16]:
# feature_train['Pclass'].unique() # onehot encoding(특성 공학) 적용 필요

In [17]:
# feature_train['Age'].unique()

## 모델

In [18]:
from sklearn.linear_model import LogisticRegression
# Create a logistic regression classifier with no arguments overridden.
model = LogisticRegression()
# Display the estimator (not yet fitted at this point).
model

In [19]:
# Train the classifier on the engineered features against the Survived label.
model.fit(feature_train,target_train)

In [20]:
# Learned weights, in feature_train column order (Age_scaler, Pclass_1, Pclass_2, Pclass_3),
# followed by the intercept. With the values printed below:
# logit(p) = -2.5517*Age_scaler + 1.0852*Pclass_1 + 0.0785*Pclass_2 - 1.1637*Pclass_3 + 0.7784
model.coef_,model.intercept_

(array([[-2.55171959,  1.08518672,  0.07852919, -1.16373903]]),
 array([0.77842704]))

## 평가

In [27]:
# Show the row at position 10 with all of its raw and engineered columns.
df_TFD_extract.iloc[10:11]

Unnamed: 0,Survived,Pclass,Age,Age_scaler,Pclass_1,Pclass_2,Pclass_3
10,1,1,58.0,0.723549,1.0,0.0,0.0


In [30]:
# Feature rows at positions 10-13, used as the prediction sample below.
feature_train.iloc[10:14]

Unnamed: 0,Age_scaler,Pclass_1,Pclass_2,Pclass_3
10,0.723549,1.0,0.0,0.0
11,0.246042,0.0,0.0,1.0
12,0.484795,0.0,0.0,1.0
13,0.170646,0.0,0.0,1.0


In [22]:
# Predict labels for the four sample rows.
# Actual labels (per the original note): 1, 0, 0, 0 / predicted (output below): 1, 0, 0, 0
model.predict(feature_train.iloc[10:14])

array([1, 0, 0, 0])

In [23]:
# Per-row class probabilities for the same four sample rows, instead of hard labels.
model.predict_proba(feature_train.iloc[10:14])

array([[0.49566997, 0.50433003],
       [0.73363449, 0.26636551],
       [0.83512218, 0.16487782],
       [0.69439604, 0.30560396]])

### 평가 수치
- 정확도
- F1 score (정밀도, 재현율)

In [24]:
# Accuracy
from sklearn.metrics import accuracy_score
# Predict on the same rows the model was fitted on, so the score below is a
# training-set accuracy rather than a held-out estimate.
target_train_predict = model.predict(feature_train)
# Sanity check: predictions and labels must have the same length.
target_train_predict.shape, target_train.shape

((714,), (714,))

In [25]:
# Fraction of training rows whose predicted label equals the true label.
accuracy_score(target_train, target_train_predict)

0.6946778711484594

### F1 score
- 정밀도, 재현율 : F1

In [26]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the training-set predictions.
print(classification_report(target_train,target_train_predict))

              precision    recall  f1-score   support

           0       0.72      0.80      0.76       424
           1       0.65      0.54      0.59       290

    accuracy                           0.69       714
   macro avg       0.68      0.67      0.67       714
weighted avg       0.69      0.69      0.69       714



## 서비스

In [31]:
# Example passenger to score with the trained model.
input_age = 22
input_pclass = 2
# Expected layout of one model input row (Age_scaler, Pclass_1, Pclass_2, Pclass_3),
# e.g. [[0.723549, 1.0, 0.0, 0.0]]

In [40]:
# Scale the raw age with the already-fitted scaler; transform expects 2-D input,
# hence the nested list.
result_age = minMaxScaler.transform([[input_age]])
result_age

array([[0.27117366]])

In [41]:
# One-hot encode the requested class (transform expects 2-D input). The encoder was
# fitted on a DataFrame, so the single value is wrapped in a one-row frame carrying
# the fitted column name(s); passing a bare nested list makes scikit-learn warn that
# the input does not have valid feature names.
result_pclass = oneHotEncoder_pclass.transform(
    pd.DataFrame([[input_pclass]], columns=oneHotEncoder_pclass.feature_names_in_)
).toarray()
result_pclass



array([[0., 1., 0.]])

In [43]:
# concate
import numpy as np
# Join the scaled age and the one-hot class side by side into a single feature row.
inputs = np.hstack((result_age, result_pclass))
inputs

array([[0.27117366, 0.        , 1.        , 0.        ]])

In [44]:
# Predict survival for the assembled row. The model was fitted on a DataFrame, so the
# array is labelled with the fitted feature names before predicting; an unlabeled
# array makes scikit-learn warn that the input does not have valid feature names.
model.predict(pd.DataFrame(inputs, columns=model.feature_names_in_))



array([1])