## 분류기 만들기

타이타닉 데이터의 생존여부 분류
 - 규칙: 성별(sex)=1 생존하지 않은 것으로 분류

In [13]:
from sklearn.base import BaseEstimator
import numpy as np

class MyDummyCLassifier(BaseEstimator):
    def fit(self, x,y):
        pass
    
    
    def predict(self,x):
        pred=np.zeros((x.shape[0],1))
        for i in range(x.shape[0]):
            if x['Sex'].iloc[i]==1:
                pred[i]=0
            else:
                pred[i]=1
        return pred

### 타이타닉 데이터 가져오기

In [14]:
import pandas as pd
titanic_df=pd.read_csv('./data/titanic.csv')
titanic_df.head()



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
y_titanic_df=titanic_df['Survived']
x_titanic_df=titanic_df.drop('Survived',axis=1) #titanic_df에서 'Survived' 컬럼만 뺀 나머지 모든 컬럼

In [19]:
from sklearn.preprocessing import LabelEncoder

#Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

#머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

#레이블 인코딩 수행 함수
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

#앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df) 
    df = drop_features(df)
    df = format_features(df)
    return df

In [20]:
x_titanic_df=transform_features(x_titanic_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [39]:
#데이터셋 분할
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_titanic_df,
                                               y_titanic_df,
                                               test_size=0.2,
                                               random_state=11)

In [37]:
myclf=MyDummyCLassifier()
myclf.fit(x_train,y_train)

In [38]:
from sklearn.metrics import accuracy_score

my_pred=myclf.predict(x_test)
accuracy_score(y_test,my_pred)

0.8212290502793296

In [40]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,my_pred)

array([[71, 47],
       [37, 24]])

In [41]:
from sklearn.metrics import precision_score,recall_score
precision_score(y_test,my_pred),recall_score(y_test,my_pred)

(np.float64(0.3380281690140845), np.float64(0.39344262295081966))

### 로지스틱회귀, 랜덤포레스트,KNN의 정밀도, 재현율 비교하기

In [58]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(confusion)
    print('*',20)
    print(accuracy, precision, recall)

In [59]:
from sklearn.linear_model import LogisticRegression

lr_clf=LogisticRegression(max_iter=2000)
lr_clf.fit(x_train,y_train)
pred=lr_clf.predict(x_test)

#정확도,정밀도,재현율
get_clf_eval(y_test,pred)

[[104  14]
 [ 13  48]]
* 20
0.8491620111731844 0.7741935483870968 0.7868852459016393


In [71]:
pred_proba=lr_clf.predict_proba(x_test)
pos_proba=pred_proba[:,1] #양성클래스일 확률

threshold=0.4 #임계치
custom_proba=(pos_proba>threshold).astype(int) #임계치보다 크면 1 
confusion_matrix(y_test,custom_proba)
get_clf_eval(y_test, custom_proba)

[[98 20]
 [10 51]]
* 20
0.8324022346368715 0.7183098591549296 0.8360655737704918


In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score

# 랜덤포레스트
rf_clf = RandomForestClassifier()
rf_clf.fit(x_train, y_train)
rf_pred = rf_clf.predict(x_test)

print("RandomForest 정밀도:", precision_score(y_test, rf_pred))
print("RandomForest 재현율:", recall_score(y_test, rf_pred))

# KNN
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train, y_train)
knn_pred = knn_clf.predict(x_test)

print("KNN 정밀도:", precision_score(y_test, knn_pred))
print("KNN 재현율:", recall_score(y_test, knn_pred))

RandomForest 정밀도: 0.7931034482758621
RandomForest 재현율: 0.7540983606557377
KNN 정밀도: 0.66
KNN 재현율: 0.5409836065573771


# 정밀도와 재현율의 변화

정밀도와 재현율의 불균형이 심할 때,  
혹은 비즈니스의 요구사항이 있을 때  
임계치를 조정해야 한다.

임계치를 낮추면, 정밀도는 낮아지고, 재현율은 올라간다  

In [60]:
from sklearn.metrics import f1_score
f1_score(y_test,pred)

np.float64(0.7804878048780488)

In [61]:
from sklearn.metrics import f1_score,classification_report
f1_score(y_test,pred) #정밀도와 재현율의 평균

np.float64(0.7804878048780488)

In [63]:
print(classification_report(y_test,pred)) #평가보고서

              precision    recall  f1-score   support

           0       0.89      0.88      0.89       118
           1       0.77      0.79      0.78        61

    accuracy                           0.85       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.85      0.85      0.85       179



In [65]:
import pandas as pd
pd.Series(lr_clf.coef_[0]).sort_values() # 피처의 중요도는 계수

1   -2.499594
0   -0.897322
3   -0.278152
7   -0.109094
4   -0.090091
6   -0.089830
2   -0.034793
5    0.000574
dtype: float64