# 분류기 만들기

 - 규칙 : 성별(sex) = 1 생존하지 않은 것으로 분류 

In [9]:
from sklearn.base import BaseEstimator
import numpy as np

class MyDummyClassifier(BaseEstimator):
    def fit(self, X, Y):
        pass
    def predict(self, X):
        pred = np.zeros((X.shape[0],1)) 
        for i in range(X.shape[0]): # i는 0,1,2,3,4...x만큼 나옴 
            if X['Sex'].iloc[i] == 1: # i번째 값이 남자면 (사망), 여자면(생존)
                pred[i]=0  
            else:
                pred[i]=1
        return pred

In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

ti_df = pd.read_csv('data/titanic.csv')
# ti_df.head(1)

y_ti_df = ti_df['Survived']
x_ti_df = ti_df.drop('Survived', axis=1)

# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

# 레이블 인코딩 수행 함수
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# 앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df) 
    df = drop_features(df)
    df = format_features(df)
    return df

In [11]:
x_ti_df = transform_features(x_ti_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_ti_df,
                                                    y_ti_df,
                                                    test_size = 0.2,
                                                    random_state = 0
                                                        )

In [13]:
myclf = MyDummyClassifier()
myclf.fit(x_train,y_train)

In [14]:
from sklearn.metrics import accuracy_score

my_pred = myclf.predict(x_test)
accuracy_score(y_test,my_pred)

0.7877094972067039

# 혼동 행렬(Confusion matrix)

In [15]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, my_pred)

array([[92, 18],
       [20, 49]])

In [16]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, my_pred), recall_score(y_test,my_pred)


(np.float64(0.7313432835820896), np.float64(0.7101449275362319))

# 로지스틱회귀, 랜덤포레스트, KNN의 정밀도, 재현율 비교하기

In [17]:
# df = pd.read_csv('./data/titanic.csv')

# X = df[['Pclass', 'SibSp']]
# y = df['Survived']

# # 훈련, 테스트 분리
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y,                 # 데이터프레임 기준
#     test_size=0.2,        # 테스트셋 비율 (20%)
#     random_state=42       # 재현 가능성 유지
# )

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

ti_model = LogisticRegression(max_iter=3000)
ti_model.fit(X_train,y_train)
ti_pred = ti_model.predict(X_test)


In [52]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(confusion)
    print('',20)
    print(accuracy, precision, recall)
    
models_sc =  {
        'KNN':       KNeighborsClassifier(n_neighbors=5),
        'RF' :       RandomForestClassifier(),
        'LR' :       LogisticRegression(max_iter=3000)
        }


for i,n in models_sc.items():
    n.fit(x_train,y_train)
    pred = n.predict(x_test)
    get_clf_eval(y_test, pred)


[[94 16]
 [31 38]]
 20
0.7374301675977654 0.7037037037037037 0.5507246376811594
[[100  10]
 [ 23  46]]
 20
0.8156424581005587 0.8214285714285714 0.6666666666666666
[[92 18]
 [16 53]]
 20
0.8100558659217877 0.7464788732394366 0.7681159420289855


In [20]:
get_clf_eval(y_test,my_pred)

[[92 18]
 [20 49]]
 20
0.7877094972067039 0.7313432835820896 0.7101449275362319


In [21]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(x_train, y_train)

pred_proba =  lr_clf.predict_proba(x_test)
pos_proba = pred_proba[:,1] #양성클래스일 확률

threshold = 0.4 #임계치
custom_proba = (pos_proba>=threshold).astype(int) #임계치보다 크면 1
confusion_matrix(y_test, custom_proba)
get_clf_eval(y_test, custom_proba)

[[86 24]
 [13 56]]
 20
0.7932960893854749 0.7 0.8115942028985508


# 정밀도와 재현율의 변화

정밀도와 재현율의 불균형이 심할 때,
혹은 비지니스 요구사항이 있을 때
임계치를 조정해야 한다

임계치를 낮추면, 정밀도는 낮아지고, 재현율은 올라간다



In [2]:
from sklearn.metrics import f1_score, classification_report
f1_score(y_test,pred) #정밀도와 재현율의 평균

NameError: name 'y_test' is not defined

In [36]:
print(classification_report(y_test, custom_proba))

              precision    recall  f1-score   support

           0       0.87      0.78      0.82       110
           1       0.70      0.81      0.75        69

    accuracy                           0.79       179
   macro avg       0.78      0.80      0.79       179
weighted avg       0.80      0.79      0.80       179



In [30]:
import pandas as pd
pd.Series(lr_clf.coef_[0]).sort_values()

1   -2.593416
0   -0.901628
3   -0.368137
7   -0.107352
4   -0.059052
6   -0.058762
2   -0.042756
5    0.001286
dtype: float64

# 복습

In [26]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
iris_df = pd.DataFrame(data = iris.data, columns= iris.feature_names)
iris_df['label'] = iris.target
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [13]:
from sklearn.model_selection import train_test_split

ir_xtrain, ir_xtest, ir_ytrain, ir_ytest = train_test_split(iris.data,
                                                            iris.target,
                                                            test_size=0.2,
                                                            random_state= 30)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

ir_model = LogisticRegression()
ir_model.fit(ir_xtrain, ir_ytrain)
ir_pred = ir_model.predict(ir_xtest)
acc = accuracy_score(ir_ytest, ir_pred)
print(f'acc = {acc}')

acc = 0.9666666666666667


In [37]:
model_zip = {
             'knn'  : KNeighborsClassifier(n_neighbors=5),
             'RF'   : RandomForestClassifier(),
             'LR'   : LogisticRegression(max_iter=3000)
}

for name, model in model_zip.items():
    model.fit(ir_xtrain,ir_ytrain)
    zi_pred = model.predict(ir_xtest)
    acc = accuracy_score(ir_ytest, zi_pred)
    print(f'{name}모델의 정확도 :{acc}')
    

knn모델의 정확도 :0.9333333333333333
RF모델의 정확도 :0.9333333333333333
LR모델의 정확도 :0.9666666666666667


In [39]:
# 스케일링

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(ir_xtrain)
ir_trscaled = scaler.transform(ir_xtrain)
ir_txscaled = scaler.transform(ir_xtest)


In [40]:
model_zip = {
             'knn'  : KNeighborsClassifier(n_neighbors=5),
             'RF'   : RandomForestClassifier(),
             'LR'   : LogisticRegression(max_iter=3000)
}

for name, model in model_zip.items():
    model.fit(ir_scaled,ir_ytrain)
    sc_zi_pred = model.predict(ir_txscaled)
    acc = accuracy_score(ir_ytest, sc_zi_pred)
    print(f'{name}모델의 정확도 :{acc}')

knn모델의 정확도 :0.9333333333333333
RF모델의 정확도 :0.9333333333333333
LR모델의 정확도 :0.9333333333333333
