In [13]:
import pandas as pd
from sklearn.datasets import make_classification

# Synthetic two-feature binary classification data with a heavy class
# imbalance: weights=[0.99] puts about 99% of the samples in class 0, and
# flip_y=0 means no labels are randomly reassigned.
X, y = make_classification(
    n_samples=10000, n_features=2,
    n_redundant=0, n_clusters_per_class=1,
    weights=[0.99], flip_y=0,
    random_state=1,
)

# Assemble features and label into one frame with columns a, b, y.
dfX = pd.DataFrame(X, columns=["a", "b"])
dfy = pd.DataFrame({"y": y})
df = dfX.join(dfy)
df

Unnamed: 0,a,b,y
0,0.222014,0.540207,0
1,1.347439,1.412824,0
2,0.537238,0.372730,0
3,2.134462,1.404819,0
4,2.315827,1.356858,0
...,...,...,...
9995,2.440385,1.695643,0
9996,-0.790502,0.194243,0
9997,1.878130,0.829500,0
9998,2.585933,1.927995,0


In [14]:
# Feature matrix (independent variables) and target column.
X1 = df.loc[:, ["a", "b"]]
y1 = df.loc[:, "y"]
# Class counts of the target: this is the imbalanced dataset.
y1.value_counts()

y
0    9900
1     100
Name: count, dtype: int64

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; stratifying on the label keeps
# the class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    stratify=y1,
    random_state=10,
)

In [16]:
# Model fitted on the imbalanced dataset.
model1 = LogisticRegression(random_state=0).fit(X_train, y_train)

# Score on the training split and on the hold-out split
# (the printed labels mean "train" and "validation").
print("학습용: ", model1.score(X_train, y_train))
print("검증용: ", model1.score(X_test, y_test))

학습용:  0.99425
검증용:  0.995


In [17]:
from sklearn.metrics import confusion_matrix

# Predict the hold-out split, then tabulate true vs. predicted labels
# (rows: true class, columns: predicted class).
pred1 = model1.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=pred1)

array([[1980,    0],
       [  10,   10]], dtype=int64)

In [18]:
from sklearn.metrics import classification_report

# Check precision, recall and f1-score per class, paying attention to the minority class (1).
print(classification_report(y_test, pred1))

# The test score is high (0.995), but recall for the minority class is only 10 / (10 + 10) = 0.5.
# In other words, overall accuracy looks good while half of the actual minority-class samples are missed.

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1980
           1       1.00      0.50      0.67        20

    accuracy                           0.99      2000
   macro avg       1.00      0.75      0.83      2000
weighted avg       1.00      0.99      0.99      2000



In [19]:
# Balanced data: no `weights` argument is passed here, and the class counts
# displayed at the end of the cell come out even.

X, y = make_classification(
    n_samples=10000, n_features=2,
    n_redundant=0, n_clusters_per_class=1,
    flip_y=0, random_state=1,
)

# Assemble features and label into one frame with columns a, b, y.
dfX = pd.DataFrame(X, columns=["a", "b"])
dfy = pd.DataFrame({"y": y})
df2 = dfX.join(dfy)

df2["y"].value_counts()

y
0    5000
1    5000
Name: count, dtype: int64

In [20]:
# Feature matrix (independent variables) and target of the balanced frame.
X2 = df2.loc[:, ["a", "b"]]
y2 = df2.loc[:, "y"]

In [21]:
# Stratified 80/20 split of the balanced data. This rebinds X_train, X_test,
# y_train and y_test, so from here on they refer to the balanced dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    stratify=y2,
    random_state=10,
)

In [22]:
# Model fitted on the balanced dataset.
model2 = LogisticRegression(random_state=42).fit(X_train, y_train)

# Score on the training split and on the hold-out split
# (the printed labels mean "train" and "validation").
print("학습용: ", model2.score(X_train, y_train))
print("검증용: ", model2.score(X_test, y_test))

학습용:  0.896125
검증용:  0.891


In [23]:
pred2 = model2.predict(X_test)
print(classification_report(y_test, pred2))
# With balanced classes, accuracy (0.89) and per-class recall (0.91 / 0.87) end up at a similar level.

              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1000
           1       0.90      0.87      0.89      1000

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000

