<a href="https://colab.research.google.com/github/mabataki2/AI-Class/blob/main/Week4/Wine_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report


# 1. 데이터 로드 및 병합

red_wine = pd.read_csv('/content/drive/MyDrive/winequality-red.csv', sep=';')
white_wine = pd.read_csv('/content/drive/MyDrive/winequality-white.csv', sep=';')

# 와인 타입 구분을 위한 컬럼 추가 (옵션이지만 좋은 습관)
red_wine['type'] = 0  # 레드 와인
white_wine['type'] = 1 # 화이트 와인


# 두 데이터셋 병합
df_wine = pd.concat([red_wine, white_wine], ignore_index=True)

In [35]:
df_wine.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  type                  6497 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 660.0 KB


In [36]:
# 결측치 확인
wine.isnull().sum()

Unnamed: 0,0
"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality""",0


In [37]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 1 columns):
 #   Column                                                                                                                                                                   Non-Null Count  Dtype 
---  ------                                                                                                                                                                   --------------  ----- 
 0   fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"  6497 non-null   object
dtypes: object(1)
memory usage: 50.9+ KB


In [38]:
# 2. X (특성)와 Y (레이블) 분리
# 'quality'를 목표 변수로 설정 (와인 품질 분류)
X = df_wine.drop('quality', axis=1)
Y = df_wine['quality']

Y

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5
...,...
6492,6
6493,5
6494,6
6495,7


In [39]:
# 3. 이진 분류 문제로 단순화 (품질 7 이상을 '좋음'(1), 미만을 '보통'(0)으로 분류)
Y_binary = (Y >= 7).astype(int)

In [40]:
# 4. 데이터 분리 및 표준화
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y_binary, test_size=0.2, random_state=42, stratify=Y_binary
)

In [41]:
# 표준화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
# 5. 모델 정의 및 학습
models = {
    "Logistic Regression (LR)": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree (DT)": DecisionTreeClassifier(random_state=42),
    "Random Forest (RF)": RandomForestClassifier(random_state=42),
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier()
}
accuracy_results = {}

print("--- 전통 머신러닝 모델 학습 및 평가 ---")

for name, model in models.items():
    model.fit(X_train_scaled, Y_train)
    Y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(Y_test, Y_pred)
    accuracy_results[name] = accuracy
    print(f"[{name}] 정확도: {accuracy:.4f}")


--- 전통 머신러닝 모델 학습 및 평가 ---
[Logistic Regression (LR)] 정확도: 0.8223
[Decision Tree (DT)] 정확도: 0.8538
[Random Forest (RF)] 정확도: 0.8892
[K-Nearest Neighbors (KNN)] 정확도: 0.8323


In [43]:
# 결과 출력
results_df = pd.DataFrame(accuracy_results.items(), columns=['Model', 'Accuracy']).sort_values(by='Accuracy', ascending=False)
print("\n--- 최종 정확도 비교 ---")
print(results_df.to_markdown(index=False))



--- 최종 정확도 비교 ---
| Model                     |   Accuracy |
|:--------------------------|-----------:|
| Random Forest (RF)        |   0.889231 |
| Decision Tree (DT)        |   0.853846 |
| K-Nearest Neighbors (KNN) |   0.832308 |
| Logistic Regression (LR)  |   0.822308 |


In [44]:
# 6. LR 모델의 기울기(계수) 및 절편 출력
print("\n--- 로지스틱 회귀 (LR) 모델 계수/절편 ---")
# 표준화된 특성의 컬럼 이름 가져오기
feature_names = X.columns


--- 로지스틱 회귀 (LR) 모델 계수/절편 ---


In [45]:
# Logistic Regression 모델의 계수(Coefficient)와 절편(Intercept)
lr_coef = models["Logistic Regression (LR)"].coef_[0]
lr_intercept = models["Logistic Regression (LR)"].intercept_[0]

In [46]:
print(f"절편 (Intercept): {lr_intercept:.4f}")
print("계수 (Coefficients):")
for name, coef in zip(feature_names, lr_coef):
    print(f"  {name:25s}: {coef:.4f}")

절편 (Intercept): -1.9095
계수 (Coefficients):
  fixed acidity            : 0.6153
  volatile acidity         : -0.5977
  citric acid              : -0.0065
  residual sugar           : 1.0381
  chlorides                : -0.2397
  free sulfur dioxide      : 0.2132
  total sulfur dioxide     : -0.2255
  density                  : -1.2249
  pH                       : 0.3912
  sulphates                : 0.3609
  alcohol                  : 0.5700
  type                     : -0.2954


In [47]:
# RF의 성능이 제일 좋았음.
# 로지스틱 회귀(LR)이 제일 결과가 좋지 않았음.
# 계수를 확인해보면 density, residual sugar, volatile acidity 가
# 와인 품질에 관여하는 바가 제일 큼을 확인할 수 있음.