<a href="https://colab.research.google.com/github/minsoojo/MLprogramming/blob/main/3week/winequality_white.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# -----------------------------
# 1) 데이터 불러오기
# -----------------------------
file_path = "/content/winequality-white.csv"
df = pd.read_csv(file_path, sep=";")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [35]:
df.count()

Unnamed: 0,0
fixed acidity,4898
volatile acidity,4898
citric acid,4898
residual sugar,4898
chlorides,4898
free sulfur dioxide,4898
total sulfur dioxide,4898
density,4898
pH,4898
sulphates,4898


In [36]:
df.isnull().sum()

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


In [37]:
df['quality'].value_counts()

Unnamed: 0_level_0,count
quality,Unnamed: 1_level_1
6,2198
5,1457
7,880
8,175
4,163
3,20
9,5


In [38]:
# -----------------------------
# 2) quality를 이진 분류로 변환
# -----------------------------
df["good_quality"] = (df["quality"] >= 6).astype(int)  # 6 이상이면 1, 아니면 0

X = df.drop(["quality", "good_quality"], axis=1)  # 독립변수
y = df["good_quality"]                           # 종속변수 (이진)

In [39]:
# -----------------------------
# 3) 학습용/테스트용 데이터 분리
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 4) 로지스틱 회귀 모델 학습
# -----------------------------
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# -----------------------------
# 5) 절편과 계수
# -----------------------------
print("절편(intercept):", log_model.intercept_[0])
print("\n계수(coefficients):")
for feature, coef in zip(X.columns, log_model.coef_[0]):
    print(f"{feature:20s}: {coef:.6f}")

# -----------------------------
# 6) 예측 및 평가
# -----------------------------
y_pred = log_model.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

절편(intercept): -8.816450724533265

계수(coefficients):
fixed acidity       : -0.184172
volatile acidity    : -5.549194
citric acid         : 0.100565
residual sugar      : 0.062803
chlorides           : 0.466298
free sulfur dioxide : 0.015313
total sulfur dioxide: -0.003138
density             : 0.125700
pH                  : 0.130661
sulphates           : 1.130984
alcohol             : 1.035293

Accuracy: 0.7377551020408163

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.49      0.55       321
           1       0.77      0.86      0.82       659

    accuracy                           0.74       980
   macro avg       0.70      0.67      0.68       980
weighted avg       0.73      0.74      0.73       980



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---

***

***

In [40]:
# quility를 이진분류하지 않고 각각 클래스로 로지스틱
# -----------------------------
# 2) X, y 분리
# -----------------------------
X = df.drop("quality", axis=1)   # 독립변수
y = df["quality"]                # 종속변수 (다중 클래스)

# -----------------------------
# 3) 학습용/테스트용 데이터 분리
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 4) 로지스틱 회귀 모델 학습
# -----------------------------
log_model = LogisticRegression(max_iter=1000, multi_class="multinomial", solver="lbfgs")
log_model.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---

In [41]:
# -----------------------------
# 5) 예측 및 평가
# -----------------------------
y_pred = log_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("----------------------------------------------------------------------")
print("절편(intercept):")
print(log_model.intercept_)   # 클래스별 절편 (다중 클래스이므로 여러 개)
#클래스 개수 만큼 나옴

print("\n계수(coefficients):")
print(log_model.coef_)        # 클래스별 계수 (각 특성마다)
#특성 개수 만큼 나옴
print("----------------------------------------------------------------------")

Accuracy: 0.7448979591836735

Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00        25
           5       0.91      1.00      0.95       291
           6       0.68      0.96      0.80       432
           7       0.47      0.14      0.21       192
           8       0.00      0.00      0.00        35

    accuracy                           0.74       980
   macro avg       0.34      0.35      0.33       980
weighted avg       0.66      0.74      0.68       980

----------------------------------------------------------------------
절편(intercept):
[-0.07636624  0.14001066  0.4658871   0.44958895 -0.51214364 -0.40006974
 -0.06690708]

계수(coefficients):
[[ 6.16215435e-01  1.07414682e-01  3.98154237e-03 -5.53842869e-02
   9.01092377e-03  2.81092871e-02 -4.53769418e-03 -7.07605333e-02
  -1.89817531e-01 -9.18357677e-02 -3.94295425e-01 -9.86162530e-01]
 [ 1.00180780

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




---



In [42]:
# 각 피처를 x로 각각 선형회귀

# -----------------------------
# 2) X, y 분리
# -----------------------------
X = df.drop("quality", axis=1)   # 독립변수 (특징)
y = df["quality"]                # 종속변수 (타깃)

# -----------------------------
# 3) 학습용 / 테스트용 데이터 분리
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 4) 선형회귀 모델 학습
# -----------------------------
model = LinearRegression()
model.fit(X_train, y_train)


In [43]:

# -----------------------------
# 5) 예측 및 평가
# -----------------------------
y_pred = model.predict(X_test)

print("MSE :", mean_squared_error(y_test, y_pred))
print("R^2 Score :", r2_score(y_test, y_pred))



MSE : 0.25589857306498576
R^2 Score : 0.6695836677785918


In [44]:
# -----------------------------
# 5) 절편과 회귀계수 출력
# -----------------------------
print("절편(intercept):", model.intercept_)
print("\n회귀계수(coefficients):")
for feature, coef in zip(X.columns, model.coef_):
    print(f"{feature:20s}: {coef:.6f}")

절편(intercept): 90.3068047627733

회귀계수(coefficients):
fixed acidity       : 0.043994
volatile acidity    : -0.330942
citric acid         : -0.003552
residual sugar      : 0.042009
chlorides           : -0.265818
free sulfur dioxide : 0.002013
total sulfur dioxide: 0.000326
density             : -88.118650
pH                  : 0.363063
sulphates           : 0.311449
alcohol             : 0.035851
good_quality        : 1.342727
