In [31]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [32]:
df = pd.read_csv("/content/drive/MyDrive/car_evaluation.csv", header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [33]:
df.columns = ['price', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.head()

Unnamed: 0,price,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [34]:
# 결측치 확인
df.isnull().sum()

Unnamed: 0,0
price,0
maint,0
doors,0
persons,0
lug_boot,0
safety,0
class,0


In [35]:
# 데이터 레이블링
encoder = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:  # 모든 column 숫자로 변환
    df[col] = encoder.fit_transform(df[col])
print(df.head())

   price  maint  doors  persons  lug_boot  safety  class
0      3      3      0        0         2       1      2
1      3      3      0        0         2       2      2
2      3      3      0        0         2       0      2
3      3      3      0        0         1       1      2
4      3      3      0        0         1       2      2


In [36]:
# 레이블 확인
df['class'].value_counts()
# 2 = unacc
# 0 = acc
# 1 = good
# 3 = vgood

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
2,1210
0,384
1,69
3,65


In [37]:
# X_train, y_train, X_test, y_test 분리
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
# X = df.drop['class'].values
# y = df['class'].values

In [38]:
y

array([2, 2, 2, ..., 2, 1, 3])

In [39]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
# DT 모델 학습
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# 예측 및 평가
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cross_val_acc = cross_val_score(dt_model, X, y, cv=5).mean() # 교차 검증
print(f"Decision Tree Accuracy: {accuracy:.4f}")
print(f"Cross-Validation Accuracy: {cross_val_acc:.4f}")
cm = confusion_matrix(y_test, y_pred)
print(f"{name} Confusion Matrix:\n{cm}\n")

Decision Tree Accuracy: 0.9740
Cross-Validation Accuracy: 0.7946
K-Nearest Neighbors Confusion Matrix:
[[ 76   6   1   0]
 [  1  10   0   0]
 [  0   0 235   0]
 [  1   0   0  16]]



In [42]:
# RF 모델 학습
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 예측 및 평가
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cross_val_acc = cross_val_score(rf_model, X, y, cv=5).mean()
print(f"Random Forest Accuracy: {accuracy:.4f}")
print(f"Cross-Validation Accuracy: {cross_val_acc:.4f}")
cm = confusion_matrix(y_test, y_pred)
print(f"{name} Confusion Matrix:\n{cm}\n")

Random Forest Accuracy: 0.9769
Cross-Validation Accuracy: 0.8311
K-Nearest Neighbors Confusion Matrix:
[[ 76   7   0   0]
 [  0  11   0   0]
 [  0   0 235   0]
 [  1   0   0  16]]



In [44]:
# SVM 모델 학습
svm_model = SVC()
svm_model.fit(X_train, y_train)

# 예측 및 평가
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cross_val_acc = cross_val_score(svm_model, X, y, cv=5).mean()
print(f"Random Forest Accuracy: {accuracy:.4f}")
print(f"Cross-Validation Accuracy: {cross_val_acc:.4f}")
cm = confusion_matrix(y_test, y_pred)
print(f"{name} Confusion Matrix:\n{cm}\n")

Random Forest Accuracy: 0.8988
Cross-Validation Accuracy: 0.8033
K-Nearest Neighbors Confusion Matrix:
[[ 67   5  11   0]
 [  6   4   0   1]
 [ 10   0 225   0]
 [  2   0   0  15]]



In [45]:
# LR 모델 학습
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train, y_train)

# 예측 및 평가
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cross_val_acc = cross_val_score(lr_model, X, y, cv=5).mean()
print(f"Random Forest Accuracy: {accuracy:.4f}")
print(f"Cross-Validation Accuracy: {cross_val_acc:.4f}")
cm = confusion_matrix(y_test, y_pred)
print(f"{name} Confusion Matrix:\n{cm}\n")

Random Forest Accuracy: 0.8988
Cross-Validation Accuracy: 0.6493
K-Nearest Neighbors Confusion Matrix:
[[ 67   5  11   0]
 [  6   4   0   1]
 [ 10   0 225   0]
 [  2   0   0  15]]



In [46]:
# KNN 모델 학습 (k=3 사용)
k = 3
knn_model = KNeighborsClassifier(n_neighbors=k)
knn_model.fit(X_train, y_train)

# 예측 및 평가
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cross_val_acc = cross_val_score(knn_model, X, y, cv=5).mean()
print(f'K-Nearest Neighbors Accuracy: {accuracy:.4f}')
print(f"Cross-Validation Accuracy: {cross_val_acc:.4f}")
cm = confusion_matrix(y_test, y_pred)
print(f"{name} Confusion Matrix:\n{cm}\n")

K-Nearest Neighbors Accuracy: 0.9191
Cross-Validation Accuracy: 0.8167
K-Nearest Neighbors Confusion Matrix:
[[ 73   4   6   0]
 [  4   7   0   0]
 [ 12   0 223   0]
 [  2   0   0  15]]

