# 붓꽃 분류(Naive Bayes)
- 통계적 분류기
- 주어진 데이터가 특정 클래스에 속하는지를 조건부확률을 통해서 예측
- 텍스트데이터처럼 희소한 고차원인 경우 높은 정확도와 속도 제공
- 적용분야: 스팸메일분류, 문서주제 분류, 네트워크 침입자 분류, 추천 시스템

In [2]:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

## 데이터 로딩 및 분할

In [3]:
x,y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=10,stratify=y)

## 모델 생성

In [4]:
model = GaussianNB()
model.fit(x_train,y_train)

## 모델 평가

In [5]:
y_hat = model.predict(x_test)
print(f'정확도:{accuracy_score(y_test,y_hat):.3f}')

정확도:1.000


In [6]:
print(confusion_matrix(y_test,y_hat))

[[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]


# [문제] 독버섯 분류하기

In [32]:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, classification_report, roc_auc_score

import pandas as pd
import numpy as np

df = pd.read_csv("mushrooms.csv")
pd.get_dummies(df['type'])

columns_to_encode = list(df.columns)

label_encoders = {}
for column in columns_to_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])#각 컬럼의 고유한 값들을 정수형으로 인코딩하는데 사용
    label_encoders[column] = le

# 라벨 인코딩 한줄 df = df.apply(lambda col:LabelEncoder().fit_transform(col))



x = df.drop('type', axis=1)
y = df['type']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=10,stratify=y)

model = MultinomialNB()
model.fit(x_train,y_train)
y_hat = model.predict(x_test)

## 모델평가

In [31]:
cm = confusion_matrix(y_test,y_hat)
print(f'정확도:{accuracy_score(y_test,y_hat)}')
print(f'AUC:{roc_auc_score(y_test,model.predict_proba(x_test)[:,1]):.3f}')

정확도:0.8356923076923077
AUC:0.902


## 데이터 분리 및 인코딩

### 라벨 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder
df = df.apply(lambda col:LabelEncoder().fit_transform(col))

### 원-핫 인코딩

In [34]:
x= df.drop('type',axis=1)
x = pd.get_dummies(x)
y = df['type']
y= y.map({'edible':0,'poisonous':1})

In [35]:
print(x.shape)

(8124, 22)


In [36]:
display(x.head())

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,2,3,0,1,7,1,0,1,0,0,...,3,7,7,0,2,1,4,0,3,4
1,2,3,9,1,0,1,0,0,0,0,...,3,7,7,0,2,1,4,1,2,0
2,0,3,8,1,1,1,0,0,1,0,...,3,7,7,0,2,1,4,1,2,2
3,2,2,8,1,7,1,0,1,1,0,...,3,7,7,0,2,1,4,0,3,4
4,2,3,3,0,6,1,1,0,0,1,...,3,7,7,0,2,1,0,1,0,0
