# 실습3. Scikit-learn
---
## 1. ML 모델개발 실습

In [None]:
import numpy as np
import pandas as pd
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Iris 데이터셋 예제 로드

In [None]:
# Example: load the Iris dataset bundled with scikit-learn
iris = datasets.load_iris()

In [None]:
# Show the container type, then the full contents of the loaded dataset
print(type(iris), iris, sep='\n')

In [None]:
# Display the `target_names` attribute of the loaded dataset
iris.target_names

In [None]:
# Features go into X, labels into y
X = iris.data
y = iris.target

In [None]:
# Display the shapes of the feature matrix and the label vector
X.shape, y.shape

In [None]:
# Display the feature values
X

In [None]:
# Display the labels
y

### 데이터셋 Split

In [None]:
# Split into train and test sets.
# NOTE: no random_state is passed here, so the split (and every score computed
# from it below) can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Display the shapes of the four pieces produced by the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Display the training features
X_train

In [None]:
# Display the training labels
y_train

### 머신러닝 모델 학습

- 의사결정트리 모델을 이용한 학습실행

In [None]:
# Classification with a decision tree: build the classifier with default
# settings and fit it on the training split
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and score the predictions against the true labels
y_pred = tree_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

### 추론 결과확인

In [None]:
# Display the predicted labels
y_pred

In [None]:
# Display the true labels of the test split for comparison
y_test

- Confusion Matrix

In [None]:
# Cross-tabulate true labels (rows) against predicted labels (columns)
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['real'],
    colnames=['pred'],
)

### 다른 ML 알고리즘 적용

In [None]:
# Classification with an SVM (support vector machine): same train/predict/score
# flow as the decision tree, with a default SVC
from sklearn import svm

svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

y_pred = svm_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Cross-tabulate true labels (rows) against the SVM predictions (columns)
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['real'],
    colnames=['pred'],
)

---
## 2. Scikit-learn의 유틸리티들

### 데이터셋 Split

In [None]:
# Split into train and test sets with the default settings
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Report the number of rows in each piece
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Hold out 20% of the samples as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Report the number of rows in each piece
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Hold out 10% of the samples as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Report the number of rows in each piece
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

- 반복수행시에도 동일한 결과를 리턴하도록 random_state(난수 시드) 설정

In [None]:
# Pass a fixed seed as random_state so repeated runs use the same seed
random_seed = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=random_seed
)

# Report the number of rows in each piece
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

### Scaler

In [None]:
# Display the first 10 training rows before any scaling
X_train[:10]

- StandardScaler : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then apply it to the same rows
scaler1 = StandardScaler()
scaler1.fit(X_train)
X1 = scaler1.transform(X_train)

# First 10 scaled rows
X1[:10]

- MinMaxScaler : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html?highlight=minmax#sklearn.preprocessing.MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features, then apply it to the same rows
scaler2 = MinMaxScaler()
scaler2.fit(X_train)
X2 = scaler2.transform(X_train)

# First 10 scaled rows
X2[:10]

### OneHotEncoder
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder

In [None]:
# Small sample table with numeric and categorical columns, given column by column
df = pd.DataFrame({
    'hours': [2, 3, 3, 5, 7, 2, 9],
    'attendance': [1, 2, 4, 5, 5, 5, 2],
    'sex': ['male', 'female', 'male', 'female', 'female', 'male', 'male'],
    'cate': ['A', 'C', 'B', 'A', 'B', 'A', 'C'],
    'score': [3, 5, 7, 10, 12, 7, 13],
})
df

- 원핫인코딩

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Request a dense array instead of a sparse matrix.
# The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back to the old one
# on releases that do not know it yet.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Fit on the 'sex' column (passed as a one-column DataFrame) and encode it
ohe.fit(df[['sex']])
ohe.transform(df[['sex']])

In [None]:
# Refit the same encoder on the 'cate' column and encode it
cate_frame = None  # placeholder removed below to avoid leaving a stray name
del cate_frame
ohe.fit(df[['cate']]).transform(df[['cate']])

### LabelEncoder
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html?highlight=labelencoder#sklearn.preprocessing.LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the 'sex' column, then encode that same column
lbe = LabelEncoder().fit(df['sex'])
lbe.transform(df['sex'])

In [None]:
# Use a fresh encoder for the 'cate' column, then encode that same column
lbe = LabelEncoder().fit(df['cate'])
lbe.transform(df['cate'])

### Make_column_transformer
- https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html

In [None]:
# Sample table with one categorical column and two numeric rating columns.
# NOTE: this rebinds the notebook-level name X used for the Iris features above.
X = pd.DataFrame(dict(
    city=['London', 'London', 'Paris', 'Sallisaw'],
    rating1=[5, 3, 4, 5],
    rating2=[4, 5, 4, 3],
))
X

In [None]:
from sklearn.compose import make_column_transformer
# Per-column preprocessing: one-hot encode 'city', standard-scale 'rating1',
# and apply MinMaxScaler to every remaining column via `remainder`
column_trans = make_column_transformer( (OneHotEncoder(), ['city']),
                                        (StandardScaler(), ['rating1']),
                                        remainder=MinMaxScaler())

In [None]:
# Fit the column transformer on X and display the transformed result
column_trans.fit_transform(X)

### SimpleImputer
- https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer

In [None]:
from sklearn.impute import SimpleImputer

# Sample data with one missing entry in the first column
X1 = [[1, 2], [np.nan, 3], [7, 6]]

# Replace NaN with the column mean; other strategies include
# "median", "most_frequent" and "constant"
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit(X1)

In [None]:
# Apply the already-fitted imputer to new data that contains missing values
X2 = [[np.nan, 2], [6, np.nan], [7, 6]]
imp.transform(X2)

- X1의 컬럼별 평균값을 이용하여 Impute 실행된 것 확인

In [None]:
# Column-wise means of X1, for comparison with the imputed values above
pd.DataFrame(X1).mean()

### IterativeImputer
- https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html#sklearn.impute.IterativeImputer

In [None]:
# Side-effect import: has to run before IterativeImputer can be imported,
# because the estimator is still experimental in scikit-learn
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# Fit an imputer with default settings on two-column sample data that has
# one missing entry in each column
imp = IterativeImputer()
X1 = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
imp.fit(X1)

- 두 컬럼간 correlation정보를 반영하여 regression 기반으로 Impute 실행

In [None]:
# Apply the fitted imputer to new rows that each have one missing value
X2 = [[np.nan, 2], [6, np.nan], [np.nan, 6], [np.nan, 100]]
imp.transform(X2)

### KNNImputer
- https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html?highlight=knnimputer#sklearn.impute.KNNImputer

In [None]:
from sklearn.impute import KNNImputer

# Imputer that fills a missing entry from its 2 nearest neighbours
imputer = KNNImputer(n_neighbors=2)

# Sample data with one missing entry in the first column
X1 = [
    [1, 2, 3],
    [3, 6, 8],
    [np.nan, 7, 7],
    [8, 8, 7],
]
imputer.fit_transform(X1)

### PCA를 이용한 차원축소

In [None]:
# Unsupervised algorithm: PCA.
# Fit on the original Iris features and keep 2 components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
transformed_X = pca.fit_transform(iris.data)

In [None]:
# First 10 rows of the original features
iris.data[:10]

In [None]:
# First 10 rows after the PCA projection
transformed_X[:10]