# 실습하기

## 데이터 준비하기

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, confusion_matrix, classification_report
from sklearn.cluster import KMeans

# Load the iris dataset from the course repository (requires network access).
df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv")

# Summary statistics for every column, including the non-numeric `species`.
print(df.describe(include='all'))

# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
df.info()

        sepal_length  sepal_width  petal_length  petal_width species
count     150.000000   150.000000    150.000000   150.000000     150
unique           NaN          NaN           NaN          NaN       3
top              NaN          NaN           NaN          NaN  setosa
freq             NaN          NaN           NaN          NaN      50
mean        5.843333     3.057333      3.758000     1.199333     NaN
std         0.828066     0.435866      1.765298     0.762238     NaN
min         4.300000     2.000000      1.000000     0.100000     NaN
25%         5.100000     2.800000      1.600000     0.300000     NaN
50%         5.800000     3.000000      4.350000     1.300000     NaN
75%         6.400000     3.300000      5.100000     1.800000     NaN
max         7.900000     4.400000      6.900000     2.500000     NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        ------

## 데이터 전처리

### 결측치 처리

In [10]:
# Count the missing values in each column of df.
missing_counts = df.isnull().sum()
print(missing_counts)

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64


### 데이터 정규화

In [11]:
# Rescale each measurement column with min-max scaling, one column at a time.
scaler = MinMaxScaler()
for feature in ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']:
    # The same scaler object is re-fitted for every column, so once the loop
    # finishes it holds the parameters of the last column only.
    df[feature] = scaler.fit_transform(df[[feature]])


### 라벨 인코딩

In [12]:
# Replace the species names with integer class codes.
species_encoder = LabelEncoder()
df['species'] = species_encoder.fit_transform(df['species'])

## 분석 데이터셋 준비

In [13]:
# Feature matrix: every column of df except the `species` label.
x = df.drop(columns=['species'])
print(x)

     sepal_length  sepal_width  petal_length  petal_width
0        0.222222     0.625000      0.067797     0.041667
1        0.166667     0.416667      0.067797     0.041667
2        0.111111     0.500000      0.050847     0.041667
3        0.083333     0.458333      0.084746     0.041667
4        0.194444     0.666667      0.067797     0.041667
..            ...          ...           ...          ...
145      0.666667     0.416667      0.711864     0.916667
146      0.555556     0.208333      0.677966     0.750000
147      0.611111     0.416667      0.711864     0.791667
148      0.527778     0.583333      0.745763     0.916667
149      0.444444     0.416667      0.694915     0.708333

[150 rows x 4 columns]


## 모델 평가 : 최적의 cluster 값 찾기

In [21]:
# Elbow method: fit K-means for k = 1..9 and record each model's inertia.
inertias = []
for i in range(1, 10):
    model = KMeans(
        n_clusters=i,
        max_iter=50,
        n_init=10,
        random_state=42,
    ).fit(x)  # fit() returns the fitted estimator itself
    inertias.append(model.inertia_)
    # Show k next to the inertia that was just recorded.
    print(i, inertias[-1])

1 41.16611042137329
2 12.127790750538193
3 6.9822164737852335
4 5.51693347204037
5 4.580948640117294
6 3.976424248209841
7 3.473622320733247
8 3.1456415905967794
9 2.814016644149029


## 군집화하기

In [16]:
# Final model: K-means with three clusters, fitted on the feature matrix x.
cluster = KMeans(
    n_clusters=3,
    n_init=10,
    max_iter=500,
    random_state=42,
)
cluster.fit(x)

In [17]:
# Cluster index assigned to every row of x by the fitted model.
cluster_prediction = cluster.predict(x)

# Centroid coordinates: one row per cluster, one column per feature.
cluster_center = cluster.cluster_centers_
centers_table = pd.DataFrame(cluster_center)

print(centers_table)
print(cluster_prediction)

          0         1         2         3
0  0.196111  0.595000  0.078305  0.060833
1  0.441257  0.307377  0.575715  0.549180
2  0.707265  0.450855  0.797045  0.824786
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 2 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 1 2
 2 1]


In [18]:
# Store each row's K-means cluster index in df under the `cluster` column.
# NOTE(review): this mutates df in place; any cell that rebuilds x from df
# after this point would pick up `cluster` as an extra feature.
df['cluster'] = cluster_prediction

# 연습하기

In [22]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, confusion_matrix, classification_report 
from sklearn.cluster import KMeans

# Reload the iris dataset from the course repository (requires network access).
df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv")

# Summary statistics for every column, including the non-numeric `species`.
print(df.describe(include='all'))

# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
df.info()

# scaler = MinMaxScaler()
# df['sepal_length'] = scaler.fit_transform(df[['sepal_length']])
# df['sepal_width'] = scaler.fit_transform(df[['sepal_width']])
# df['petal_length'] = scaler.fit_transform(df[['petal_length']])
# df['petal_width'] = scaler.fit_transform(df[['petal_width']])

# Replace the species names with integer class codes.
species_encoder = LabelEncoder()
df['species'] = species_encoder.fit_transform(df['species'])

# Feature matrix: every column of df except the `species` label.
x = df.drop('species',axis=1)

# Elbow method: fit K-means for k = 1..9 and record each model's inertia.
# The list is created once, before the loop; creating it inside the loop
# reset it on every iteration, so it never held more than one value.
inertias = []
for i in range(1,10):
    model = KMeans(n_init = 10, n_clusters=i, max_iter= 50, random_state = 42)
    model.fit(x)
    inertias.append(model.inertia_)
    # Print k with the inertia just recorded (not the whole growing list).
    print(i, inertias[-1])

# Fit the final three-cluster model on the feature matrix x.
clusters = KMeans(
    n_clusters=3,
    n_init=10,
    max_iter=50,
    random_state=42,
).fit(x)  # fit() returns the fitted estimator itself

# Centroid coordinates: one row per cluster.
cluster_center = clusters.cluster_centers_
print(cluster_center)

# Cluster index assigned to every row of x.
prediction = clusters.predict(x)
print(prediction)

        sepal_length  sepal_width  petal_length  petal_width species
count     150.000000   150.000000    150.000000   150.000000     150
unique           NaN          NaN           NaN          NaN       3
top              NaN          NaN           NaN          NaN  setosa
freq             NaN          NaN           NaN          NaN      50
mean        5.843333     3.057333      3.758000     1.199333     NaN
std         0.828066     0.435866      1.765298     0.762238     NaN
min         4.300000     2.000000      1.000000     0.100000     NaN
25%         5.100000     2.800000      1.600000     0.300000     NaN
50%         5.800000     3.000000      4.350000     1.300000     NaN
75%         6.400000     3.300000      5.100000     1.800000     NaN
max         7.900000     4.400000      6.900000     2.500000     NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        ------