In [1]:
#Importando Bibliotecas
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
#Lendo meu dataframe
dataframe = pd.read_csv("seattle-weather.csv")

In [16]:
#Verificando os 10 primeiros elementos
dataframe.head(10)

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain
5,2012-01-06,2.5,4.4,2.2,2.2,rain
6,2012-01-07,0.0,7.2,2.8,2.3,rain
7,2012-01-08,0.0,10.0,2.8,2.0,sun
8,2012-01-09,4.3,9.4,5.0,3.4,rain
9,2012-01-10,1.0,6.1,0.6,3.4,rain


In [4]:
#Selecionando o que vai ser clusterizado
parametros = ["precipitation","temp_max","temp_min","wind"]

In [5]:
#Retirando dados que não são válidos como data e strings
dataframe = dataframe.dropna(subset=parametros)

In [6]:
#Copiamos o dataset para podemos clusterizar
df = dataframe[parametros].copy()
df

Unnamed: 0,precipitation,temp_max,temp_min,wind
0,0.0,12.8,5.0,4.7
1,10.9,10.6,2.8,4.5
2,0.8,11.7,7.2,2.3
3,20.3,12.2,5.6,4.7
4,1.3,8.9,2.8,6.1
...,...,...,...,...
1456,8.6,4.4,1.7,2.9
1457,1.5,5.0,1.7,1.3
1458,0.0,7.2,0.6,2.6
1459,0.0,5.6,-1.0,3.4


In [7]:
#Precisamos preprocessar os dados
#Ajustando a escala das minhas colunas para que nenhuma domina a outra já que o kmeans é sensivel 

In [8]:
#Importando uma forma de pré processamento básico MinMax
from sklearn.preprocessing import MinMaxScaler

In [9]:
scaler = MinMaxScaler()
#Nomalizando Precipitação
scaler.fit(df[["precipitation"]])
df["precipitation"] = scaler.transform(df[["precipitation"]])
#Normalizando temperature_max
scaler.fit(df[["temp_max"]])
df["temp_max"] = scaler.transform(df[["temp_max"]])
#Normalizando temperature_min
scaler.fit(df[["temp_min"]])
df["temp_min"] = scaler.transform(df[["temp_min"]])
#Normalizando winf
scaler.fit(df[["wind"]])
df["wind"] = scaler.transform(df[["wind"]])
print(df)

      precipitation  temp_max  temp_min      wind
0          0.000000  0.387097  0.476378  0.472527
1          0.194991  0.327957  0.389764  0.450549
2          0.014311  0.357527  0.562992  0.208791
3          0.363148  0.370968  0.500000  0.472527
4          0.023256  0.282258  0.389764  0.626374
...             ...       ...       ...       ...
1456       0.153846  0.161290  0.346457  0.274725
1457       0.026834  0.177419  0.346457  0.098901
1458       0.000000  0.236559  0.303150  0.241758
1459       0.000000  0.193548  0.240157  0.329670
1460       0.000000  0.193548  0.196850  0.340659

[1461 rows x 4 columns]


In [10]:
df.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,0.054194,0.484922,0.603731,0.312213
std,0.119503,0.197574,0.197756,0.158003
min,0.0,0.0,0.0,0.0
25%,0.0,0.327957,0.452756,0.197802
50%,0.0,0.462366,0.606299,0.285714
75%,0.050089,0.639785,0.759843,0.395604
max,1.0,1.0,1.0,1.0


In [12]:
km = KMeans(n_clusters=5)
y_predicted = km.fit_predict(df)
y_predicted

  super()._check_params_vs_input(X, default_n_init=10)


array([1, 1, 4, ..., 3, 3, 3])

In [17]:
df["cluster"] = y_predicted
df.head(10)

Unnamed: 0,precipitation,temp_max,temp_min,wind,cluster
0,0.0,0.387097,0.476378,0.472527,1
1,0.194991,0.327957,0.389764,0.450549,1
2,0.014311,0.357527,0.562992,0.208791,4
3,0.363148,0.370968,0.5,0.472527,2
4,0.023256,0.282258,0.389764,0.626374,1
5,0.044723,0.16129,0.366142,0.197802,3
6,0.0,0.236559,0.389764,0.208791,3
7,0.0,0.311828,0.389764,0.175824,3
8,0.076923,0.295699,0.476378,0.32967,3
9,0.017889,0.206989,0.30315,0.32967,3


In [18]:
df1 = df[df.cluster==0]
df2 = df[df.cluster==1]
df3 = df[df.cluster==2]
df4 = df[df.cluster==3]

In [20]:
# Supondo que você tenha colunas chamadas 'feature1' e 'feature2' no DataFrame df
plt.scatter(df1['feature1'], df1['feature2'], color='red', label='Cluster 0')
plt.scatter(df2['feature1'], df2['feature2'], color='blue', label='Cluster 1')
plt.scatter(df3['feature1'], df3['feature2'], color='green', label='Cluster 2')
plt.scatter(df4['feature1'], df4['feature2'], color='purple', label='Cluster 3')

# Adicione os centróides dos clusters
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], color='black', marker='X', s=200, label='Centroids')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Clusters')
plt.legend()
plt.show()

KeyError: 'feature1'