In [13]:
from pandas import read_csv
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

from base.pandas_constants import AlgorithmConstants, DataFrameConstants
df = read_csv(AlgorithmConstants.FINAL_DF_PATH)
df.head()


Unnamed: 0,rain_day,rain_hour,elevation,ground_type,ground_amplitude,slope_degree,slope_percentage,danger_level
0,21.9,0.4,36.0,A2,20 a 100m,10 a 30º,0 a 60%,Baixa
1,3.8,1.0,33.0,LVd4,20 a 100m,10 a 30º,0 a 60%,Media
2,9.2,0.4,43.0,URBANO,20 a 100m,10 a 30º,0 a 60%,Media
3,9.6,0.6,37.0,LVd4,20 a 100m,10 a 30º,0 a 60%,Media
4,9.2,0.4,25.0,LVd4,20 a 100m,10 a 30º,0 a 60%,Media


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1362 entries, 0 to 1361
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rain_day          1362 non-null   float64
 1   rain_hour         1362 non-null   float64
 2   elevation         1362 non-null   float64
 3   ground_type       1362 non-null   object 
 4   ground_amplitude  1362 non-null   object 
 5   slope_degree      1362 non-null   object 
 6   slope_percentage  1362 non-null   object 
 7   danger_level      1362 non-null   object 
dtypes: float64(3), object(5)
memory usage: 85.2+ KB


In [15]:
label_encoders = {}
categorical_columns = [
    DataFrameConstants.GROUND_TYPE,
    DataFrameConstants.GROUND_AMPLITUDE,
    DataFrameConstants.SLOPE_DEGREE,
    DataFrameConstants.SLOPE_PERCENTAGE,
    DataFrameConstants.DANGER_LEVEL
]

for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

features = df[[DataFrameConstants.RAIN_DAY, DataFrameConstants.RAIN_HOUR, DataFrameConstants.ELEVATION] + categorical_columns]
print(features)

      rain_day  rain_hour  elevation  ground_type  ground_amplitude  \
0         21.9        0.4       36.0            0                 3   
1          3.8        1.0       33.0            4                 3   
2          9.2        0.4       43.0            6                 3   
3          9.6        0.6       37.0            4                 3   
4          9.2        0.4       25.0            4                 3   
...        ...        ...        ...          ...               ...   
1357       5.0        0.0       22.0            5                 1   
1358      27.0        0.0       22.0            5                 3   
1359       3.0        0.0       22.0            5                 3   
1360       0.0        0.0       31.0            6                 3   
1361       0.0        0.0       16.0            6                 3   

      slope_degree  slope_percentage  danger_level  
0                2                 4             1  
1                2                 4     

In [None]:

best_k = 0
best_inertia = None
best_columns = None
range_values = range(2, 11)

for num_clusters in range_values:
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    kmeans.fit(features)
    if best_inertia is None or kmeans.inertia_ < best_inertia:
        best_k = num_clusters
        best_inertia = kmeans.inertia_
        best_columns = features.columns.tolist()

print(f'Best K: {best_k}, Best Inertia: {best_inertia}')

In [18]:
best_k = 0
best_inertia = None
best_columns = None
best_config = None
range_values = [2, 4, 8, 16, 32, 64, 128]
random_states = [0, 2, 4, 8, 16, 32, 64]
max_iters = [300, 500, 1000]
algorithms = ['lloyd', 'elkan']
best_k_means = None
results = []

for num_clusters in range_values:
    for random_state in random_states:
        for algorithm in algorithms:
            kmeans = KMeans(n_clusters=num_clusters, random_state=random_state, algorithm=algorithm)
            kmeans.fit(features)
            print(f'Inertia: {kmeans.inertia_}, Random State: {random_state}, Algorithm: {algorithm}, Columns: {features.columns.tolist()}')
            if best_inertia is None or kmeans.inertia_ < best_inertia:
                best_k = num_clusters
                best_k_means = kmeans
                best_inertia = kmeans.inertia_
                best_columns = features.columns.tolist()
                best_config = (random_state, algorithm)

print(f'Best K: {best_k}, Best Inertia: {best_inertia}, Best Config: {best_config}')

Inertia: 522431.8564782609, Random State: 0, Algorithm: lloyd, Columns: ['rain_day', 'rain_hour', 'elevation', 'ground_type', 'ground_amplitude', 'slope_degree', 'slope_percentage', 'danger_level']
Inertia: 522431.8564782609, Random State: 0, Algorithm: elkan, Columns: ['rain_day', 'rain_hour', 'elevation', 'ground_type', 'ground_amplitude', 'slope_degree', 'slope_percentage', 'danger_level']
Inertia: 522431.8564782609, Random State: 2, Algorithm: lloyd, Columns: ['rain_day', 'rain_hour', 'elevation', 'ground_type', 'ground_amplitude', 'slope_degree', 'slope_percentage', 'danger_level']
Inertia: 522431.8564782609, Random State: 2, Algorithm: elkan, Columns: ['rain_day', 'rain_hour', 'elevation', 'ground_type', 'ground_amplitude', 'slope_degree', 'slope_percentage', 'danger_level']
Inertia: 522431.8564782609, Random State: 4, Algorithm: lloyd, Columns: ['rain_day', 'rain_hour', 'elevation', 'ground_type', 'ground_amplitude', 'slope_degree', 'slope_percentage', 'danger_level']
Inertia: 5

In [20]:
print(best_k_means.predict(features.head()))

[89 95 54 32 67]
