# $k$-Means and Cluster Storing

Author: `Márcio Lopes Jr` 

*Master's student of `Computer Engineering, Intelligent Information Processing` at UFRN-Natal*.

In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import os

# Clustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, Normalizer, PowerTransformer

## Paths for Clustering Files

Municipality classification $ A_c $ -> **`Ac.csv`**

Cluster centres $ C_{ci} $ -> **`Cci.csv`**

In [None]:
# Name of the subfolder (under data/) that receives the clustering output
res_subfolder = "files"

# Directory shared by both output files
_cluster_dir = f"data/{res_subfolder}"

# Ac.csv holds the municipalities' cluster labels; Cci.csv holds the cluster centres
path_cities_classes = f"{_cluster_dir}/Ac.csv"
path_cluster_centres_abs = f"{_cluster_dir}/Cci.csv"

/content/drive/My Drive/The Sound of Drums/Notebooks/data/tests4/km_figure_{}C_{}T.svg


## Preprocessing

Steps:
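1. Create a copy of the data without outliers (municipalities whose `ptb_rate` lies more than three standard deviations from the mean)
2. Normalise both datasets (row normalisation, Yeo-Johnson power transform, then min-max scaling)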
3. Keep record of the general mean (mean value of features for all municipalities)
4. Reduce datasets with PCA

In [None]:
# Read the municipality table and index it by IBGE code
df = pd.read_csv("data/A0.csv").set_index('cd_ibge')

# Training frame: keep only rows whose ptb_rate lies strictly inside
# mean ± 3 standard deviations; df_copy keeps an untouched copy of everything
lower = df.ptb_rate.mean() - (df.ptb_rate.std()*3)
upper = df.ptb_rate.mean() + (df.ptb_rate.std()*3)
inside_bounds = (df.ptb_rate > lower) & (df.ptb_rate < upper)
df_train = df[inside_bounds]
df_copy = df.copy()

# Scaling chain: Normalizer -> PowerTransformer (Yeo-Johnson by default) -> MinMaxScaler
normalizer = Normalizer()
yeojohnson = PowerTransformer()
minmax = MinMaxScaler()


def _fit_rescale(frame):
    """Refit the three transformers on *frame* and return the transformed values."""
    normalised = normalizer.fit_transform(frame)
    power_transformed = yeojohnson.fit_transform(normalised)
    return minmax.fit_transform(power_transformed)


# Outlier-free sample, rescaled in place so index and column labels survive
train_sample = df_train.copy()
train_sample[:] = _fit_rescale(train_sample)

# Full sample, rescaled the same way. The transformers are refit here, so the
# state they hold afterwards comes from the full data, not the training data.
# NOTE(review): the two samples are scaled by separately fitted transformers
# while the PCA below is fitted on the training sample only - confirm intended.
sample = df.copy()
sample[:] = _fit_rescale(sample)

# PCA retaining 95% of the variance, fitted on the training sample.
# NOTE(review): the last three columns are left out of the projection -
# presumably they are not clustering features; verify against A0.csv.
pca = PCA(n_components=0.95)
train_features = train_sample.iloc[:,:-3]
all_features = sample.iloc[:,:-3]
pca.fit(train_features)
pca_train_sample = pd.DataFrame(pca.transform(train_features), index=train_sample.index)
pca_sample = pd.DataFrame(pca.transform(all_features), index=sample.index)

## Main Block - Clustering

Steps:
1. Train k-means on `pca_train_sample` (no outliers)
2. Generate clusters on `pca_sample` (all included)
3. Save municipalities' clusters to external file
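4. Calculate each cluster's mean PTB rate
5. If a cluster's mean is more than `threshold` (10%) above or below the overall PTB mean, save the cluster centre (feature-wise median of its municipalities)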

In [None]:
# Sweep settings: every cluster count in [min_cluster, max_cluster] is fitted
# `runs` times; `threshold` is the relative distance from the overall PTB mean
# that makes a cluster worth storing.
min_cluster = 2
max_cluster = 30
runs = 10
threshold = .1

# The overall PTB mean never changes inside the sweep, so the two selection
# cut-offs are computed once instead of twice per run.
overall_ptb_mean = df.ptb_rate.mean()
high_ptb_cutoff = (1+threshold)*overall_ptb_mean
low_ptb_cutoff = (1-threshold)*overall_ptb_mean

for i in range(min_cluster, max_cluster+1):
    print('', end='\r')

    for t in range(0, runs):
        # Fit k-means with i clusters on the outlier-free PCA sample.
        # NOTE(review): scikit-learn renamed algorithm='full' to 'lloyd' in 1.1
        # and later dropped the old name - confirm against the installed version.
        km = KMeans(i, init='random', max_iter=10000, algorithm='full', tol=1e-7)
        km.fit(pca_train_sample)

        # Label every municipality (outliers included) with a single predict
        # call and reuse the result everywhere below.
        pred = km.predict(pca_sample)
        df2 = sample.copy()
        df2['cluster'] = pred
        df2['ptb_rate'] = df.ptb_rate  # raw (untransformed) PTB rate

        # Append this run's labels to the classification file; the header is
        # written only when the file does not exist yet.
        log_cidades = df2[['ptb_rate', 'cluster']].copy()
        log_cidades['test_number'] = t
        log_cidades['n_clusters'] = i
        log_cidades.to_csv(path_cities_classes,  mode='a', header=(not os.path.exists(path_cities_classes)))

        # Per-cluster mean and median of the raw PTB rate. Grouping directly by
        # the label array avoids temporarily adding a 'cluster' column to the
        # shared `sample` / `df_copy` frames (which would linger there if the
        # run were interrupted before the column was dropped again).
        distribution = df_copy.ptb_rate.groupby(pred).agg(['mean', 'median'])
        distribution.columns = ['media', 'mediana']

        # Clusters whose mean PTB rate is beyond the cut-offs on either side
        large_ptb_rate_groups = distribution[(distribution.media > high_ptb_cutoff)].index.values
        small_ptb_rate_groups = distribution[(distribution.media < low_ptb_cutoff)].index.values
        groups_to_analyse = list(large_ptb_rate_groups) + list(small_ptb_rate_groups)

        # +1 for high-PTB clusters, -1 for low-PTB clusters, 0 otherwise
        df2['type_of_cluster'] = np.int64(df2.cluster.isin(large_ptb_rate_groups)) - np.int64(df2.cluster.isin(small_ptb_rate_groups))
        print(i, t, len(large_ptb_rate_groups), len(small_ptb_rate_groups))

        # Store the centre of every selected cluster
        for c in groups_to_analyse:
            # Centre = feature-wise median of the untransformed data of the
            # municipalities assigned to cluster c
            arr = pd.Series(df_copy[df2.cluster == c].median(axis=0))
            arr.index = sample.columns
            # Reshaping is kept exactly as before so appended rows stay
            # compatible with rows already in the file.
            # NOTE(review): presumably yields one wide row per cluster - verify.
            arr_df = pd.DataFrame(arr).reset_index().pivot_table(columns=df_copy.columns)
            arr_df['cluster'] = c
            arr_df['test_number'] = t
            arr_df['n_clusters'] = i
            arr_df.to_csv(path_cluster_centres_abs, mode='a', header=(not os.path.exists(path_cluster_centres_abs)))