In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.mixture import BayesianGaussianMixture

In [7]:
# Load the three pre-processed datasets (HDI, mortality, sanitation) in one
# pass; the tuple order below matches the order of the names being assigned.
idh, mortality, sanitation = map(
    pd.read_csv,
    (
        '../data/processed/IDH_data.csv',
        '../data/processed/mortality_data.csv',
        '../data/processed/sanitation_data.csv',
    ),
)

Inicialmente, filtramos os dados para o ano de 2016. Em seguida, selecionamos apenas os países que possuem valores para os três indicadores de saneamento utilizados. Assim, evitamos a ausência de algum parâmetro na aplicação do clustering.

In [8]:
# Keep only the 2016 observations of each dataset.
idh16 = idh.loc[idh['year'] == 2016]
mortality16 = mortality.loc[mortality['year'] == 2016]
sanitation16 = sanitation.loc[sanitation['year'] == 2016]

# The three sanitation indicators used as clustering features. The order of
# this list is also the order of the feature columns in `df`.
sanitation_indicators = [
    'Proportion of population using at least basic sanitation services',
    'Proportion of population using at least basic drinking water services',
    'Proportion of population practising open defecation',
]

sanitation_majority = sanitation16.loc[
    sanitation16['Indicator'].isin(sanitation_indicators)
]

# Reshape to one row per country and one column per indicator.
# - drop_duplicates keeps the first observation of a (country, indicator)
#   pair, so a repeated row can neither break the pivot nor exclude an
#   otherwise complete country.
# - reindex guarantees the three feature columns exist, in a fixed order.
# - dropna keeps only countries with a value for every indicator, so no
#   missing feature reaches the clustering model.
sanitation_wide = (
    sanitation_majority
    .drop_duplicates(subset=['country', 'Indicator'])
    .pivot(index='country', columns='Indicator', values='OBS_VALUE')
    .reindex(columns=sanitation_indicators)
    .dropna()
)

# Countries that have all three indicators, sorted by name.
countries = sanitation_wide.index.to_numpy()

columns = ['country'] + sanitation_indicators
# reset_index restores a 0..n-1 integer index with `country` as a regular
# column; rename_axis removes the leftover 'Indicator' label on the columns.
df = sanitation_wide.reset_index().rename_axis(columns=None)[columns]

assert not df.empty, 'no country has all three sanitation indicators for 2016'

Com os dados prontos, vamos executar o modelo de clustering e analisar como ocorre o agrupamento dos países.

In [9]:
# Feature matrix: every column of `df` except the country label.
features = df.drop(columns=['country'])

# n_components is the number of mixture components made available to the
# model; the recorded run below assigned countries to only a few of them.
# random_state fixes the initialisation so that Restart & Run All yields the
# same clusters and the same cluster ids. Outputs recorded before the seed
# was added may use different ids.
bgm = BayesianGaussianMixture(n_components=20, n_init=10, random_state=42)

# Fit the model and label the same rows in a single call.
predictions = bgm.fit_predict(features)

# Build the label column on df's own index so the join aligns row by row
# regardless of how `df` is indexed.
df_cluster = pd.DataFrame({'cluster': predictions}, index=df.index)
final_df = df.join(df_cluster)

# List the member countries of every cluster, in order of first appearance.
clusters_possiveis = final_df['cluster'].unique()
for cluster in clusters_possiveis:
    print('\ncluster ' + str(cluster) + ':')
    print(final_df.loc[final_df['cluster'] == cluster, 'country'])


cluster 1:
0            Afghanistan
3         American Samoa
5                 Angola
13            Bangladesh
25               Burundi
27              Cameroon
34               Comoros
39              Djibouti
45     Equatorial Guinea
48              Ethiopia
54                 Gabon
57                 Ghana
65                Guinea
66         Guinea-Bissau
68                 Haiti
82                 Kenya
95                Malawi
98                  Mali
107             Mongolia
109              Morocco
110           Mozambique
112                Nauru
117            Nicaragua
119              Nigeria
122             Pakistan
125     Papua New Guinea
134               Rwanda
140              Senegal
143         Sierra Leone
148              Somalia
152                Sudan
156           Tajikistan
164               Uganda
171              Vanuatu
173                Yemen
174               Zambia
175             Zimbabwe
Name: country, dtype: object

cluster 0:
1                   Al

Com os clusters definidos, podemos comparar as medianas de cada indicador de saneamento básico entre os grupos. Assim, é possível perceber, de modo geral, os indicadores de cada grupo.

In [10]:
# Print, for every cluster, the median of each sanitation indicator.
# Each entry pairs the text printed before the value with the column it
# summarises; the labels are kept exactly as in the recorded output.
indicator_labels = (
    ('Proportion of population using at least basic sanitation services: ',
     'Proportion of population using at least basic sanitation services'),
    ('Proportion of population using at least basic drinking water services: ',
     'Proportion of population using at least basic drinking water services'),
    ('Proportion of population practising open defecation ',
     'Proportion of population practising open defecation'),
)

# Cluster labels in order of first appearance in final_df.
clusters_possiveis = final_df['cluster'].unique()

for cluster in clusters_possiveis:
    # Countries assigned to this cluster, then the rows of those countries.
    p_cluster = final_df.loc[final_df['cluster'].eq(cluster), 'country'].values
    df_cluster = final_df[final_df['country'].isin(p_cluster)]
    print(f'cluster {cluster}:')
    for label, column in indicator_labels:
        print(label, df_cluster[column].median())

cluster 1:
Proportion of population using at least basic sanitation services:  42.99559
Proportion of population using at least basic drinking water services:  64.468964
Proportion of population practising open defecation  13.260993
cluster 0:
Proportion of population using at least basic sanitation services:  97.724113
Proportion of population using at least basic drinking water services:  99.187973
Proportion of population practising open defecation  0.0
cluster 3:
Proportion of population using at least basic sanitation services:  26.4891825
Proportion of population using at least basic drinking water services:  68.68840800000001
Proportion of population practising open defecation  43.966120000000004
cluster 4:
Proportion of population using at least basic sanitation services:  78.30978
Proportion of population using at least basic drinking water services:  93.124565
Proportion of population practising open defecation  6.02069925
