In [None]:
# Загрузка данных
import os
import pandas as pd
pd.set_option('display.max_columns', 25)
# Визуализация
import plotly.express as px
import plotly.graph_objects as go
# Сжатие и кластеризация
from umap import UMAP
from sklearn.cluster import KMeans, AgglomerativeClustering

In [None]:
# Data period: first and last year (both inclusive) averaged for each indicator.
# Stored as strings because they are used as column labels when slicing the CSV tables.
YEAR_1, YEAR_2 = '2012', '2018'

In [None]:
# Analysis hyper-parameters.
# NOTE: the 3D scatter plot further down addresses 'UMAP 1'..'UMAP 3' by name,
# so it expects exactly three embedding components.
umap_components: int = 3  # dimensionality of the UMAP embedding
n_clusters: int = 3       # number of clusters requested from the clustering step

In [None]:
# Indicator CSV files (one indicator per file) used as clustering features.
# The order matters: it defines the column order of the feature table.
files = [
    'Adolescent fertility rate (births per 1,000 women ages 15-19).csv',
    'CO2 emissions (metric tons per capita).csv',
    'Employment in agriculture, female.csv',
    'Employment in agriculture, male.csv',
    'Employment in industry, female.csv',
    'Employment in industry, male.csv',
    'Employment in services, female.csv',
    'Employment in services, male.csv',
    'Exports of goods and services (% of GDP).csv',
    'Fertility rate, total (births per woman).csv',
    'Fuel exports (% of exports).csv',
    'GDP per capita, PPP.csv',
    'Imports of goods and services (% of GDP).csv',
    'Life expectancy.csv',
    'Ores and metals exports (% of exports).csv',
    'Population ages 0-14 (% of total population).csv',
    'Population ages 15-64 (% of total population).csv',
    'Population ages 65 and above (% of total population).csv',
    'Population growth (annual %).csv',

    # Indicators with too few observations are deliberately left out:
    # 'Research and development expenditure (% of GDP).csv',
    # 'Cereal yield (kg per hectare).csv',
    # 'High-technology exports (% of exports).csv',
    # 'Income share held by fourth 20%.csv',
    # 'Income share held by highest 10%.csv',
    # 'Income share held by highest 20%.csv',
    # 'Income share held by lowest 10%.csv',
    # 'Income share held by lowest 20%.csv',
    # 'Income share held by second 20%.csv',
    # 'Income share held by third 20%.csv',
    # 'Intentional homicides (per 100,000 people).csv',
]

In [None]:
# Build the feature table: one row per country, one column per indicator file.
# Columns are added one at a time, so rows stay aligned to the countries of the first file.
df = pd.DataFrame()
for file in files:
    # The first 4 lines of each CSV are skipped before the header row is read.
    indicator_table = pd.read_csv(f'data/{file}', skiprows=4, index_col='Country Name')
    # Label-based, inclusive slice over the year columns YEAR_1..YEAR_2.
    period_values = indicator_table.loc[:, YEAR_1:YEAR_2]
    # Per-country mean over the period; the column is named after the file without '.csv'.
    df[file.replace('.csv', '')] = period_values.mean(axis=1)

In [None]:
# Show per-column completeness before filtering.
df.info()
# Keep only the countries that have a value for every indicator.
df = df.dropna(axis=0, how='any')
# Number of countries left for the analysis.
print(df.shape[0])

In [None]:
# Min-max normalization: rescale every column to the [0, 1] range.
column_min = df.min()
column_span = df.max() - column_min
df = (df - column_min) / column_span

In [None]:
# df.to_excel('data_norm.xlsx')

In [None]:
# Dimensionality reduction

umap_names = [f'UMAP {n+1}' for n in range(umap_components)]
umap = UMAP(n_components=umap_components, random_state=0)

# Fit only on the indicator columns (one per entry of `files`).
# `df` later gains the UMAP, 'cluster' and 'code' columns; selecting the
# features explicitly keeps this cell safe to re-run, because derived or
# non-numeric columns are never fed back into the embedding.
feature_columns = [name.replace('.csv', '') for name in files]
embedding = umap.fit_transform(df[feature_columns])

# Store each embedding axis as its own column ('UMAP 1', 'UMAP 2', ...).
for umap_name, umap_values in zip(umap_names, embedding.T):
    df[umap_name] = umap_values

In [None]:
# Clustering of the countries in the UMAP coordinate space.

# Alternative model kept for reference:
# clust = KMeans(n_clusters=n_clusters, random_state=0)
clust = AgglomerativeClustering(linkage='single', n_clusters=n_clusters)
embedding_coords = df.loc[:, umap_names]
# One integer cluster label per country.
df['cluster'] = clust.fit_predict(embedding_coords)

In [None]:
# Visualization of the reduced data (UMAP): 3D scatter, one point per country.

# Color scale shared by the cluster plots; the cluster label is mapped onto it.
COLORS = ['green', 'red', 'orange']

fig = px.scatter_3d(
    df,
    x='UMAP 1',
    y='UMAP 2',
    z='UMAP 3',
    color='cluster',
    color_continuous_scale=COLORS,
    hover_name=df.index,  # country name on hover
)

# Full-HD canvas minus an 8 px border on each side, no margins, no color bar.
fig.update_layout(
    plot_bgcolor='white',
    width=1920 - 2 * 8,
    height=1080 - 2 * 8,
    font_size=15,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    coloraxis_showscale=False,
)

# Optional export for the presentation:
# fig.write_html('Presentation/UMAP Dots.html')

fig.show()

In [None]:
# Mean value of every numeric column per cluster, ordered by life expectancy.
# numeric_only=True keeps this cell re-runnable after the string 'code' column
# has been added to `df`; without it the mean over that column raises.
df_cluster_mean = (
    df.groupby('cluster')
    .mean(numeric_only=True)
    .sort_values(by='Life expectancy')
)
df_cluster_mean

In [None]:
# df_cluster_mean.to_excel('cluster_mean.xlsx')

In [None]:
# Choosing the number of clusters
# Elbow method
def k_tuning(k):
    """Return the K-Means inertia of the UMAP embedding for ``k`` clusters.

    Fits ``KMeans(n_clusters=k)`` on the UMAP columns (``umap_names``) of the
    module-level ``df`` and returns the fitted model's ``inertia_``.

    Args:
        k: Number of clusters to fit.

    Returns:
        The ``inertia_`` attribute of the fitted model.
    """
    # Fixed seed: K-Means initialisation is random, so without it the elbow
    # curve changes from run to run. 0 matches the seed used for UMAP.
    clust = KMeans(n_clusters=k, random_state=0)
    # Only the fitted inertia is needed, so the labels are not computed separately.
    clust.fit(df[umap_names])
    return clust.inertia_

In [None]:
# Inertia for k = 1..20; position i holds the value for k = i + 1.
k_tuning_list = list(map(k_tuning, range(1, 20 + 1)))
# Inertia of the single-cluster solution.
one_cluster = k_tuning_list[0]

In [None]:
# Elbow plot: inertia against the number of clusters.
k_values = list(range(1, 20 + 1))

# Inertia curve, one marker per k.
inertia_curve = go.Scatter(
    x=k_values,
    y=k_tuning_list,
    mode='lines+markers',
    marker_color='black',
)
# Dashed reference line joining the first and the last point of the curve.
reference_line = go.Scatter(
    x=[1, 20],
    y=[k_tuning_list[0], k_tuning_list[-1]],
    marker_color='red',
    line_dash='dash',
)

fig = go.Figure()
fig.add_traces([inertia_curve, reference_line])
fig.update_layout(
    plot_bgcolor='white',
    width=1920 - 2 * 8,
    height=1080 - 2 * 8,
    font_size=15,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
)

# Optional export for the presentation:
# fig.write_html('Presentation/UMAP N Clusters.html')

fig.show()

In [None]:
# Read the country codes (needed as `locations` for the choropleth map).
# Take them from the first indicator file, which was already parsed when `df`
# was built, instead of the first entry returned by os.listdir: that order is
# arbitrary and the entry may not be an indicator CSV at all.
file = files[0]
codes = pd.read_csv(f'data/{file}', skiprows=4, index_col='Country Name')['Country Code']
# Aligned on the country-name index; countries missing from `codes` get NaN.
df['code'] = codes

In [None]:
# World map of the clusters: each country is colored by its cluster label.
fig = px.choropleth(
    df,
    locations='code',
    color='cluster',
    color_continuous_scale=COLORS,
    hover_name=df.index,  # country name on hover
    projection='natural earth',
)

# Full-HD canvas minus an 8 px border on each side, no margins, no color bar.
fig.update_layout(
    coloraxis_showscale=False,
    plot_bgcolor='white',
    width=1920 - 2 * 8,
    height=1080 - 2 * 8,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
)

# Optional export for the presentation:
# fig.write_html('Presentation/UMAP Map.html')

fig.show()

In [None]:
# Persist the full result table (normalized indicators, UMAP coordinates,
# cluster label and country code) next to the notebook.
df.to_excel(excel_writer='umap_result.xlsx')

In [None]:
# Mean inequality indicators per cluster.
# Start from GDP per capita (already min-max normalized in `df`) and the cluster label.
df_extra = df[['GDP per capita, PPP', 'cluster']].copy()

# The extra indicators live in 'data_extra', so they must be read from that
# same directory. Non-CSV entries are skipped and the names are sorted so the
# resulting column order is deterministic.
extra_files = sorted(name for name in os.listdir('data_extra') if name.endswith('.csv'))
for file in extra_files:
    # Per-country mean over the YEAR_1..YEAR_2 columns (raw values, not normalized).
    series = pd.read_csv(f'data_extra/{file}', skiprows=4, index_col='Country Name').loc[:, YEAR_1:YEAR_2].mean(axis=1)
    df_extra[file.replace('.csv', '')] = series

In [None]:
# Per-cluster means of the extra indicators, ordered by GDP per capita.
extra_means_by_cluster = df_extra.groupby('cluster').mean()
df_extra_cluster_mean = extra_means_by_cluster.sort_values(by='GDP per capita, PPP')
df_extra_cluster_mean

In [None]:
# df_extra_cluster_mean.to_excel('extra_cluster_mean.xlsx')