In [1]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import json
import folium
from sklearn.cluster import KMeans, DBSCAN

In [2]:
# Read votation data
df = pd.read_pickle("data/data.pkl")

In [3]:
df.iloc[:100]

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton,Pays
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich,Suisse
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich,Suisse
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich,Suisse
3,Aeugst am Albis,03.03.1991 Encouragement des transports publics,835.0,321.0,38.4,312.0,128.0,184.0,41.0,Affoltern,Zürich,Suisse
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich,Suisse
5,Aeugst am Albis,07.03.2010 Recherche sur l'être humain,1262.0,588.0,46.6,579.0,476.0,103.0,82.2,Affoltern,Zürich,Suisse
6,Aeugst am Albis,08.02.2004 Initiative pour l'internement des d...,1135.0,590.0,52.0,578.0,266.0,312.0,46.0,Affoltern,Zürich,Suisse
7,Aeugst am Albis,"22.09.2002 L'or à l'AVS, aux cantons et à la F...",1113.0,610.0,54.8,604.0,325.0,267.0,54.9,Affoltern,Zürich,Suisse
8,Aeugst am Albis,24.11.2002 Contre les abus dans le droit d'asile,1117.0,662.0,59.3,648.0,282.0,366.0,43.5,Affoltern,Zürich,Suisse
9,Aeugst am Albis,01.12.1985 Suppression de la vivisection,683.0,363.0,53.1,358.0,135.0,223.0,37.7,Affoltern,Zürich,Suisse


In [4]:
commune_list = list(set(df['Commune'].values))
commune_dict = { val : idx for idx , val in enumerate(commune_list)   }

votation_list = list(set(df['Votation'].values))
votation_dict = { val : idx for idx , val in enumerate(votation_list)   }

X = np.ones((len(commune_list) , len(votation_list) ) , dtype=float)

In [5]:
for x in df [['Commune','Votation','Oui en %']].fillna(50).values :
    X [commune_dict[x[0]]][votation_dict[x[1]]] = x[2]

In [6]:
X

array([[ 59.1,  21.4,  65.7, ...,  79.7,  52.5,  38.3],
       [ 35.2,  13.6,  54.7, ...,  79.5,  65.2,  16.8],
       [ 35.9,  18.7,  53.8, ...,  61.8,  74.4,  65.5],
       ..., 
       [ 10.6,  15. ,  58.8, ...,  57.1,  76.6,  17.1],
       [ 49.5,  13.4,  48.6, ...,  78.7,  81.7,  41.1],
       [ 47.6,  21. ,  40.3, ...,  90.6,  80.9,  42.2]])

In [7]:
switzerland_coord = [46.765213, 8.252444]
town_geo_path = r'data/switzerland_borders/admin_level_8.geojson'
geo_json_data = json.load(open(town_geo_path, encoding="utf8"))

In [8]:
color_list = ['#ff0000' , '#00ff00' , '#0000ff' , '#ffff00' , '#ff00ff' , '#00ffff'  , '#000000']


In [9]:
def style_function(feature):
    #print(feature['name'])
    language = languages_series.get(feature['name'], None)
    return {
        'fillOpacity': 0.5,
        'weight': 0,
        'fillColor':  color_language(language)
    }

def draw_map_kmeans (n_clusters) :
    kmeans_res = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    groups = kmeans_res.labels_
    
    centers = kmeans_res.cluster_centers_
    print( [ [ np.linalg.norm(x-y) for y in centers] for x in centers])
    
    to_map = pd.DataFrame({'Commune' : commune_list , 'Group' : groups})
    
    map1 = folium.Map(location=switzerland_coord, zoom_start=8)
    map1.choropleth(geo_data = geo_json_data, \
                                    data = to_map, \
                                    columns = ['Commune', 'Group'], \
                                    key_on = 'feature.name', \
                                    fill_color = 'RdYlGn', \
                                     fill_opacity = 0.7, \
                                    line_opacity = 0.2, \
                                    legend_name = 'group')
    
    return map1

In [10]:
for i in range (2,6) :  
    draw_map_kmeans(i).save('data/map_ml/kmeans'+str(i)+'.html')

[[0.0, 208.67401237835256], [208.67401237835256, 0.0]]
[[0.0, 196.46976613823145, 144.45625158496566], [196.46976613823145, 0.0, 247.08214376381318], [144.45625158496566, 247.08214376381318, 0.0]]
[[0.0, 207.85672978130714, 141.34595602506749, 193.87074264647023], [207.85672978130714, 0.0, 247.64281574780981, 221.59382027546036], [141.34595602506749, 247.64281574780981, 0.0, 236.36037086236314], [193.87074264647023, 221.59382027546036, 236.36037086236314, 0.0]]
[[0.0, 275.89090092058183, 237.04607787057091, 222.98561992569216, 140.6452525531449], [275.89090092058183, 0.0, 242.72212200864689, 141.06638785088913, 215.67520469036879], [237.04607787057091, 242.72212200864689, 0.0, 214.700838379114, 194.22622625708948], [222.98561992569216, 141.06638785088913, 214.700838379114, 0.0, 221.63966288565692], [140.6452525531449, 215.67520469036879, 194.22622625708948, 221.63966288565692, 0.0]]


In [11]:
groups = DBSCAN().fit(X)
groups.labels_

array([-1, -1, -1, ..., -1, -1, -1], dtype=int64)