In [7]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import json
import folium
from sklearn.cluster import KMeans, DBSCAN

import matplotlib.pyplot as plt


from sklearn.decomposition import PCA

In [8]:
# Read votation data
df = pd.read_pickle("data/data.pkl")

In [9]:
df.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich
3,Aeugst am Albis,03.03.1991 Encouragement des transports publics,835.0,321.0,38.4,312.0,128.0,184.0,41.0,Affoltern,Zürich
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich


In [10]:
commune_list = list(set(df['Commune'].values))
print(len(commune_list))
commune_dict = { val : idx for idx , val in enumerate(commune_list)   }

votation_list = list(set(df['Votation'].values))
votation_dict = { val : idx for idx , val in enumerate(votation_list)   }

X = np.ones((len(commune_list) , len(votation_list) ) , dtype=float)

2233


In [11]:
for x in df [['Commune','Votation','Oui en %']].fillna(50).values :
    X [commune_dict[x[0]]][votation_dict[x[1]]] = x[2]

In [12]:
X

array([[ 26.2,  25. ,  80.8, ...,  20.7,  60.1,  43.4],
       [ 25.8,  21.3,  65.6, ...,  26.1,  61.3,  29.8],
       [ 19.6,  24.5,  72.2, ...,  28.4,  64.4,  37.8],
       ..., 
       [ 33. ,  30.4,  77. , ...,  30.4,  64.9,  64.9],
       [ 49.8,  32.6,  78.3, ...,  43. ,  58.1,  65.9],
       [ 17.8,  35. ,  88.1, ...,  21.1,  66. ,  48.8]])

In [24]:
switzerland_coord = [46.765213, 8.252444]
town_geo_path = r'data/switzerland_borders/municipalities.geojson'
geo_json_data = json.load(open(town_geo_path, encoding="utf8"))

In [21]:
color_list = ['#ff0000' , '#00ff00' , '#0000ff' , '#ffff00' , '#ff00ff' , '#00ffff'  , '#000000']
color_list_rgb = [ 'rgb(255, 0,0)','rgb(0, 255,0)','rgb(0, 0,255)','rgb(255, 255,0)','rgb(255, 0,255)','rgb(0, 255,255)','rgb(0, 0,0)',]


In [14]:
def style_function(feature):
    #print(feature['name'])
    language = languages_series.get(feature['name'], None)
    return {
        'fillOpacity': 0.5,
        'weight': 0,
        'fillColor':  color_language(language)
    }

def draw_map_kmeans (n_clusters , X , file_PCA = None ) :
    kmeans_res = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    groups = kmeans_res.labels_
    
    plt.figure(100+n_clusters)
    
    
    model_PCA = PCA ( n_components=2)
    X_PCA = model_PCA.fit_transform(X)
    
    for current_group in range (n_clusters) :
        group_y = [X_PCA[i] for i in range(len(X_PCA)) if groups[i] == current_group]
        plt.scatter( [x[0] for x in group_y], [x[1] for x in group_y], c= color_list[current_group])
    
    if (file_PCA != None) :
        plt.savefig(file_PCA+'PCAA_kmeans'+str(n_clusters)+'.png')
    else :
        print(plt.show())
    plt.gcf().clear()
    
    centers = kmeans_res.cluster_centers_
    print( [ [ np.linalg.norm(x-y) for y in centers] for x in centers])
    
    to_map = pd.DataFrame({'Commune' : commune_list , 'Group' : groups})
    
    map1 = folium.Map(location=switzerland_coord, zoom_start=8)
    map1.choropleth(geo_data = geo_json_data, \
                                    data = to_map, \
                                    columns = ['Commune', 'Group'], \
                                    key_on = 'feature.name', \
                                    fill_color = 'RdYlGn', \
                                     fill_opacity = 0.7, \
                                    line_opacity = 0.2, \
                                    legend_name = 'group')
    
    return map1

In [None]:
for i in range (2,6) :  
    draw_map_kmeans(i,X, file_PCA='data/map_ml/').save('data/map_ml/kmeans'+str(i)+'.html')

[[0.0, 209.56305548418058], [209.56305548418058, 0.0]]
[[0.0, 246.2860159230695, 197.93960600348566], [246.2860159230695, 0.0, 144.4099056981226], [197.93960600348566, 144.4099056981226, 0.0]]


In [29]:
def draw_map_DBSCAN (X , file_PCA = None) : 
    min_samples = 20

    X_array = [ np.array(x_) for x_ in X]
    range_X = range(len(X))
    Xmeans = np.mean([ np.mean(\
                            np.sort([np.linalg.norm(X_array[x]-X_array[y]) \
                             for x in range_X  if x!=y])[:(min_samples*2-1)] \
                           )\
                   for y in range_X ] )
    groups =  DBSCAN(eps=Xmeans, min_samples=min_samples).fit(X).labels_
    
    
    model_PCA = PCA ( n_components=2)
    X_PCA = model_PCA.fit_transform(X)
    
    n_clusters = max(groups)+1
    for current_group in range (-1,n_clusters) :
        group_y = [X_PCA[i] for i in range(len(X_PCA)) if groups[i] == current_group]
        plt.scatter( [x[0] for x in group_y], [x[1] for x in group_y], c= color_list[current_group])
    
    
    if (file_PCA != None) :
        
        plt.savefig(file_PCA+'PCAA_DBSCAN.png')
    else :
        print(plt.show())
        
    plt.gcf().clear()
    
    to_map = pd.DataFrame({'Commune' : commune_list , 'Group' : groups})
    
    map1 = folium.Map(location=switzerland_coord, zoom_start=8)
    map1.choropleth(geo_data = geo_json_data, \
                                    data = to_map, \
                                    columns = ['Commune', 'Group'], \
                                    key_on = 'feature.name', \
                                    fill_color = 'RdYlGn', \
                                     fill_opacity = 0.7, \
                                    line_opacity = 0.2, \
                                    legend_name = 'group')
    
    return map1

In [28]:
draw_map_DBSCAN (X,'data/map_ml/').save('data/map_ml/DBSCAN.html')

<matplotlib.figure.Figure at 0x1516eb9dda0>

# ml for theme

In [31]:
data_t = pd.read_pickle('data/data_theme.pkl')
data_t.head()

Unnamed: 0,Votation,Thématique
0,14.06.1981 Protection des consommateurs,Economie
1,29.11.1981 Régime financier,Finances publiques
2,14.06.1981 Egalité entre hommes et femmes,Politique sociale
3,06.06.1982 Code pénal suisse,Régime politique
4,06.06.1982 Loi sur les étrangers,Politique sociale


In [32]:
df.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich
3,Aeugst am Albis,03.03.1991 Encouragement des transports publics,835.0,321.0,38.4,312.0,128.0,184.0,41.0,Affoltern,Zürich
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich


In [33]:
data_theme = df.merge(data_t , on = 'Votation' )
data_theme.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton,Thématique
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich,Politique sociale
1,Affoltern am Albis,29.11.1998 Initiative Droleg,5729.0,2286.0,39.9,2236.0,678.0,1558.0,30.3,Affoltern,Zürich,Politique sociale
2,Bonstetten,29.11.1998 Initiative Droleg,2596.0,1063.0,40.9,1045.0,358.0,687.0,34.3,Affoltern,Zürich,Politique sociale
3,Hausen am Albis,29.11.1998 Initiative Droleg,2081.0,807.0,38.8,792.0,281.0,511.0,35.5,Affoltern,Zürich,Politique sociale
4,Hedingen,29.11.1998 Initiative Droleg,1858.0,810.0,43.6,791.0,246.0,545.0,31.1,Affoltern,Zürich,Politique sociale


In [34]:
#data_bg = data by group
for data_bg in data_theme[['Thématique','Votation','Commune','Oui en %']].groupby('Thématique') :
    theme = data_bg[0]
    data_bg = pd.DataFrame(data = data_bg[1])

    votation_list_t = list(set(data_bg['Votation'].values))
    votation_dict_t = { val : idx for idx , val in enumerate(votation_list_t)   }

    Xt = np.ones((len(commune_list) , len(votation_list_t) ) , dtype=float)
    
    for x in data_bg [['Commune','Votation','Oui en %']].fillna(50).values :
        Xt [commune_dict[x[0]]][votation_dict_t[x[1]]] = x[2]
    draw_map_kmeans(2,Xt,'data/maps_theme_ml/'+theme).save('data/maps_theme_ml/kmeans_'+theme+'.html') 
    draw_map_DBSCAN (Xt,'data/maps_theme_ml/'+theme).save('data/maps_theme_ml/DBSCAN_'+theme+'.html')
    

[[0.0, 69.263024352988879], [69.263024352988879, 0.0]]
[[0.0, 58.326359018782043], [58.326359018782043, 0.0]]
[[0.0, 41.701031699501421], [41.701031699501421, 0.0]]
[[0.0, 102.44122250310804], [102.44122250310804, 0.0]]
[[0.0, 44.899674692763618], [44.899674692763618, 0.0]]
[[0.0, 120.06876641783963], [120.06876641783963, 0.0]]
[[0.0, 69.593848635856332], [69.593848635856332, 0.0]]
[[0.0, 79.606055928943789], [79.606055928943789, 0.0]]


<matplotlib.figure.Figure at 0x15102782ac8>

# ML par partie