In [5]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import json
import folium
from sklearn.cluster import KMeans, DBSCAN

In [6]:
# Read votation data
df = pd.read_pickle("data/data.pkl")

In [7]:
df.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton,Pays
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich,Suisse
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich,Suisse
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich,Suisse
3,Aeugst am Albis,03.03.1991 Encouragement des transports publics,835.0,321.0,38.4,312.0,128.0,184.0,41.0,Affoltern,Zürich,Suisse
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich,Suisse


In [8]:
commune_list = list(set(df['Commune'].values))
print(len(commune_list))
commune_dict = { val : idx for idx , val in enumerate(commune_list)   }

votation_list = list(set(df['Votation'].values))
votation_dict = { val : idx for idx , val in enumerate(votation_list)   }

X = np.ones((len(commune_list) , len(votation_list) ) , dtype=float)

2165


In [9]:
for x in df [['Commune','Votation','Oui en %']].fillna(50).values :
    X [commune_dict[x[0]]][votation_dict[x[1]]] = x[2]

In [10]:
X

array([[ 56.1,  27.5,  41.4, ...,  41.2,  34.8,  46.2],
       [ 72.3,  19. ,  50. , ...,  84.4,  47.2,  70.3],
       [ 62.7,  27.5,  42.1, ...,  46.4,  51.1,  49. ],
       ..., 
       [ 55.2,  33. ,  43.9, ...,  51.9,  59.4,  61.1],
       [ 47.7,  31.5,  52.9, ...,  12.5,  30.7,  40.4],
       [ 60.7,  32.5,  48.7, ...,  52.8,  56.9,  69.7]])

In [11]:
switzerland_coord = [46.765213, 8.252444]
town_geo_path = r'data/switzerland_borders/admin_level_8.geojson'
geo_json_data = json.load(open(town_geo_path, encoding="utf8"))

In [12]:
color_list = ['#ff0000' , '#00ff00' , '#0000ff' , '#ffff00' , '#ff00ff' , '#00ffff'  , '#000000']


In [13]:
def style_function(feature):
    #print(feature['name'])
    language = languages_series.get(feature['name'], None)
    return {
        'fillOpacity': 0.5,
        'weight': 0,
        'fillColor':  color_language(language)
    }

def draw_map_kmeans (n_clusters) :
    kmeans_res = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    groups = kmeans_res.labels_
    
    centers = kmeans_res.cluster_centers_
    print( [ [ np.linalg.norm(x-y) for y in centers] for x in centers])
    
    to_map = pd.DataFrame({'Commune' : commune_list , 'Group' : groups})
    
    map1 = folium.Map(location=switzerland_coord, zoom_start=8)
    map1.choropleth(geo_data = geo_json_data, \
                                    data = to_map, \
                                    columns = ['Commune', 'Group'], \
                                    key_on = 'feature.name', \
                                    fill_color = 'RdYlGn', \
                                     fill_opacity = 0.7, \
                                    line_opacity = 0.2, \
                                    legend_name = 'group')
    
    return map1

In [14]:
for i in range (2,6) :  
    draw_map_kmeans(i).save('data/map_ml/kmeans'+str(i)+'.html')

[[0.0, 208.67401237835259], [208.67401237835259, 0.0]]
[[0.0, 247.21580842824972, 144.46503224810522], [247.21580842824972, 0.0, 196.38733951665665], [144.46503224810522, 196.38733951665665, 0.0]]
[[0.0, 236.4110594985622, 247.68057562772188, 141.34287366739736], [236.4110594985622, 0.0, 221.59382027546039, 193.87486182163735], [247.68057562772188, 221.59382027546039, 0.0, 207.87304423530244], [141.34287366739736, 193.87486182163735, 207.87304423530244, 0.0]]
[[0.0, 275.89090092058177, 215.67520469036882, 242.72212200864692, 141.06638785088913], [275.89090092058177, 0.0, 140.64525255314481, 237.04607787057085, 222.98561992569213], [215.67520469036882, 140.64525255314481, 0.0, 194.22622625708948, 221.63966288565692], [242.72212200864692, 237.04607787057085, 194.22622625708948, 0.0, 214.70083837911403], [141.06638785088913, 222.98561992569213, 221.63966288565692, 214.70083837911403, 0.0]]


KeyboardInterrupt: 

In [None]:
def draw_map_DBSCAN (X) : 
    min_samples = 20

    X_array = [ np.array(x_) for x_ in X]
    range_X = range(len(X))
    Xmeans = np.mean([ np.mean(\
                            np.sort([np.linalg.norm(X_array[x]-X_array[y]) \
                             for x in range_X  if x!=y])[:(min_samples-1)] \
                           )\
                   for y in range_X ] )
    groups =  DBSCAN(eps=Xmeans, min_samples=min_samples).fit(X).labels_

    
  
    
    to_map = pd.DataFrame({'Commune' : commune_list , 'Group' : groups})
    
    map1 = folium.Map(location=switzerland_coord, zoom_start=8)
    map1.choropleth(geo_data = geo_json_data, \
                                    data = to_map, \
                                    columns = ['Commune', 'Group'], \
                                    key_on = 'feature.name', \
                                    fill_color = 'RdYlGn', \
                                     fill_opacity = 0.7, \
                                    line_opacity = 0.2, \
                                    legend_name = 'group')
    
    return map1

In [None]:
draw_map_DBSCAN (X).save('data/map_ml/DBSCAN.html')

In [37]:
import networkx as nx
G = nx.DiGraph()

min_max =[0,0]
max_value = -1


X_len = len(X)
for i in range (X_len) :
    #G.add_edge('g',i , capacity =10^3000)
    #G.add_edge(i,'e' , capacity = 10^3000)
    for j in range (i+1,X_len) :
        dist = 1-np.linalg.norm(X[i]/100-X[j]/100)
        G.add_edge(i,j, capacity = dist)
        G.add_edge(j,i, capacity = dist)
        if ( dist > max_value) :
            min_max = [i,j]
G

<networkx.classes.digraph.DiGraph at 0x1f98f861780>

In [38]:
cut_value, partition = nx.minimum_cut(G, min_max[0],min_max[1])
reachable, non_reachable = partition

In [39]:
cut_value

2.54028112712315

In [43]:
len(non_reachable)


162

In [18]:
G = nx.DiGraph()
G.add_edge('x','a', capacity = 3.0)
G.add_edge('x','b', capacity = 1.0)
G.add_edge('a','c', capacity = 3.0)
G.add_edge('b','c', capacity = 5.0)
G.add_edge('b','d', capacity = 4.0)
G.add_edge('d','e', capacity = 2.0)
G.add_edge('c','y', capacity = 2.0)
G.add_edge('e','y', capacity = 3.0)


cut_value, partition = nx.minimum_cut(G, 'x', 'y')
reachable, non_reachable = partition
reachable

{'a', 'c', 'x'}