This is the main notebook: it creates the maps and all the data we need for the website.

In this notebook we will not explain the results obtained or the reasons why we want them; those are discussed in the data story.
Please run the cells of this notebook in order, from top to bottom; otherwise it might not work.

In [1]:
import pandas as pd
import numpy as np
import json
import folium


from difflib import SequenceMatcher


#This 2 library are used for clustering the data
from sklearn.cluster import KMeans, DBSCAN
#This library is used to reduce a vector in such way as to retain as much data as possible
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

In [2]:
# Read the votation data from the project's pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
data = pd.read_pickle("data/votations.pkl")

In [3]:
def cap_first(s):
    """Return ``s`` with its first character upper-cased and the rest untouched.

    Unlike ``str.capitalize`` the tail of the string is not lower-cased.
    An empty string is returned unchanged (indexing ``s[0]`` would raise
    ``IndexError`` on it).
    """
    return s[:1].capitalize() + s[1:]

In [4]:
# Build a lookup table giving, for every commune, its district, canton and country.
# Only the first element of each index entry of the votation data is used. The
# kind of entry is told apart by the first character of its name:
#   "."  -> commune,  ">"  -> district,  "-"  -> canton,  anything else -> country.
data = pd.DataFrame([x for x, _ in data.index.values]).drop_duplicates()

# Rename the community column
data.columns = ["Commune"]

# Create the columns up-front so that they appear in the order District / Canton / Pays
data["District"] = np.nan
data["Canton"] = np.nan
data["Pays"] = np.nan

# Extract names of districts/cantons/countries (the marker prefix is sliced off)
data["Pays"] = data["Commune"].map(lambda x : x if x[0] != ">" and x[0] != "-" and x[0] != "." else np.nan)
data["Canton"] = data["Commune"].map(lambda x : x[2:] if x[0] == "-" else np.nan)
data["District"] = data["Commune"].map(lambda x : x[3:] if x[0] == ">" else np.nan)

# Propagate names of districts/cantons/countries downwards: every row inherits
# the last value seen above it. DataFrame.ffill() is used instead of
# fillna(method='ffill'), which is deprecated in recent pandas releases.
data = data.ffill()

# Remove lines that do not describe a community
data = data[data["Commune"].map(lambda x : x[0] == ".")]

# Clean canton names: keep only the part before " /".
# pd.isnull() is used rather than an identity test against np.nan so that any
# missing value is recognised, not only the np.nan singleton.
data["Canton"] = data["Canton"].map(lambda x : x if pd.isnull(x) else x.split(" /")[0])

# Clean district names by removing the administrative prefix. The conditions are
# evaluated in order, so the more specific patterns must stay before the generic ones:
#   1. names containing "Bezirk See" are kept untouched;
#   2. "District d'..." / "District de l'...": keep what follows the apostrophe;
#   3. "Arrondissement administratif", "District", "Canton", "Distretto di":
#      drop the first two words and capitalise the first letter of the rest;
#   4. "Verwaltungskreis", "Wahlkreis", "Kanton", "Bezirk", "Region": drop the first word;
#   5. anything else is kept as is.
data["District"] = data["District"].map(lambda x :
                                        x if \
                                            "Bezirk See" in x else \
                                        "".join(x.split("'")[1:]).strip() if \
                                            "District d'" in x or \
                                            "District de l'" in x else
                                        cap_first(" ".join(x.split(" ")[2:])).strip() if \
                                            "Arrondissement administratif" in x or \
                                            "District" in x or \
                                            "Canton" in x or \
                                            "Distretto di" in x else \
                                        " ".join(x.split(" ")[1:]).strip() if \
                                            "Verwaltungskreis" in x or \
                                            "Wahlkreis" in x or \
                                            "Kanton" in x or \
                                            "Bezirk" in x or \
                                            "Region" in x \
                                        else x)

# Use the French spelling for the two half-cantons
data["District"] = data["District"].map(lambda x : \
                                        "Obwald" if x == "Obwalden" else \
                                        "Nidwald" if x == "Nidwalden" else \
                                        x)

# Write correct district/canton/country data for foreign votes: rows whose
# district carries one of the markers below get "-", "-", "Etranger".
data[["District", "Canton", "Pays"]] = data.apply(lambda x : pd.Series(["-", "-", "Etranger"]) if \
                                        "-Ausland-" in x["District"] or \
                                        " de l'étranger" in x["District"] or \
                                        "-Korrespondenzweg" in x["District"] or \
                                        "-autres" in x["District"] or \
                                        "-voto per corrispondenza" in x["District"] \
                                      else pd.Series([x["District"], x["Canton"], x["Pays"]]), axis=1)

data = data.reset_index(drop=True)

# NOTE: the commune names themselves are deliberately not cleaned here; they must
# still match the names of the votation data when the two are merged.
# NOTE: there are no districts for Geneva, Schaffhausen, Appenzell Innerrhoden,
# Obwald and Nidwald.

In [5]:
# Attach district/canton/country to every votation result by merging the raw
# votation data (index turned back into columns) with the lookup table.
df = pd.read_pickle("data/votations.pkl").reset_index()
data = pd.merge(df, data, on="Commune")

# Now that the merge is done the commune names can be cleaned: names starting
# with "." lose their first 7 characters.
data["Commune"] = data["Commune"].map(lambda name : name[7:] if name[0] == "." else name)

data.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton,Pays
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich,Suisse
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich,Suisse
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich,Suisse
3,Aeugst am Albis,03.03.1991 Encouragement des transports publics,835.0,321.0,38.4,312.0,128.0,184.0,41.0,Affoltern,Zürich,Suisse
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich,Suisse


In [6]:
#replace ' (Urne commune)' in the names of towns

def remove_urne_commune(x):
    """Return the commune name ``x`` without the ' (Urne commune)' marker.

    The marker is removed wherever it appears. The previous implementation
    tested for the marker anywhere in the name but then cut a fixed number of
    characters from the end, which corrupted names where the marker was not
    the suffix. Names without the marker are returned unchanged.
    """
    return x.replace(' (Urne commune)', '')
    
# Strip the ' (Urne commune)' marker from every commune name
data['Commune'] = data['Commune'].map(remove_urne_commune)

In [7]:
# Communes whose rows are dropped; each of them is represented through the
# renamed partner commune listed below (presumably communes sharing a ballot
# box — TODO confirm against the geojson names).
dropped_communes = [
    'Meienried',
    'Hellsau',
    'Rüti bei Lyssach',
    'Deisswil bei Münchenbuchsee',
    'Clavaleyres',
    'Jaberg',
    'Niedermuhlern',
]
data = data[~data['Commune'].isin(dropped_communes)]

# (old value, new value) pairs, applied one after the other. As before, the
# replacement is done on the whole dataframe, not only on the 'Commune' column.
renamed_communes = [
    ('Wald (BE)', 'Wald (BE)-Niedermuhlern'),
    ('Kirchdorf (BE)', 'Kirchdorf (BE)-Jaberg'),
    ('Münchenwiler', 'Münchenwiler-Clavaleyres'),
    ('Wiggiswil', 'Wiggiswil-Deisswil bei Münchenbuchsee'),
    ('Mötschwil', 'Mötschwil-Rüti bei Lyssach'),
    ('Höchstetten', 'Höchstetten-Hellsau'),
    ('Büren an der Aare', 'Büren an der Aare-Meienried'),
]
for old_name, new_name in renamed_communes:
    data = data.replace(to_replace=old_name, value=new_name)

In [8]:
# Keep only the rows whose country is 'Suisse' (this drops the 'Etranger' rows);
# the 'Pays' column then carries no information and is removed.
data = data.loc[data['Pays'] == 'Suisse'].drop(['Pays'], axis=1)
data.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich
3,Aeugst am Albis,03.03.1991 Encouragement des transports publics,835.0,321.0,38.4,312.0,128.0,184.0,41.0,Affoltern,Zürich
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich


The following cell loads all the data we need to draw the maps.

In [9]:
# Coordinates of the centre of Switzerland, used as the initial location of the folium maps
switzerland_coord = [46.765213, 8.252444]

# Path to the geojson drawn on the maps; each of its features carries a commune name
town_geo_path = r'data/switzerland_borders/municipalities_no_urnes.geojson'
# Content of the geojson. The context manager makes sure the file handle is
# closed (json.load(open(...)) left it open until garbage collection).
with open(town_geo_path, encoding="utf8") as geo_file:
    geo_json_data = json.load(geo_file)
# List of the names of all the communes in the geojson
commune = [x['name'] for x in geo_json_data['features']]

This cell creates a matrix representing the dataframe `data`, in which each row is a commune and each column is a votation.
This matrix will be used when we need to cluster our data.
One can go from a commune to its row in the matrix, and back, by using respectively commune_dict (a dictionary from commune to index) and commune_list. The same holds for each vote with votation_dict and votation_list.

If some data is missing for the matrix, we set that vote of the commune to 50%, because we cannot leave an empty cell for the clustering; this way the commune counts as not really having an opinion about the vote. (This happens for 1306 of the 696696 entries, so it will not skew the results too much.)

In [10]:
# Row order of the matrix. unique() keeps the order of first appearance, so the
# rows are the same on every run; iterating a set of strings (as done before)
# can give a different order from one interpreter session to the next, which
# made the clustering output irreproducible.
commune_list = list(data['Commune'].unique())
commune_dict = {val: idx for idx, val in enumerate(commune_list)}

# Column order of the matrix, built the same way
votation_list = list(data['Votation'].unique())
votation_dict = {val: idx for idx, val in enumerate(votation_list)}

# Neutral value used wherever no result is available
MISSING_YES_PERCENT = 50

# One row per commune, one column per votation, holding the percentage of yes.
# The matrix starts filled with the neutral value so that a (commune, votation)
# pair that is absent from `data` is treated exactly like a missing percentage
# (it was previously left at 1, i.e. an almost unanimous "no").
X = np.full((len(commune_list), len(votation_list)), MISSING_YES_PERCENT, dtype=float)
for commune_name, votation_name, yes_percent in data[['Commune', 'Votation', 'Oui en %']].fillna(MISSING_YES_PERCENT).values:
    # the dictionaries give the row and the column of the current result
    X[commune_dict[commune_name], votation_dict[votation_name]] = yes_percent

This is a list of colours that will be used to colour the different groups in the maps where the distinction we want to show is between a few discrete values.

In [11]:
# Seven hex colours used for discrete groups on the maps and plots; they are
# looked up by group index, so at most seven groups can be drawn.
color_list = [ '#9ecae1' , '#2171b5' , '#a1d99b' , '#238b45' , '#fdae6b' , '#6a51a3' , '#bd0026' ]

## Draw map languages

This section creates a map that shows which language is spoken in each commune and how strongly, and lists the most and least approved propositions by language.

This cell loads the language spoken in each commune: for each commune we get the language spoken (French, German, Italian or Romansh) and how strongly it dominates (strong or medium), or the information that no language clearly dominates.

In [12]:
# Language table: the first row and the last 11 rows of the sheet are skipped.
# `skipfooter` is the current spelling of the keyword; the former `skip_footer`
# was deprecated and then removed from pandas.
languages = pd.read_excel('data/languages_2000.xlsx', skiprows=1, skipfooter=11)
# The region identifier is not needed; only the commune name and its language are kept
languages = languages.drop(['Regions-ID'], axis=1)
languages.columns = ['Commune' , 'Language']
languages.head()

Unnamed: 0,Commune,Language
0,Aeugst am Albis,Allemand: forte
1,Affoltern am Albis,Allemand: moyenne
2,Bonstetten,Allemand: forte
3,Hausen am Albis,Allemand: forte
4,Hedingen,Allemand: forte


In [13]:
# Add to every row of the votation data the language spoken in its commune
# (inner merge on the commune name).
data_lang = pd.merge(data, languages, on='Commune')
data_lang.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton,Language
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich,Allemand: forte
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich,Allemand: forte
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich,Allemand: forte
3,Aeugst am Albis,03.03.1991 Encouragement des transports publics,835.0,321.0,38.4,312.0,128.0,184.0,41.0,Affoltern,Zürich,Allemand: forte
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich,Allemand: forte


In [14]:
# Colour used on the map for every language category: one hue per language,
# with a lighter shade for the "moyenne" category than for "forte".
color_language = dict([
    ('Allemand: forte', 'red'),
    ('Allemand: moyenne', 'lightcoral'),
    ('Français: forte', 'blue'),
    ('Français: moyenne', 'lightskyblue'),
    ('Italien: forte', 'limegreen'),
    ('Italien: moyenne', 'darkseagreen'),
    ('Romanche: forte', 'yellow'),
    ('Romanche: moyenne', 'khaki'),
    ('Pas de dominance nette', 'grey'),
])

In [15]:
# This cell draws the map of the languages in Switzerland.

# Language category of every commune, indexed by commune name
languages_series = languages.set_index('Commune')['Language']

# Fill colour of the communes for which no language category is known
UNKNOWN_LANGUAGE_COLOR = 'white'


def style_function_language(feature):
    """Return the folium style of one geojson feature, coloured by language.

    The feature's commune name is looked up in ``languages_series`` and the
    resulting category in ``color_language``. A commune without a language is
    printed (so that it can be investigated) and drawn with
    ``UNKNOWN_LANGUAGE_COLOR``; it used to raise ``KeyError`` right after
    being printed, which aborted the whole map.
    """
    language = languages_series.get(feature['name'], None)
    if language is None:
        print(feature['name'])
    return {
        'fillOpacity': 1,
        'weight': 0,
        'fillColor': color_language.get(language, UNKNOWN_LANGUAGE_COLOR)
    }

m = folium.Map(
    location=switzerland_coord,
    tiles=None,
    zoom_start=8
)

# Layer filling every commune with the colour of its language
folium.GeoJson(
    geo_json_data,
    style_function=style_function_language
).add_to(m)

# Second layer with a transparent fill and opaque lines: it only draws the borders.
# NOTE(review): Map.choropleth is a legacy folium API — confirm it exists in the
# installed folium version.
m.choropleth(geo_data=geo_json_data,
             fill_opacity=0,
             line_opacity=1)

m.save('data/map_language.html')

For each language and intensity, we look for the 5 votations with the highest and the 5 votations with the lowest average percentage of yes.

In [16]:
# For every language category, print the 5 votations with the highest and the
# 5 with the lowest mean percentage of yes over the communes of that category.
for current_language, language_votes in data_lang.groupby('Language'):
    # Only the yes-percentage is averaged. Selecting the column before calling
    # mean() keeps the text columns (commune, district, ...) out of the
    # aggregation, which recent pandas versions refuse to average.
    databl_mean = language_votes.groupby('Votation', as_index=False)['Oui en %'].mean()
    databl_votation = databl_mean.sort_values(by='Oui en %', ascending=False)
    print(current_language + ' max : ')
    print(databl_votation.head(5))

    print(current_language + ' min : ')
    print(databl_votation.tail(5))

Allemand: forte max : 
                                              Votation   Oui en %
107  10.03.1996 Transfert de la commune de Vellerat...  89.625902
19                    02.12.2001 Frein à l'endettement  86.600879
178                  18.05.2014 Soins médicaux de base  85.846480
65           07.02.1999 Médecine de la transplantation  84.996163
97                          09.06.2013 Loi sur l'asile  84.108960
Allemand: forte min : 
                                              Votation   Oui en %
32                          04.03.2001 Oui à l'Europe!  14.187850
276    28.09.1986 En faveur de la culture (initiative)  12.616103
124                   12.03.2000 Initiative des quotas  12.483773
18               02.12.1984 Protection de la maternité  10.493237
82   08.03.2015 Initiative «Remplacer la taxe sur l...   6.516320
Allemand: moyenne max : 
                                              Votation   Oui en %
107  10.03.1996 Transfert de la commune de Vellerat...  90.740373
178  

### Map by theme

This part creates, for each theme, a map that shows the percentage of yes given to the votations of that theme.

In [17]:
# Table giving, for each period, a number per theme (one column per theme).
# The first two lines of the file are skipped and its last row is dropped.
themes = pd.read_csv("data/px-x-1703010000_103.csv", sep=";", encoding="cp1254", skiprows=2).iloc[:-1]

# Rows whose period contains "bis" are discarded
without_bis = ~themes['Période'].str.contains("bis")
themes = themes.loc[without_bis].copy()

# The second space-separated token of the period becomes the index
themes["Période"] = themes["Période"].map(lambda period : period.split(" ")[1])
themes = themes.set_index("Période")
themes.head()

Unnamed: 0_level_0,Régime politique,Politique étrangère,Politique de sécurité,Economie,Finances publiques,"Infrastructure, aménagement, environnement",Politique sociale,"Enseignement, culture et médias"
Période,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
2016,1.0,0.0,0.0,1.0,0.0,5.0,6.0,0.0
2015,0.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0
2014,1.0,1.0,1.0,1.0,2.0,1.0,5.0,0.0
2013,1.0,0.0,1.0,1.0,0.0,2.0,6.0,0.0


In [18]:
#We extract the subjects of the votations from the index of the data and make a dataframe with the subjects and the year
df = pd.read_pickle("data/votations.pkl")
votations = pd.DataFrame(df.index.levels[1])
#Characters 6 to 9 of a votation name hold the year (the names shown above start with "dd.mm.yyyy")
votations["Année"] = votations["Votation"].apply(lambda s : s[6:10])
#NOTE(review): the order of the votations inside one year is whatever this sort produces, and the
#hand-made indices read below are consumed in exactly that order. The default sort algorithm of
#sort_values is not guaranteed to be stable, so confirm the resulting themes before changing this
#line (e.g. before adding a `kind=` argument) or upgrading pandas.
votations = votations.sort_values("Année")
votations = votations.reset_index(drop=True)

#Names of the themes (columns of the themes table) and the list that will receive one theme per votation
header = list(themes.columns)
values = []

#Now let's use our indexation of themes to append a theme to each votation. We use a hand-made indexation of the
#themes since no mapping between subjects and themes exists online. We used the available listing of voted themes
#for each year and manually reattributed the themes to the subjects.
#The file is read one line per year, in the order in which the years appear in `votations`; each line is a
#comma-separated list of 1-based positions.
with open("data/theme_indices.txt", "r") as file:
    for year in votations["Année"].unique():
        indices = file.readline().replace(" ", "").split(",")
        temp = []
        
        #We get the themes for the current year: each theme is repeated as many times as the themes table says
        for i in range(len(header)):
            for j in range(int(themes.loc[year, header[i]])):
                temp.append(header[i])

        #We reorder them using our indexation and add them to the list
        values += list(map(lambda x : temp[int(x) - 1], indices))
        
#We add the list to the dataframe as a column (this fails if the number of themes differs from the number of votations)
votations["Thématique"] = values
votations = votations.drop("Année", axis=1)
votations.head()

Unnamed: 0,Votation,Thématique
0,14.06.1981 Protection des consommateurs,Economie
1,29.11.1981 Régime financier,Finances publiques
2,14.06.1981 Egalité entre hommes et femmes,Politique sociale
3,06.06.1982 Code pénal suisse,Régime politique
4,06.06.1982 Loi sur les étrangers,Politique sociale


We merge the 2 dataframes so that we have, for each votation and each commune, the theme and the percentage of yes. We only keep 'Thématique', 'Commune' and 'Oui en %' because they are the only useful information for later ('Votation' is no longer useful once the merge is done).

We also make sure that the dataframe only contains communes that are in the geojson, so that the folium functions do not crash.

In [19]:
# Theme of every votation, merged into the votation results
thematique = votations.copy()
data_theme = pd.merge(data, thematique, on='Votation')

# Keep the three useful columns, and only the communes that exist in the geojson
in_geojson = data_theme['Commune'].isin(commune)
data_t = data_theme.loc[in_geojson, ['Thématique', 'Commune', 'Oui en %']]
data_t.head()

Unnamed: 0,Thématique,Commune,Oui en %
0,Politique sociale,Aeugst am Albis,34.9
1,Politique sociale,Affoltern am Albis,30.3
2,Politique sociale,Bonstetten,34.3
3,Politique sociale,Hausen am Albis,35.5
4,Politique sociale,Hedingen,31.1


We group the data by theme, and for each one we create a map showing how much people voted yes. We then save it into an html.

In [20]:

# One choropleth map per theme, showing for every commune the mean percentage
# of yes over all the votations of that theme.
for theme, curr_data_theme in data_t.groupby('Thématique') :
    # Only 'Oui en %' is averaged: the group still contains the text column
    # 'Thématique', which recent pandas versions refuse to average.
    curr_data_theme = curr_data_theme.groupby('Commune', as_index  = False)['Oui en %'].mean()
    map1 = folium.Map(location=switzerland_coord, zoom_start=8)
    # NOTE(review): Map.choropleth is a legacy folium API — confirm it exists in
    # the installed folium version.
    map1.choropleth(geo_data = geo_json_data, \
                                    data = curr_data_theme, \
                                    columns = ['Commune', 'Oui en %'], \
                                    key_on = 'feature.name', \
                                    fill_color = 'YlGnBu', \
                                    fill_opacity = 0.7, \
                                    line_opacity = 0.2, \
                                    legend_name = 'yes in % given to the theme ' + theme)

    map1.save('data/map_theme/map_'+theme+'.html')

### Map by recommendation

This part uses the voting recommendations of each political party to create a visual representation of how much each party is listened to, and tries to see which regions vote more in line with each party.

This cell loads the dataframe with the percentage of yes and adds a column holding only the date of each votation.

In [21]:
#The dataframe with the votation data
df = data.copy()
df['Date'] = df['Votation'].map(lambda x : x.split(' ')[0])
#df['Votation'] = df['Votation'].map(lambda x : ' '.join(x.split(' ')[1:]))
df.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton,Date
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich,29.11.1998
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich,14.06.2015
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich,25.09.2016
3,Aeugst am Albis,03.03.1991 Encouragement des transports publics,835.0,321.0,38.4,312.0,128.0,184.0,41.0,Affoltern,Zürich,03.03.1991
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich,12.02.2017


In [22]:
#open the workbook holding the recommendations of the parties (one sheet is parsed per year below)
xls = pd.ExcelFile('data/Recommandations des partis.xls')
#the dataframe to fill with the information of the recommendations
#NOTE(review): it is grown with an outer join at every iteration; the column names created by the join are not used
recommend = pd.DataFrame()

#for every sheet, from 2017 (sheet 0) back to 1981 (the year of sheet i is taken to be 2017 - i) :
for i in range ((2017 - 1981 + 1)) :
    #get the sheet i
    x = xls.parse(i)
    #change the name of the columns so that they will be easier to use 
    x.columns = range(len(x.columns))
    
    #We will need the parties and the number of the votation; for that we use the fact that the line that holds the dates
    #always begins with 'Parti 1)' and that the numbers of the votations are 2 lines below
    #(this raises IndexError if a sheet has no such line)
    base_nb = x[x.iloc[:, 0] == 'Parti 1)'].index[0]
    
    #We drop all the columns with only NaN
    x = x.dropna(axis=1, how='all')

    #concatenate the dates, the numbers of the votations and the voting recommendations (the lines without any NaN),
    #then transpose so that each votation becomes a line
    recommend_inter = pd.concat([x.iloc[base_nb:base_nb + 1], x.iloc[base_nb + 2:base_nb + 3], x.dropna()]).transpose()
    
    #this part keeps only the first word of the name of each party, so that small changes in the name
    #(like an added footnote mark "3)") still give a consistent name
    recommend_inter.iloc[0, 2:] = recommend_inter.iloc[0, 2:].map(lambda x : x.split(' ')[0])    
    
    #change the name of the columns so that they will be easier to use
    recommend_inter.columns = range(len(recommend_inter.columns))
    
    #append the year of the sheet to the date
    recommend_inter.iloc[:, 0] = recommend_inter.iloc[:, 0] + str(2017 - i)
    #make sure that 'no ###' is turned into 'No ###'
    recommend_inter.iloc[:, 1] = recommend_inter.iloc[:, 1].map(lambda x : x[0].upper() + x[1:], na_action='ignore')
    
    #get the lines that give the name of a votation knowing its number
    #NOTE(review): `propositions` is a slice of `x`; the assignments below are what triggers the
    #SettingWithCopyWarning shown in the output of this cell
    propositions = x[x.iloc[:, 0].str.contains('No ').fillna(False)]
    #change the name of the columns so that they will be easier to use
    propositions.columns = range(len(propositions.columns))
    
    #The next part transforms the number of a votation into the name of the votation
    
    #in some sheets this information is in a single cell, so we split it into 2 cells like in the other sheets
    #(the sheet of 1997 is split on spaces, the others on ':'); column 1 must be filled before column 0 is overwritten
    if (propositions.iloc[:, 1].isnull()).all():
        if (2017 - i == 1997):
            propositions.iloc[:, 1] = propositions[0].map(lambda x : x.split(' ')[1])
            propositions.iloc[:, 0] = propositions[0].map(lambda x : x.split(' ')[0].rstrip())
        else:
            propositions.iloc[:, 1] = propositions[0].map(lambda x : x.split(':')[1][1:])
            propositions.iloc[:, 0] = propositions[0].map(lambda x : x.split(':')[0].rstrip())
    
    
    #dictionary built from the columns left once those containing NaN are dropped, keyed by the votation number
    #NOTE(review): the axis is passed positionally to dropna, which recent pandas versions no longer accept
    dico_no_propos = propositions.dropna(1).set_index(0).to_dict()
    
    #some dictionaries are nested inside another one: {1: {true_dictionary}}
    if (dico_no_propos.get(1) == None) :
        recommend_inter.iloc[:, 1] = recommend_inter.iloc[:, 1].map(dico_no_propos)
    else :
        recommend_inter.iloc[:, 1] = recommend_inter.iloc[:, 1].map(dico_no_propos[1])
        
    #rename the columns meaningfully: date, votation, then one column per party (names taken from the first line)
    recommend_columns = ['Date', 'Votation'] + list(recommend_inter.iloc[0][2:])
    recommend_inter.columns = recommend_columns
    
    #fill the date for the lines that do not have it
    #NOTE(review): fillna(method='ffill') is deprecated in recent pandas in favour of ffill()
    recommend_inter['Date'] = recommend_inter['Date'].fillna(method='ffill')
    #transpose so that we can join on the names of the parties
    recommend_inter = recommend_inter.transpose()
    
    #delete duplicates (in some sheets PST appears twice): the first non-null value of each label is kept
    recommend_inter = recommend_inter.groupby(recommend_inter.index).first()
    #join the 2 dataframes; the names of the columns are not important
    recommend = recommend.join(recommend_inter, how='outer', lsuffix='l', rsuffix='r')

#back to one line per votation
recommend = recommend.transpose()  

#clean the dataframe: drop the header lines (their date contains 'Parti 1') and the lines without a votation name
recommend = recommend[~recommend.loc[:, 'Date'].str.contains('Parti 1').fillna(False)]
recommend = recommend[~recommend['Votation'].isnull()]
#create a meaningful index
recommend.index = range(len(recommend.index))

# French month names (lower case) and their two-digit month number, used to
# transform the dates into the same format as the main dataframe.
month_to_int = {
    'janvier' : '01',
    'février' : '02',
    'févirer' : '02',  # misspelling of 'février', kept so that such dates are still parsed
    'mars' : '03',
    'avril' : '04',
    'mai' : '05',
    'juin' : '06',
    'juillet' : '07',
    'aout' : '08',
    'août' : '08',  # accented spelling, previously missing (it raised KeyError)
    'septembre' : '09',
    'octobre' : '10',
    'novembre' : '11',
    'décembre' : '12'
}

def good_format_month (x) :
    """Return the date ``x`` in the ``dd.mm.yyyy`` format.

    Two input shapes are handled:

    * ``'d.m.yyyy...'`` -- dot-separated; day and month are zero-padded and
      only the first four characters of the third part are kept;
    * ``'d month yyyy'`` -- space-separated with a French month name, looked up
      in ``month_to_int`` regardless of case; the year is the last four
      characters of ``x``.

    Raises ``KeyError`` when the month name is not in ``month_to_int``.
    """
    if '.' in x :
        split_x = x.split('.')
        return split_x[0].zfill(2) + '.' + split_x[1].zfill(2) + '.' + split_x[2][:4]
    else :
        words = x.split(' ')
        return str(words[0]).zfill(2) + '.' + month_to_int[str(words[1]).lower()] + '.' + x[-4:]

# Put every date in the 'dd.mm.yyyy' format
recommend.loc[:, 'Date'] = recommend.loc[:, 'Date'].map(good_format_month)

# Select the columns through their de-duplicated labels
unique_labels = recommend.columns.drop_duplicates()
recommend = recommend.loc[:, unique_labels]

# parties is the list of all the parties: every column label except the
# date, the votation name and the leftover 'Parti' column
parties = list(recommend.columns.drop_duplicates())
for not_a_party in ('Date', 'Votation', 'Parti'):
    parties.remove(not_a_party)

# Date and votation first, then one column per party
recommend = recommend.loc[:, ['Date', 'Votation'] + parties]
recommend.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,Date,Votation,AdI,DS,Lega,MCR,PBD,PCS,PDC,PES,...,PLS,POCH,PRD,PS,PSL,PST,PVL,Rep.,UDC,UDF
0,12.02.2017,Naturalisation facilitée des étrangers de la t...,,,-,non,oui,,oui,oui,...,,,,oui,,,oui,,non,
1,12.02.2017,Fonds pour les routes nationales et pour le tr...,,,-,oui,oui,,oui,non,...,,,,non,,,oui,,oui,
2,12.02.2017,Réforme de l'imposition des entreprises III,,,-,oui,oui,,oui,non,...,,,,non,,,oui,,oui,
3,21.05.2017,Loi sur l'énergie,,,-,non,oui,,oui,oui,...,,,,oui,,,oui,,non,
4,28.02.2016,Initiative sur la réforme de l'imposition du c...,,,,,non,,oui,non,...,,,,non,,,non,,oui,


Change the value so that they are all uniform       
a yes recommandation becomes 1     
a no recommandation becomes -1   
No data or no recommendation is 0

In [23]:
def translate_choice(x) :
    """Encode one recommendation as a number.

    Returns 1 for a yes (the value 1 or the text 'oui'), -1 for a no (the
    value 2 or the text 'non') and 0 for anything else.
    """
    is_yes = (x == 1 or str(x) == 'oui')
    if is_yes :
        return 1
    is_no = (x == 2 or str(x) == 'non')
    return -1 if is_no else 0

# Copy of the recommendations in which every party column is encoded as 1 / -1 / 0
good_recommend = recommend.copy()
party_choices = good_recommend.loc[:, parties]
good_recommend.loc[:, parties] = party_choices.applymap(translate_choice)
good_recommend.head()

Unnamed: 0,Date,Votation,AdI,DS,Lega,MCR,PBD,PCS,PDC,PES,...,PLS,POCH,PRD,PS,PSL,PST,PVL,Rep.,UDC,UDF
0,12.02.2017,Naturalisation facilitée des étrangers de la t...,0,0,0,-1,1,0,1,1,...,0,0,0,1,0,0,1,0,-1,0
1,12.02.2017,Fonds pour les routes nationales et pour le tr...,0,0,0,1,1,0,1,-1,...,0,0,0,-1,0,0,1,0,1,0
2,12.02.2017,Réforme de l'imposition des entreprises III,0,0,0,1,1,0,1,-1,...,0,0,0,-1,0,0,1,0,1,0
3,21.05.2017,Loi sur l'énergie,0,0,0,-1,1,0,1,1,...,0,0,0,1,0,0,1,0,-1,0
4,28.02.2016,Initiative sur la réforme de l'imposition du c...,0,0,0,0,-1,0,1,-1,...,0,0,0,-1,0,0,-1,0,1,0


We need to create a dictionary linking the names of the votations in the recommendation dataframe to those in the vote dataframe.
The problem is that the names are very different. To link them, we group the votations of both dataframes by date and, for each date, we compare all pairs using SequenceMatcher: we find the best match, link this pair together, remove the 2 elements from their respective lists, and look for a new maximum until one of the lists is empty.

This method might create some imprecision, but as a whole it is solid.

In [24]:
# Two-way dictionary: name in the votation data <-> title in the recommendations.
# NOTE(review): the recommendation titles are used as keys without their date,
# so a title appearing on several dates keeps only its last match — confirm
# that the titles are unique.
dico_recom_vote = {}

# Titles of the recommendations, grouped by date
recom_titles_by_date = {
    recom_date: list(recom_group['Votation'])
    for recom_date, recom_group in good_recommend.groupby('Date')
}

# The votations of a single commune are enough to list every votation of each date
for vote_date, vote_group in df[df['Commune'] == 'Aeugst am Albis'].groupby('Date'):
    if vote_date not in recom_titles_by_date:
        continue

    vote_names = list(vote_group['Votation'])
    recom_titles = list(recom_titles_by_date[vote_date])

    # Greedy pairing: link the most similar pair, remove both members from
    # their lists and start again until one of the lists is empty.
    while vote_names and recom_titles:
        candidate_pairs = [(vote_name, recom_title)
                           for vote_name in vote_names
                           for recom_title in recom_titles]
        # max() returns the first pair reaching the best ratio
        best_vote, best_recom = max(
            candidate_pairs,
            key=lambda pair: SequenceMatcher(None, pair[0], pair[1]).ratio())

        dico_recom_vote[best_vote] = best_recom
        dico_recom_vote[best_recom] = best_vote
        vote_names.remove(best_vote)
        recom_titles.remove(best_recom)

Create a recommandation dataframe with names corresponding to the ones in the main dataframe

In [25]:
# Same recommendations, with every title replaced by the matching name of the
# votation data (titles without a match become NaN).
recommend = good_recommend.assign(Votation=good_recommend['Votation'].map(dico_recom_vote))
recommend.head()

Unnamed: 0,Date,Votation,AdI,DS,Lega,MCR,PBD,PCS,PDC,PES,...,PLS,POCH,PRD,PS,PSL,PST,PVL,Rep.,UDC,UDF
0,12.02.2017,12.02.2017 Naturalisation facilitée des étrang...,0,0,0,-1,1,0,1,1,...,0,0,0,1,0,0,1,0,-1,0
1,12.02.2017,12.02.2017 Fonds pour les routes nationales et...,0,0,0,1,1,0,1,-1,...,0,0,0,-1,0,0,1,0,1,0
2,12.02.2017,12.02.2017 Réforme de l'imposition des entrepr...,0,0,0,1,1,0,1,-1,...,0,0,0,-1,0,0,1,0,1,0
3,21.05.2017,21.05.2017 Loi sur l'énergie,0,0,0,-1,1,0,1,1,...,0,0,0,1,0,0,1,0,-1,0
4,28.02.2016,28.02.2016 Initiative sur la réforme de l'impo...,0,0,0,0,-1,0,1,-1,...,0,0,0,-1,0,0,-1,0,1,0


In [26]:

# Party columns: every column label except the date and the votation name
parties = list(recommend.columns.drop_duplicates())
for not_a_party in ('Date', 'Votation'):
    parties.remove(not_a_party)

# Add the recommendation of every party to each row of the votation data
party_recommendations = recommend.loc[:, ['Votation'] + parties]
data_recommend = pd.merge(data, party_recommendations, on='Votation')
data_recommend.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,...,PLS,POCH,PRD,PS,PSL,PST,PVL,Rep.,UDC,UDF
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,...,-1,0,-1,1,-1,1,0,0,-1,-1
1,Affoltern am Albis,29.11.1998 Initiative Droleg,5729.0,2286.0,39.9,2236.0,678.0,1558.0,30.3,Affoltern,...,-1,0,-1,1,-1,1,0,0,-1,-1
2,Bonstetten,29.11.1998 Initiative Droleg,2596.0,1063.0,40.9,1045.0,358.0,687.0,34.3,Affoltern,...,-1,0,-1,1,-1,1,0,0,-1,-1
3,Hausen am Albis,29.11.1998 Initiative Droleg,2081.0,807.0,38.8,792.0,281.0,511.0,35.5,Affoltern,...,-1,0,-1,1,-1,1,0,0,-1,-1
4,Hedingen,29.11.1998 Initiative Droleg,1858.0,810.0,43.6,791.0,246.0,545.0,31.1,Affoltern,...,-1,0,-1,1,-1,1,0,0,-1,-1


For each party, create a map of the percentage of people agreeing with that party.

People agreeing are the people who vote the same as the party when it recommends yes or no. We do not take into consideration other positions of the party (like abstention), nor the votes for which we have no information about the recommendation of the party.

In [27]:
# One choropleth map per party, showing for every commune the mean share of
# voters who voted like the party recommended.
for parti in parties :
    current_to_map = data_recommend.loc[:, ['Commune', 'Oui en %', parti]]
    # Ignore the votations without a yes/no recommendation of the party (encoded
    # as 0). The copy avoids assigning the new column onto a slice of data_recommend.
    current_to_map = current_to_map[current_to_map[parti] != 0].copy()
    # Agreement is the share of yes when the party recommended yes (1), and the
    # share of no otherwise; computed column-wise instead of row by row.
    current_to_map['Agreement'] = np.where(current_to_map[parti] == 1,
                                           current_to_map['Oui en %'],
                                           100 - current_to_map['Oui en %'])

    # Only the agreement is averaged; the other columns are not needed by the map
    current_to_map = current_to_map.groupby('Commune', as_index=False)['Agreement'].mean()

    map1 = folium.Map(location=switzerland_coord, zoom_start=8)
    # NOTE(review): Map.choropleth is a legacy folium API — confirm it exists in
    # the installed folium version.
    map1.choropleth(geo_data = geo_json_data, \
                    data = current_to_map, \
                    columns = ['Commune', 'Agreement'], \
                    key_on = 'feature.name', \
                    fill_color = 'YlGnBu', \
                    fill_opacity = 0.7, \
                    line_opacity = 0.2, \
                    legend_name = 'Agreement in % with ' + parti)

    map1.save('data/maps_partis/map_' + parti + '.html')
    

## Clustering


### kmeans

draw_map_kmeans is a function that clusters all the communes of Switzerland into n_clusters groups, using the k-means method on the given matrix X. It returns a map that shows each commune in a colour depending on its group. It also reduces the matrix X to 2 dimensions using PCA to create a scatter plot in which each point is one commune. If file_PCA is given, the plot is saved there as a png; otherwise it is just displayed.

In [28]:
def draw_map_kmeans (n_clusters , X , file_PCA = None ) :
    """Cluster the communes with k-means and return a folium map coloured by cluster.

    Besides the map, a 2-component PCA projection of X is drawn as a scatter
    plot (one point per row of X, coloured by cluster).

    Relies on notebook globals defined in earlier cells: commune_list,
    color_list, switzerland_coord and geo_json_data.

    Parameters
    ----------
    n_clusters : int
        Number of k-means clusters; color_list must hold at least that many colours.
    X : array-like with one row per commune
        Feature matrix. Row i is paired with commune_list[i].
    file_PCA : str, optional
        Path prefix for the PCA plot, which is saved as
        file_PCA + 'PCAA_kmeans<n_clusters>.png'. When None, the plot is
        shown instead of being saved.

    Returns
    -------
    folium.Map
        Map on which every commune is filled with the colour of its cluster.
    """
    #cluster using kmeans (fixed random_state so that repeated runs give the same groups)
    groups = KMeans(n_clusters=n_clusters, random_state=0).fit(X).labels_
    commune_to_group = pd.Series(groups, index=commune_list)

    #reduce the matrix to 2 components using PCA and plot one scatter series per cluster
    X_PCA = PCA(n_components=2).fit_transform(X)
    fig = plt.figure(100+n_clusters)
    for current_group in range(n_clusters) :
        members = X_PCA[groups == current_group]
        plt.scatter(members[:, 0], members[:, 1], c=color_list[current_group])

    if file_PCA is not None :
        plt.savefig(file_PCA+'PCAA_kmeans'+str(n_clusters)+'.png')
    else :
        plt.show()
    # Close the figure instead of only clearing it, so that no empty figure stays open
    plt.close(fig)

    # Colour used for map features that have no cluster (name absent from commune_list)
    missing_color = '#808080'

    #create a function for how to color the map
    def style_function_kmeans(feature):
        group = commune_to_group.get(feature['name'], None)
        if group is None :
            # Report the unmatched feature and fall back to a neutral colour;
            # indexing color_list with None would raise a TypeError.
            print(feature['name'])
            fill_color = missing_color
        else :
            fill_color = color_list[group]
        return {
            'fillOpacity': 1,
            'weight': 0,
            'fillColor': fill_color
        }

    #create the map
    m = folium.Map(location=switzerland_coord, zoom_start=8)

    folium.GeoJson(
        geo_json_data,
        style_function=style_function_kmeans
    ).add_to(m)

    # Second layer without fill: only draws the outlines on top of the coloured communes
    m.choropleth(geo_data=geo_json_data,
             fill_opacity=0,
             line_opacity=1)

    return m

In [29]:
# Build the k-means map and its PCA plot with draw_map_kmeans for 2, 3, 4 and 5 groups;
# both the PNG plots and the HTML maps are written to data/map_ml
for i in range (2,6) :
    kmeans_map = draw_map_kmeans(i, X, file_PCA='data/map_ml/')
    kmeans_map.save('data/map_ml/kmeans'+str(i)+'.html')

<matplotlib.figure.Figure at 0x1d00a176c88>

<matplotlib.figure.Figure at 0x1d05ee8b470>

<matplotlib.figure.Figure at 0x1d00d2b6630>

<matplotlib.figure.Figure at 0x1d0090ad518>

### DBSCAN

`draw_map_DBSCAN` clusters all the communes of Switzerland with the DBSCAN method, using the given matrix `X`. It returns a map that shows every commune in a colour that depends on its group. It also reduces the matrix `X` to 2 components using PCA in order to draw a scatter plot of the clustering, in which each point is one commune. If `file_PCA` is given, the plot is saved there as a PNG; otherwise the plot is simply displayed.

In [30]:
def find_epsilon(X) :
    """Plot the sorted k-distance curve of X, as a visual aid for choosing the DBSCAN eps.

    For every row of X, the Euclidean distances to all the other rows are sorted
    and the one at 0-based position k = min_samples / 2 is kept, where
    min_samples = 2 * len(X[0]). The kept distances are then sorted and plotted
    against their rank.

    Parameters
    ----------
    X : array-like with one row per point
        Feature matrix; all rows must have the same length.

    Returns
    -------
    None
        The function only displays a plot.

    Raises
    ------
    ValueError
        If X has too few rows for the k-th neighbour distance to exist.
    """
    points = np.asarray(X, dtype=float)
    n_points = len(points)

    min_samples = 2*len(X[0])
    k = min_samples // 2
    # Each point has n_points - 1 other points, so position k must be below that
    if k >= n_points - 1 :
        raise ValueError('find_epsilon needs more than %d rows, got %d' % (k + 1, n_points))

    k_distances = np.empty(n_points)
    for row in range(n_points) :
        # Distances from this point to every row at once, then drop the distance to itself
        distances = np.delete(np.linalg.norm(points - points[row], axis=1), row)
        k_distances[row] = np.sort(distances)[k]
    k_distances.sort()

    # A single scatter call for the whole curve instead of one call per point
    plt.scatter(np.arange(n_points), k_distances)
    plt.show()

    plt.close()

In [31]:
def draw_map_DBSCAN (X , file_PCA = None) :
    """Cluster the communes with DBSCAN and return a folium map coloured by cluster.

    eps is derived from the data: for every point the mean distance to its
    (2 * min_samples - 1) nearest other points is computed, and eps is the mean
    of these values over all points. min_samples is fixed to 20.

    Besides the map, a 2-component PCA projection of X is drawn as a scatter
    plot (one point per row of X, coloured by cluster).

    Relies on notebook globals defined in earlier cells: commune_list,
    color_list, switzerland_coord and geo_json_data.

    Parameters
    ----------
    X : array-like with one row per commune
        Feature matrix. Row i is paired with commune_list[i].
    file_PCA : str, optional
        Path prefix for the PCA plot, which is saved as
        file_PCA + 'PCAA_DBSCAN.png'. When None, the plot is shown instead
        of being saved.

    Returns
    -------
    folium.Map
        Map on which every commune is filled with the colour of its cluster.
    """
    #cluster using DBSCAN
    min_samples =  20
    n_neighbours = min_samples*2-1

    points = np.asarray(X, dtype=float)
    mean_neighbour_distances = []
    for row in range(len(points)) :
        # Distances from this point to every row at once, then drop the distance to itself
        distances = np.delete(np.linalg.norm(points - points[row], axis=1), row)
        mean_neighbour_distances.append(np.mean(np.sort(distances)[:n_neighbours]))
    Xmeans = np.mean(mean_neighbour_distances)

    groups =  DBSCAN(eps=Xmeans, min_samples=min_samples).fit(X).labels_

    #reduce the matrix to 2 components using PCA and plot one scatter series per group
    X_PCA = PCA(n_components=2).fit_transform(X)

    n_clusters = max(groups)+1
    fig = plt.figure()
    # Label -1 is the DBSCAN noise group; color_list[-1] gives it the last colour of the list,
    # which is shared with a real cluster if there are as many clusters as colours.
    for current_group in range (-1,n_clusters) :
        members = X_PCA[groups == current_group]
        plt.scatter(members[:, 0], members[:, 1], c=color_list[current_group])

    if file_PCA is not None :
        plt.savefig(file_PCA+'PCAA_DBSCAN.png')
    else :
        plt.show()
    # Close the figure instead of only clearing it, so that no empty figure stays open
    plt.close(fig)

    commune_to_group = pd.Series(groups, index=commune_list)

    # Colour used for map features that have no group (name absent from commune_list)
    missing_color = '#808080'

    #create a function for how to color the map
    def style_function_DBSCAN(feature):
        group = commune_to_group.get(feature['name'], None)
        if group is None :
            # Report the unmatched feature and fall back to a neutral colour;
            # indexing color_list with None would raise a TypeError.
            print(feature['name'])
            fill_color = missing_color
        else :
            fill_color = color_list[group]
        return {
            'fillOpacity': 1,
            'weight': 0,
            'fillColor': fill_color
        }

    #create the map
    m = folium.Map(location=switzerland_coord, zoom_start=8)

    folium.GeoJson(
        geo_json_data,
        style_function=style_function_DBSCAN
    ).add_to(m)

    # Second layer without fill: only draws the outlines on top of the coloured communes
    m.choropleth(geo_data=geo_json_data,
             fill_opacity=0,
             line_opacity=1)

    return m

In [32]:

# DBSCAN map over all the votations; the PCA plot and the HTML map both go to data/map_ml
dbscan_map = draw_map_DBSCAN(X, 'data/map_ml/')
dbscan_map.save('data/map_ml/DBSCAN.html')

<matplotlib.figure.Figure at 0x1d00a32b7f0>

### clustering for theme

In this cell I cluster the data again, but using only the votes of one theme at a time, and I do this for every theme.
The clustering methods used are DBSCAN and k-means with 2 clusters.

In [33]:
# One pass per theme: data_bg holds the rows of data_theme that belong to the current theme
theme_columns = ['Thématique','Votation','Commune','Oui en %']
for theme, data_bg in data_theme[theme_columns].groupby('Thématique') :
    # Column index of every votation of this theme
    votation_list_t = list(set(data_bg['Votation'].values))
    votation_dict_t = dict(zip(votation_list_t, range(len(votation_list_t))))

    # One row per commune, one column per votation; cells without data keep the initial 1.0
    # NOTE(review): missing results are filled with 50 below while absent rows stay at 1.0 - confirm this is intended
    Xt = np.full((len(commune_list), len(votation_list_t)), 1.0)

    filled = data_bg[['Commune','Votation','Oui en %']].fillna(50)
    for commune, votation, yes_share in filled.values :
        Xt[commune_dict[commune]][votation_dict_t[votation]] = yes_share

    kmeans_map = draw_map_kmeans(2, Xt, 'data/maps_theme_ml/'+theme)
    kmeans_map.save('data/maps_theme_ml/kmeans_'+theme+'.html')
    dbscan_map = draw_map_DBSCAN(Xt, 'data/maps_theme_ml/'+theme)
    dbscan_map.save('data/maps_theme_ml/DBSCAN_'+theme+'.html')

<matplotlib.figure.Figure at 0x1d00a0ffe48>

### Cluster by recommendation

In this cell I cluster the data again, but this time I use the percentage of agreement with each party instead of the percentage of yes votes.
The clustering methods used are DBSCAN and k-means with 2 clusters.

In [34]:
# One clustering per party, based on how much each commune agrees with that party's recommendations
for parti in parties :
    curr_recommend = data_recommend.loc[:, ['Commune', 'Votation' , 'Oui en %', parti]]
    # Rows where the party column is 0 are dropped (no usable yes/no recommendation).
    # Explicit copy so that adding a column below does not act on a view of data_recommend.
    curr_recommend = curr_recommend[curr_recommend[parti] != 0].copy()

    # Agreement is the yes share when the party recommended yes (1), the no share otherwise
    curr_recommend['Agreement'] = np.where(curr_recommend[parti] == 1,
                                           curr_recommend['Oui en %'],
                                           100 - curr_recommend['Oui en %'])

    # Sorted so that the column order of Xt is the same on every run
    votation_list_t = sorted(set(curr_recommend['Votation'].values))
    votation_dict_t = { val : idx for idx , val in enumerate(votation_list_t)   }

    # The 2-component PCA inside the draw functions needs at least 2 votations
    if len(votation_list_t) < 2 :
        continue

    # One row per commune, one column per votation; cells without data keep the initial 1.0
    # NOTE(review): missing results are filled with 50 below while absent rows stay at 1.0 - confirm this is intended
    Xt = np.ones((len(commune_list) , len(votation_list_t) ) , dtype=float)

    # Fill the matrix with the agreement: the previous version computed 'Agreement'
    # but then filled the matrix with the raw 'Oui en %' column
    for commune, votation, agreement in curr_recommend[['Commune','Votation','Agreement']].fillna(50).values :
        Xt[commune_dict[commune]][votation_dict_t[votation]] = agreement

    draw_map_kmeans(2,Xt,'data/map_recommendation_cluster/'+parti).save('data/map_recommendation_cluster/kmeans_'+parti+'.html')
    draw_map_DBSCAN (Xt,'data/map_recommendation_cluster/'+parti).save('data/map_recommendation_cluster/DBSCAN_'+parti+'.html')
    
    

<matplotlib.figure.Figure at 0x1d00920e278>

# Current day analysis

We will now repeat the clustering analysis using only the votations held since 2005 (which is a good compromise between being recent enough and having enough data to create meaningful clusters).
We will only use clustering because it is the only method that produced meaningful results.

In [35]:
def get_good_date (x) :
    """Return the leading date of x with its dot-separated parts in reverse order.

    The date is the text before the first space of x, so
    '29.11.1998 Initiative Droleg' gives '1998.11.29'.
    """
    leading_date, _, _ = x.partition(' ')
    parts = leading_date.split('.')
    parts.reverse()
    return '.'.join(parts)


# Compare two dot-separated dates: -1 if date1 < date2, 1 if date1 > date2, 0 if they are equal
def compare_date (date1 , date2) :
    """Compare two dates given as dot-separated integers, most significant part first.

    The parts are compared as integers from left to right, over the parts of
    date1; the first difference decides the result.
    """
    parts1 = date1.split('.')
    parts2 = date2.split('.')
    for position, raw_part in enumerate(parts1) :
        left = int(raw_part)
        right = int(parts2[position])
        if left != right :
            return -1 if left < right else 1
    return 0



# Add a year-first 'Date' column built from the date at the start of each votation label
data['Date'] = data['Votation'].map(get_good_date)

# Keep only the rows whose date is strictly after the reference '2005.00.00'
is_after_reference = data['Date'].map(lambda x : compare_date('2005.00.00', x) == -1)
data = data[is_after_reference]
data.head()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton,Date
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich,2015.06.14
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich,2016.09.25
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich,2017.02.12
5,Aeugst am Albis,07.03.2010 Recherche sur l'être humain,1262.0,588.0,46.6,579.0,476.0,103.0,82.2,Affoltern,Zürich,2010.03.07
16,Aeugst am Albis,09.02.2014 Initiative «Financer l'avortement e...,1374.0,857.0,62.4,849.0,290.0,559.0,34.2,Affoltern,Zürich,2014.02.09


In [36]:
# Rebuild the recommendation and theme tables from the date-filtered data
recommend_columns = ['Votation'] + parties
data_recommend = data.merge(recommend.loc[:, recommend_columns], on='Votation')
data_theme = data.merge(thematique, on='Votation')

In [37]:
# Column index of every votation left in data
votation_list = list(set(data['Votation'].values))
votation_dict = dict(zip(votation_list, range(len(votation_list))))

# One row per commune, one column per votation; cells without data keep the initial 1.0
# NOTE(review): missing results are filled with 50 below while absent rows stay at 1.0 - confirm this is intended
X = np.full((len(commune_list), len(votation_list)), 1.0)

yes_shares = data[['Commune','Votation','Oui en %']].fillna(50)
for commune, votation, yes_share in yes_shares.values :
    X[commune_dict[commune]][votation_dict[votation]] = yes_share

In [38]:
# k-means maps and PCA plots for 2, 3, 4 and 5 groups, written to data/young/map_ml
for i in range (2,6) :
    kmeans_map = draw_map_kmeans(i, X, file_PCA='data/young/map_ml/')
    kmeans_map.save('data/young/map_ml/kmeans'+str(i)+'.html')

<matplotlib.figure.Figure at 0x1d009081898>

<matplotlib.figure.Figure at 0x1d00a0fe908>

<matplotlib.figure.Figure at 0x1d00a13bfd0>

<matplotlib.figure.Figure at 0x1d01de2f390>

In [39]:
# DBSCAN map and PCA plot, written to data/young/map_ml
dbscan_map = draw_map_DBSCAN(X, 'data/young/map_ml/')
dbscan_map.save('data/young/map_ml/DBSCAN.html')

<matplotlib.figure.Figure at 0x1d00a0b62e8>

In [40]:
# One pass per theme: data_bg holds the rows of data_theme that belong to the current theme
theme_columns = ['Thématique','Votation','Commune','Oui en %']
for theme, data_bg in data_theme[theme_columns].groupby('Thématique') :
    # Column index of every votation of this theme
    votation_list_t = list(set(data_bg['Votation'].values))
    votation_dict_t = dict(zip(votation_list_t, range(len(votation_list_t))))

    # One row per commune, one column per votation; cells without data keep the initial 1.0
    # NOTE(review): missing results are filled with 50 below while absent rows stay at 1.0 - confirm this is intended
    Xt = np.full((len(commune_list), len(votation_list_t)), 1.0)

    filled = data_bg[['Commune','Votation','Oui en %']].fillna(50)
    for commune, votation, yes_share in filled.values :
        Xt[commune_dict[commune]][votation_dict_t[votation]] = yes_share

    kmeans_map = draw_map_kmeans(2, Xt, 'data/young/maps_theme_ml/'+theme)
    kmeans_map.save('data/young/maps_theme_ml/kmeans_'+theme+'.html')
    dbscan_map = draw_map_DBSCAN(Xt, 'data/young/maps_theme_ml/'+theme)
    dbscan_map.save('data/young/maps_theme_ml/DBSCAN_'+theme+'.html')

<matplotlib.figure.Figure at 0x1d00b035c88>

In [41]:
# One clustering per party, based on how much each commune agrees with that party's recommendations
for parti in parties :
    curr_recommend = data_recommend.loc[:, ['Commune', 'Votation' , 'Oui en %', parti]]
    # Rows where the party column is 0 are dropped (no usable yes/no recommendation).
    # Explicit copy so that adding a column below does not act on a view of data_recommend.
    curr_recommend = curr_recommend[curr_recommend[parti] != 0].copy()

    # Agreement is the yes share when the party recommended yes (1), the no share otherwise.
    # np.where also works on an empty frame, so no separate emptiness check is needed here.
    curr_recommend['Agreement'] = np.where(curr_recommend[parti] == 1,
                                           curr_recommend['Oui en %'],
                                           100 - curr_recommend['Oui en %'])

    # Sorted so that the column order of Xt is the same on every run
    votation_list_t = sorted(set(curr_recommend['Votation'].values))
    votation_dict_t = { val : idx for idx , val in enumerate(votation_list_t)   }

    # Skip the parties with fewer than 2 usable votations (this also covers the empty case):
    # the 2-component PCA inside the draw functions needs at least 2 columns
    if len(votation_list_t) < 2 :
        continue

    # One row per commune, one column per votation; cells without data keep the initial 1.0
    # NOTE(review): missing results are filled with 50 below while absent rows stay at 1.0 - confirm this is intended
    Xt = np.ones((len(commune_list) , len(votation_list_t) ) , dtype=float)

    # Fill the matrix with the agreement: the previous version computed 'Agreement'
    # but then filled the matrix with the raw 'Oui en %' column
    for commune, votation, agreement in curr_recommend[['Commune','Votation','Agreement']].fillna(50).values :
        Xt[commune_dict[commune]][votation_dict_t[votation]] = agreement

    draw_map_kmeans(2,Xt,'data/young/map_recommendation_cluster/'+parti).save('data/young/map_recommendation_cluster/kmeans_'+parti+'.html')
    draw_map_DBSCAN (Xt,'data/young/map_recommendation_cluster/'+parti).save('data/young/map_recommendation_cluster/DBSCAN_'+parti+'.html')
    

<matplotlib.figure.Figure at 0x1d00a14cc50>