In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Read votation data
df = pd.read_pickle("data/votations.pkl")

In [3]:
# Function that capitalizes the first letter in a given string
def cap_first(s):
    return s[0].capitalize() + s[1:]

In [4]:
# Create a dataframe with the indices of the votation data
data = pd.DataFrame([x for x, _ in df.index.values]).drop_duplicates()

# Rename the community column
data.columns = ["Commune"]

# Create columns for districts/cantons/countries
data["District"] = np.nan
data["Canton"] = np.nan
data["Pays"] = np.nan

# Extract names of districts/cantons/countries
data["Pays"] = data["Commune"].map(lambda x : x if x[0] != ">" and x[0] != "-" and x[0] != "." else np.nan)
data["Canton"] = data["Commune"].map(lambda x : x[2:] if x[0] == "-" else np.nan)
data["District"] = data["Commune"].map(lambda x : x[3:] if x[0] == ">" else np.nan)

# Propagate names of districts/cantons/countries downwards
data = data.fillna(method='ffill')

# Remove lines that do not describe a community
data = data[data["Commune"].map(lambda x : x[0] == ".")]

# Clean canton and district names
data["Canton"] = data["Canton"].map(lambda x : x if x is np.nan else x.split(" /")[0])
data["District"] = data["District"].map(lambda x :
                                        x if \
                                            "Bezirk See" in x else \
                                        "".join(x.split("'")[1:]).strip() if \
                                            "District d'" in x or \
                                            "District de l'" in x else
                                        cap_first(" ".join(x.split(" ")[2:])).strip() if \
                                            "Arrondissement administratif" in x or \
                                            "District" in x or \
                                            "Canton" in x or \
                                            "Distretto di" in x else \
                                        " ".join(x.split(" ")[1:]).strip() if \
                                            "Verwaltungskreis" in x or \
                                            "Wahlkreis" in x or \
                                            "Kanton" in x or \
                                            "Bezirk" in x or \
                                            "Region" in x \
                                        else x)

data["District"] = data["District"].map(lambda x : \
                                        "Obwald" if x == "Obwalden" else \
                                        "Nidwald" if x == "Nidwalden" else \
                                        x)

# Write correct district/canton/country data for foreign votes
data[["District", "Canton", "Pays"]] = data.apply(lambda x : pd.Series(["-", "-", "Etranger"]) if \
                                        "-Ausland-" in x["District"] or \
                                        " de l'étranger" in x["District"] or \
                                        "-Korrespondenzweg" in x["District"] or \
                                        "-autres" in x["District"] or \
                                        "-voto per corrispondenza" in x["District"] \
                                      else pd.Series([x["District"], x["Canton"], x["Pays"]]), axis=1)

#data.set_index('Commune', inplace=True)
data = data.reset_index(drop=True)

#NAMES NEED TO BE CLEANED AFTER MODIFYING THE PROPER DATA, OTHERWISE COMMUNITY NAMES WILL NOT MATCH
#NO DISTRICTS FOR GENEVA, SCHAFFHAUSEN, APPENZELL INNERRHODEN, OBWALD AND NIDWALD

In [25]:
# Create columns for districts/cantons/countries

df = pd.read_pickle("data/votations.pkl")

df.reset_index(inplace=True)
df = df.merge(data, on="Commune")
df["Commune"] = df["Commune"].map(lambda x : x[7:] if x[0] == "." else x)
# data["Commune"] = data["Commune"].map(lambda x : x.split(" (")[0])

df.head()
#df[df["Commune"] == "Bodensee (SG)"]["Canton"].unique()

Unnamed: 0,Commune,Votation,Electeurs inscrits,Bulletins rentrés,Participation en %,Bulletins valables,Oui,Non,Oui en %,District,Canton,Pays
0,Aeugst am Albis,29.11.1998 Initiative Droleg,1070.0,487.0,45.5,478.0,167.0,311.0,34.9,Affoltern,Zürich,Suisse
1,Aeugst am Albis,14.06.2015 Initiative sur les bourses d'études,1380.0,706.0,51.2,695.0,186.0,509.0,26.8,Affoltern,Zürich,Suisse
2,Aeugst am Albis,25.09.2016 Loi fédérale sur le renseignement,1400.0,670.0,47.9,659.0,417.0,242.0,63.3,Affoltern,Zürich,Suisse
3,Aeugst am Albis,03.03.1991 Encouragement des transports publics,835.0,321.0,38.4,312.0,128.0,184.0,41.0,Affoltern,Zürich,Suisse
4,Aeugst am Albis,12.02.2017 Réforme de l'imposition des entrepr...,1395.0,759.0,54.4,750.0,318.0,432.0,42.4,Affoltern,Zürich,Suisse


In [6]:
themes = pd.read_csv("data/px-x-1703010000_103.csv", sep=";", encoding="cp1254", skiprows=2)[:-1]
themes = themes[~themes['Période'].str.contains("bis")]
themes["Période"] = themes["Période"].apply(lambda x : x.split(" ")[1])
themes = themes.set_index("Période")
themes.head()

Unnamed: 0_level_0,Régime politique,Politique étrangère,Politique de sécurité,Economie,Finances publiques,"Infrastructure, aménagement, environnement",Politique sociale,"Enseignement, culture et médias"
Période,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
2016,1.0,0.0,0.0,1.0,0.0,5.0,6.0,0.0
2015,0.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0
2014,1.0,1.0,1.0,1.0,2.0,1.0,5.0,0.0
2013,1.0,0.0,1.0,1.0,0.0,2.0,6.0,0.0


In [7]:
#We extract the subjects of votations from the index of the data and make a dataframe we the subjects and the year
df = pd.read_pickle("data/votations.pkl")
votations = pd.DataFrame(df.index.levels[1])
votations["Année"] = votations["Votation"].apply(lambda s : s[6:10])
votations = votations.sort_values("Année")
votations = votations.reset_index(drop=True)

header = list(themes.columns)
values = []

#Now let's use our indexation of themes to append a theme to each votation. We use a handmade indexation of the
#themes since no mapping between subjects and themes exist online. We used the available listing of voted themes
#for each year and manually reattributed the themes to the subjects.
with open("data/theme_indices.txt", "r") as file:
    for year in votations["Année"].unique():
        indices = file.readline().replace(" ", "").split(",")
        temp = []
        
        #We get the themes for the current year
        for i in range(len(header)):
            for j in range(int(themes.loc[year, header[i]])):
                temp.append(header[i])

        #We reorder them using our indexation and add them to the list
        values += list(map(lambda x : temp[int(x) - 1], indices))
        
#We add the list to the dataframe as a column
votations["Thématique"] = values
votations = votations.drop("Année", axis=1)
votations.head()

Unnamed: 0,Votation,Thématique
0,14.06.1981 Protection des consommateurs,Economie
1,29.11.1981 Régime financier,Finances publiques
2,14.06.1981 Egalité entre hommes et femmes,Politique sociale
3,06.06.1982 Code pénal suisse,Régime politique
4,06.06.1982 Loi sur les étrangers,Politique sociale


In [8]:
votations.to_pickle("data/Thématique.pkl")

In [9]:
town_geo_path = r'data/switzerland_borders/admin_level_8.geojson'
geo_json_data = json.load(open(town_geo_path, encoding="utf8"))

commune_official = [[x['name'], x['properties'].get('official_name')] for x in geo_json_data['features']]
commune = [x['name'] for x in geo_json_data['features']]
commune_wiki = [[x['name'], x['properties'].get('wikipedia')] for x in geo_json_data['features']]

def correct_parenthese(x):
    without_parenthese = x[:-5]
    correct = [y for y in commune_official if y[1] != None and y[1] == x]
    if len(correct) == 1:
        return correct[0][0]
    
    correct = [y for y in commune if without_parenthese in y and '(' in y]
    
    if len(correct) == 1:
        return correct[0]
    
    correct = [y for y in commune if y in x and '(' not in y ]
    
    if len(correct) == 1:
        return correct[0]
    elif ' (Urne commune)' in x:
        return x[:-len(' (Urne commune)')]
    else :
        return x

dico = {v: correct_parenthese(v) for k, v in data["Commune"].map(lambda x : x[7:] if x[0] == "." else x).iteritems()}
df['Commune'] = df['Commune'].map(dico)
df.head()
dict(list(geo_json_data.items())[:3])
#df = df[df['Pays'] == 'Suisse']

KeyError: 'Commune'

In [None]:
df.to_pickle("data/data.pkl")