The work done up to this second milestone was mainly to collect our data and clean it. As we do not use a provided dataset, collecting and cleaning the data took a tremendous amount of time.

In [None]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import json
import folium

The votations have been scraped from the BFS website. See the notebook 'bfs_scrap' for more information.

In [None]:
# Read votation data
# The pickle holds the scraped results; later cells unpack its index entries
# as 2-tuples and read the level named "Votation" from it.
df = pd.read_pickle("data/votations.pkl")

In [None]:
# Function that capitalizes the first letter in a given string
def cap_first(s):
    """Return *s* with its first character capitalized.

    Only the first character is touched; the rest of the string keeps its
    original casing (unlike ``str.capitalize`` applied to the whole string).
    An empty string is returned unchanged instead of raising ``IndexError``.
    """
    # Slicing instead of indexing keeps the empty string a valid input.
    return s[:1].capitalize() + s[1:]

In [None]:
# Create a dataframe with the distinct community labels of the votation data.
# Index entries are unpacked as 2-tuples; only their first element is used.
data = pd.DataFrame([x for x, _ in df.index.values]).drop_duplicates()

# Rename the community column
data.columns = ["Commune"]

# Create columns for districts/cantons/countries up front so that the column
# order (District, Canton, Pays) does not depend on the fill order below
data["District"] = np.nan
data["Canton"] = np.nan
data["Pays"] = np.nan

# Extract names of districts/cantons/countries. The first character of a label
# tells its level: "." community, ">" district, "-" canton, anything else is
# taken as a country. str.startswith is used so an empty label cannot raise.
data["Pays"] = data["Commune"].map(
    lambda x: np.nan if x.startswith((">", "-", ".")) else x)
data["Canton"] = data["Commune"].map(
    lambda x: x[2:] if x.startswith("-") else np.nan)
data["District"] = data["Commune"].map(
    lambda x: x[3:] if x.startswith(">") else np.nan)

# Propagate names of districts/cantons/countries downwards.
# DataFrame.ffill() is the supported spelling of fillna(method='ffill').
data = data.ffill()

# Remove lines that do not describe a community
data = data[data["Commune"].map(lambda x: x.startswith("."))]

# Clean canton and district names
def _clean_district_name(name):
    """Strip the administrative prefix from a raw district label.

    The rules are tried in order and the first match wins:
    1. labels containing "Bezirk See" are kept as they are;
    2. "District d'..." / "District de l'..." keep what follows the first
       apostrophe;
    3. French/Italian prefixes drop the first two words and capitalize the
       first letter of the remainder;
    4. German prefixes drop the first word;
    5. anything else is returned unchanged.
    """
    if "Bezirk See" in name:
        return name
    if "District d'" in name or "District de l'" in name:
        # NOTE(review): the pieces are joined with "", so any further
        # apostrophe inside the name is dropped -- confirm this is intended.
        return "".join(name.split("'")[1:]).strip()
    two_word_prefixes = ("Arrondissement administratif", "District",
                         "Canton", "Distretto di")
    if any(marker in name for marker in two_word_prefixes):
        return cap_first(" ".join(name.split(" ")[2:])).strip()
    one_word_prefixes = ("Verwaltungskreis", "Wahlkreis", "Kanton",
                         "Bezirk", "Region")
    if any(marker in name for marker in one_word_prefixes):
        return " ".join(name.split(" ")[1:]).strip()
    return name


# Keep only the part of the canton name before " /". pd.isna is used because
# an identity test against np.nan only recognizes that one NaN object.
data["Canton"] = data["Canton"].map(lambda x: x if pd.isna(x) else x.split(" /")[0])
data["District"] = data["District"].map(_clean_district_name)

# Use the French spelling for the two half-cantons
district_renames = {"Obwalden": "Obwald", "Nidwalden": "Nidwald"}
data["District"] = data["District"].map(lambda x: district_renames.get(x, x))

# Write correct district/canton/country data for foreign votes
# Rows whose district label contains one of the markers below get
# District="-", Canton="-", Pays="Etranger"; all other rows keep their values.
# The apply returns three unnamed columns that are assigned to the three
# target columns in the order listed.
data[["District", "Canton", "Pays"]] = data.apply(lambda x : pd.Series(["-", "-", "Etranger"]) if \
                                        "-Ausland-" in x["District"] or \
                                        " de l'étranger" in x["District"] or \
                                        "-Korrespondenzweg" in x["District"] or \
                                        "-autres" in x["District"] or \
                                        "-voto per corrispondenza" in x["District"] \
                                      else pd.Series([x["District"], x["Canton"], x["Pays"]]), axis=1)

#data.set_index('Commune', inplace=True)
# Renumber the rows 0..n-1 after the filtering done above
data = data.reset_index(drop=True)

#COMMUNITY NAMES MUST ONLY BE CLEANED AFTER THE VOTATION DATA HAS BEEN MERGED, OTHERWISE THEY WILL NOT MATCH
#THERE ARE NO DISTRICTS FOR GENEVA, SCHAFFHAUSEN, APPENZELL INNERRHODEN, OBWALD AND NIDWALD

In [None]:
# Attach the district/canton/country of every community to the votation data

# Reload the raw results, turn the index into columns and join on the raw label
df = pd.read_pickle("data/votations.pkl").reset_index().merge(data, on="Commune")

# Labels starting with "." lose their first 7 characters; others are kept
df["Commune"] = df["Commune"].map(
    lambda label: label[7:] if label[0] == "." else label
)

df.head()


In [None]:
#remove ' (Urne commune)' from the names of towns

def remove_urne_commune(x):
    """Return the town name *x* without the ' (Urne commune)' marker.

    The marker is removed wherever it occurs. The previous version cut
    ``len(marker)`` characters off the end as soon as the marker appeared
    anywhere, which corrupted names where it was not a suffix.
    Names without the marker are returned unchanged.
    """
    return x.replace(' (Urne commune)', '')
    
# Strip the ' (Urne commune)' marker from every town name
df['Commune'] = df['Commune'].apply(remove_urne_commune)

In [None]:
#keep the rows whose country is 'Suisse', drop the country column and save to pickle
df = df[df['Pays'] == 'Suisse'].drop(['Pays'], axis=1)
df.to_pickle("data/data.pkl")

## THEMES

In [None]:
# Theme table: skip the first two lines of the file and drop its last row
themes = pd.read_csv(
    "data/px-x-1703010000_103.csv",
    sep=";",
    encoding="cp1254",
    skiprows=2,
).iloc[:-1]
# Discard the rows whose period contains "bis" (presumably period ranges -- TODO confirm)
themes = themes.loc[~themes['Période'].str.contains("bis")]
# Keep the second space-separated token of the period and use it as index
themes["Période"] = themes["Période"].map(lambda period : period.split(" ")[1])
themes = themes.set_index("Période")
themes.head()

In [None]:
#We extract the subjects of the votations from the index of the data and build a dataframe with the subjects and their year
df = pd.read_pickle("data/votations.pkl")
votations = pd.DataFrame(df.index.levels[1])
#characters 6 to 9 of the title are taken as the year
votations["Année"] = votations["Votation"].map(lambda title : title[6:10])
votations = votations.sort_values("Année").reset_index(drop=True)

header = list(themes.columns)
values = []

#Now let's use our indexation of themes to append a theme to each votation. We use a handmade indexation of the
#themes since no mapping between subjects and themes exists online. We used the available listing of voted themes
#for each year and manually reattributed the themes to the subjects.
#One line of the file is consumed per year, in the order the years appear.
with open("data/theme_indices.txt", "r") as file:
    for year in votations["Année"].unique():
        indices = file.readline().replace(" ", "").split(",")

        #Themes of the current year, each repeated as many times as its count
        temp = []
        for theme_name in header:
            temp.extend([theme_name] * int(themes.loc[year, theme_name]))

        #Reorder them with our (1-based) indexation and add them to the list
        values += [temp[int(position) - 1] for position in indices]

#We add the list to the dataframe as a column
votations["Thématique"] = values
votations = votations.drop("Année", axis=1)
votations.head()

In [None]:
# Persist the votation -> theme table; it is read back in the "Map by theme" part
votations.to_pickle("data/Thématique.pkl")

### Dominating language in towns

To be able to prove that a Röstigraben exists, we need the language spoken in each town together with the votation results. We find such data [here](https://www.atlas.bfs.admin.ch/maps/13/fr/12401_229_228_227/20443.html). The raw values downloaded from there can be found in **data/3007\_Langues\_nationales\_dominantes\_dans\_les\_communes\_en\_2000\_(fr)**. It gives the dominating language of each town in 2000. The value each town gets is "[Language]: [(medium)/(strong)]" or "No domination" (note that these values are in French and that the language can be any one of the four national languages, i.e., 'French', 'German', 'Italian', or 'Romansh').

We had a lot of mismatches between the town names in the votations dataframe and the file above. The reason is that mergers between multiple towns have been encouraged since the beginning of this century. While the votation data have been nicely adapted subsequently by the BFS, this is obviously not the case for the last language survey, performed in 2000. We manually added the mergers to the file, resulting in **data/languages_2000.xlsx**.

In [None]:
# Dominant-language table: skip the first row and the last 11 rows of the sheet.
# `skipfooter` is the supported keyword; the old `skip_footer` spelling was
# deprecated and later removed from pandas.read_excel.
languages = pd.read_excel('data/languages_2000.xlsx', skiprows=1, skipfooter=11)
# The region identifier is not used afterwards, only the region name
languages.drop(['Regions-ID'], axis=1, inplace=True)
languages.head()

In [None]:
#load the votation table that still has its 'Pays' column and collect the names of the Swiss towns
df_votation = pd.read_pickle('data/temp_data.pkl')
towns_votations = set(df_votation.loc[df_votation['Pays'] == 'Suisse', 'Commune'])

In [None]:
#all unique towns in the language dataset (names are read from the 'Regionsname' column)
towns_languages = set(languages['Regionsname'])

Now let's look at the difference between the two sets:

In [None]:
# Towns present in the votation data but absent from the language data
diff1 = towns_votations.difference(towns_languages)
len(diff1)

#### --> 100% match between towns in votations and towns in languages after a tough job!

We tried to draw a map of the languages with folium, but without success. However, the code we tried to run can be seen in the notebook 'parse_languages'. You can also find in that notebook some helpers to perform the manual matching of the town names in the language file.

### Map by theme

This part creates one map per theme, showing the percentage of agreement for that theme.

Load the data with the percentage of yes per commune and the theme of each votation.

In [None]:
# Results per commune (saved as data/data.pkl by the cleaning part above)
data = pd.read_pickle("data/data.pkl")
# Votation -> theme table (saved as data/Thématique.pkl by the THEMES part above)
thematique = pd.read_pickle("data/Thématique.pkl")

Load the JSON used to draw the communes on the map, and create the list of the communes it contains.

In [None]:
# Centre of the map (latitude, longitude)
switzerland_coord = [46.765213, 8.252444]
town_geo_path = r'data/switzerland_borders/admin_level_8.geojson'
# Read the commune borders; the context manager closes the file, which the
# former json.load(open(...)) never did.
with open(town_geo_path, encoding="utf8") as geo_file:
    geo_json_data = json.load(geo_file)
# Names of all features of the geojson, read from each feature's 'name' key
commune = [x['name'] for x in geo_json_data['features']]

thematique.to_pickle('data/data_theme.pkl')


We merge the two dataframes so that, for each votation and each commune, we have the theme and the percentage of yes. We only keep 'Thématique', 'Commune' and 'Oui en %' because this is the only information that is useful later ('Votation' is no longer useful once the merge is done).

We also make sure that the dataframe only contains communes that are in the JSON.

In [None]:
# Give every (votation, commune) result the theme of its votation
data_t = data.merge(thematique, on='Votation')
# Keep the three useful columns, and only the communes known to the geojson
data_t = data_t.loc[data_t['Commune'].isin(commune), ['Thématique', 'Commune', 'Oui en %']]


data_t.head()


We group the data by theme and, for each theme, create a map showing the average percentage of yes votes per commune. We then save each map as an HTML file.

In [None]:
# One choropleth per theme: average percentage of yes per commune
for theme, data_theme in data_t.groupby('Thématique'):
    # Average only 'Oui en %'. Averaging the whole frame would also try to
    # average the text column 'Thématique', which recent pandas refuses.
    data_theme = data_theme.groupby('Commune', as_index=False)['Oui en %'].mean()
    map1 = folium.Map(location=switzerland_coord, zoom_start=8)
    # NOTE(review): Map.choropleth only exists in older folium releases; newer
    # ones expect folium.Choropleth(...).add_to(map1) -- confirm the version
    # this notebook is pinned to before switching.
    map1.choropleth(geo_data = geo_json_data,
                    data = data_theme,
                    columns = ['Commune', 'Oui en %'],
                    key_on = 'feature.name',
                    fill_color = 'RdYlGn',
                    fill_opacity = 0.7,
                    line_opacity = 0.2,
                    legend_name = 'yes in % given to the theme ' + theme)

    # The theme name is used verbatim in the file name
    map1.save('data/map_theme/map_' + theme + '.html')

Now we randomly pick 3 themes for an analysis (we took only 3, but the same analysis can be done on the other maps).

So we will look at the ['Economie'](data/map_theme/map_Economie.html) theme, the ['Politique de sécurité'](data/map_theme/map_Politique%20de%20sécurité.html) theme and the ['Régime politique'](data/map_theme/map_Régime%20politique.html) theme:    
In each one of them we can clearly see a large strip beginning in Valais and ending at Vaduz. We can see the Röstigraben in this, but at the same time the northern and eastern parts of German-speaking Switzerland are not as different as the strip. So even if we can put forward the split between the French-speaking part and the German-speaking part, it is possible that the Röstigraben is not the only explanation for those differences.

### Map by recommendation

This part uses the voting recommendations of each political party to create a visual representation of how much each party is followed, and tries to see which regions vote more in line with each party.

The next cell loads the dataframe with the percentage of yes and cleans it so that the date is in one column and the subject of the votation in another.

In [None]:
#The dataframe with the votation data
df = pd.read_pickle("data/data.pkl")
#The title is cut at its first space: what precedes it is the date, what follows it the subject
df['Date'] = df['Votation'].map(lambda title : title.partition(' ')[0])
df['Votation'] = df['Votation'].map(lambda title : title.partition(' ')[2])
df.head()

In [None]:
#open the workbook holding the party recommendations (one sheet per year)
xls = pd.ExcelFile('data/Recommandations des partis.xls')
#the dataframe to fill with the info of the recommandation
recommend = pd.DataFrame()

#for every sheet with information from 2017 to 1981: sheet i is treated as year 2017 - i
for i in range(2017 - 1981 + 1):
    year = 2017 - i
    #get the sheet i
    x = xls.parse(i)
    #change the name of the columns so that they will be easier to use
    x.columns = range(len(x.columns))

    #We will need the parties and the number of the votation; for that we use the fact that the line that has the date
    #always begins with 'Parti 1)' and that the No of votation is 2 lines later
    #NOTE(review): base_nb is an index label that is reused as a position in iloc below;
    #this relies on the parsed sheet having the default 0..n-1 index
    base_nb = x[x.iloc[:, 0] == 'Parti 1)'].index[0]

    #We drop all the columns with only NaN
    x = x.dropna(axis=1, how='all')

    #concat the date row, the No-of-votation row and the rows holding the voting advice
    #(taken to be the rows without any NaN), then put the votations in rows
    recommend_inter = pd.concat([x.iloc[base_nb:base_nb + 1], x.iloc[base_nb + 2:base_nb + 3], x.dropna()]).transpose()

    #keep only the first word of each party name so that small variations (like an added ' 3)')
    #still give consistent names
    recommend_inter.iloc[0, 2:] = recommend_inter.iloc[0, 2:].map(lambda name : name.split(' ')[0])

    #change the name of the columns so that they will be easier to use
    recommend_inter.columns = range(len(recommend_inter.columns))

    #append the year of the sheet to the date
    recommend_inter.iloc[:, 0] = recommend_inter.iloc[:, 0] + str(year)
    # make sure that 'no ###' are made into 'No ###'
    recommend_inter.iloc[:, 1] = recommend_inter.iloc[:, 1].map(lambda no : no[0].upper() + no[1:], na_action='ignore')

    #get the lines that give the name of a votation knowing its No.
    #na=False yields a real boolean mask without relying on fillna downcasting;
    #.copy() because the columns of this selection are rewritten below
    propositions = x[x.iloc[:, 0].str.contains('No ', na=False)].copy()
    #change the name of the columns so that they will be easier to use
    propositions.columns = range(len(propositions.columns))

    #The next part is transforming the No of votation into name of votation

    #in some sheets this information is in only 1 cell, so we split it into 2 cells like in the rest of the sheets
    if (propositions.iloc[:, 1].isnull()).all():
        if year == 1997:
            propositions.iloc[:, 1] = propositions[0].map(lambda cell : cell.split(' ')[1])
            propositions.iloc[:, 0] = propositions[0].map(lambda cell : cell.split(' ')[0].rstrip())
        else:
            propositions.iloc[:, 1] = propositions[0].map(lambda cell : cell.split(':')[1][1:])
            propositions.iloc[:, 0] = propositions[0].map(lambda cell : cell.split(':')[0].rstrip())


    #axis must be passed by keyword: recent pandas rejects the positional form dropna(1)
    dico_no_propos = propositions.dropna(axis=1).set_index(0).to_dict()

    #some dico are inside a dictionary {1:{true_dictionary}}
    if dico_no_propos.get(1) is None:
        recommend_inter.iloc[:, 1] = recommend_inter.iloc[:, 1].map(dico_no_propos)
    else:
        recommend_inter.iloc[:, 1] = recommend_inter.iloc[:, 1].map(dico_no_propos[1])

    #rename column meaningfully
    recommend_columns = ['Date', 'Votation'] + list(recommend_inter.iloc[0][2:])
    recommend_inter.columns = recommend_columns

    #fill the date for the lines that do not have it (Series.ffill replaces fillna(method='ffill'))
    recommend_inter['Date'] = recommend_inter['Date'].ffill()
    #transpose so that we can join
    recommend_inter = recommend_inter.transpose()

    #delete duplicates (in some sheets there is twice PST)
    recommend_inter = recommend_inter.groupby(recommend_inter.index).first()
    #join the 2 dataframes, the names of the columns are not important
    recommend = recommend.join(recommend_inter, how='outer', lsuffix='l', rsuffix='r')

recommend = recommend.transpose()

#clean the dataframe: drop the header rows and the rows without a votation name
recommend = recommend[~recommend.loc[:, 'Date'].str.contains('Parti 1', na=False)]
recommend = recommend[~recommend['Votation'].isnull()]
#create meaningful index
recommend.index = range(len(recommend.index))

#transform the date into the same format as the main dataframe
# French month name (lower case) -> two-digit month number
month_to_int = {
    'janvier': '01',
    'février': '02',
    'févirer': '02',  # misspelled key kept so that inputs spelled this way still resolve
    'mars': '03',
    'avril': '04',
    'mai': '05',
    'juin': '06',
    'juillet': '07',
    'aout': '08',
    'août': '08',  # accented spelling, previously a KeyError
    'septembre': '09',
    'octobre': '10',
    'novembre': '11',
    'décembre': '12',
}

def good_format_month (x) :
    """Return the date string *x* formatted as 'DD.MM.YYYY'.

    Two input shapes are understood:
    * 'D.M.YYYY...' -- day and month are zero-padded and only the first four
      characters after the second dot are kept as the year;
    * 'D <month name> ...YYYY' -- the month name is looked up in
      ``month_to_int`` (case-insensitively) and the last four characters are
      taken as the year.

    Surrounding whitespace is ignored. Raises ``KeyError`` for an unknown
    month name.
    """
    x = x.strip()
    if '.' in x :
        split_x = x.split('.')
        return split_x[0].zfill(2) + '.' + split_x[1].zfill(2) + '.' + split_x[2][:4]
    # split() without argument tolerates repeated spaces between the tokens
    tokens = x.split()
    return tokens[0].zfill(2) + '.' + month_to_int[tokens[1].lower()] + '.' + x[-4:]

#bring every date to the DD.MM.YYYY format
recommend.loc[:, 'Date'] = recommend.loc[:, 'Date'].map(good_format_month)

recommend = recommend.loc[:, recommend.columns.drop_duplicates()]
#parties is the list of all the parties: every column label except the three bookkeeping ones
parties = list(recommend.columns.drop_duplicates())
for reserved in ('Date', 'Votation', 'Parti'):
    parties.remove(reserved)

#date and votation first, then one column per party
recommend = recommend.loc[:, ['Date', 'Votation'] + parties]
recommend.head()

Change the values so that they are all uniform:       
a yes recommendation becomes 1     
a no recommendation becomes -1   
no data or no recommendation becomes 0

In [None]:
def translate_choice(x) :
    """Encode a recommendation cell as 1 (yes), -1 (no) or 0 (anything else).

    A value equal to 1 or whose string form is 'oui' counts as yes; a value
    equal to 2 or whose string form is 'non' counts as no.
    """
    if x == 1 or str(x) == 'oui':
        return 1
    if x == 2 or str(x) == 'non':
        return -1
    return 0

# Work on a copy so that the raw recommendations stay available
good_recommend = recommend.copy()
# Encode every party cell as 1 / -1 / 0
good_recommend.loc[:, parties] = good_recommend[parties].applymap(translate_choice)
good_recommend

We need to create a dictionary to link the names of the votations in the recommendation dataframe with those in the vote dataframe.   
The problem is that the names are very different. To link one to the other, we group the votations of both dataframes by date, and for each date we compare all pairs using SequenceMatcher. We find the best match and link this pair together, remove the two elements from their respective lists, and search for a new maximum until one of the lists is empty.

This method might create some imprecision, but as a whole it is solid.

In [None]:
# Two-way mapping between the titles of the vote data and those of the
# recommendation data (each matched title maps to its counterpart)
dico_recom_vote = {}

# Index the recommendation titles by date once, instead of scanning every
# recommendation group again for each voting date
recommended_by_date = {
    date2: list(group2['Votation'])
    for date2, group2 in good_recommend.groupby('Date')
}

# Only the rows of one commune are used to list the titles voted on each date
for date1, group1 in df[df['Commune'] == 'Aeugst am Albis'].groupby('Date'):
    vot1 = list(group1['Votation'])
    # copy: the list is consumed by the matching below
    vot2 = list(recommended_by_date.get(date1, []))

    # Greedy matching: link the most similar remaining pair, remove both
    # titles, and repeat until one side is exhausted
    while vot1 and vot2:
        # max() keeps the first pair among equal ratios, like a strict '>' scan
        best1, best2 = max(
            ((elem1, elem2) for elem1 in vot1 for elem2 in vot2),
            key=lambda pair: SequenceMatcher(None, pair[0], pair[1]).ratio(),
        )

        dico_recom_vote[best1] = best2
        dico_recom_vote[best2] = best1
        vot1.remove(best1)
        vot2.remove(best2)

Create a recommendation dataframe whose votation names correspond to the ones in the main dataframe.

In [None]:
# Same table, with each title replaced by its counterpart from the matching dictionary
recommend_to_join = good_recommend.assign(
    Votation=good_recommend['Votation'].map(dico_recom_vote)
)
recommend_to_join.head()

Prepare the map:    
get the JSON to draw the borders,   
get all the commune names,     
and only keep the values that are in the JSON.

In [None]:
# Centre of the map (latitude, longitude)
switzerland_coord = [46.765213, 8.252444]
town_geo_path = r'data/switzerland_borders/admin_level_8.geojson'
# Read the commune borders; the context manager closes the file, which the
# former json.load(open(...)) never did.
with open(town_geo_path, encoding="utf8") as geo_file:
    geo_json_data = json.load(geo_file)
# Names of all features of the geojson, read from each feature's 'name' key
commune = [x['name'] for x in geo_json_data['features']]

# Add the encoded recommendation of every party to each (commune, votation) row
to_map = df.merge(recommend_to_join.loc[:, ['Votation'] + parties], on='Votation')
# Only the communes known to the geojson can be drawn
to_map = to_map[to_map['Commune'].isin(commune)]
to_map.head()

For each party, create a map of the percentage of people agreeing with that party.

People are considered to agree with a party if they vote the same way as the party recommends, when it recommends yes or no. We do not take into consideration other recommendations of the party (such as abstention), nor the cases where we have no information about the recommendation of a party.

In [None]:
# One choropleth per party: average agreement of each commune with the party
for parti in parties :
    current_to_map = to_map.loc[:, ['Commune', 'Oui en %', parti]]
    # Ignore the votations without a yes/no recommendation (encoded as 0).
    # .copy() so that adding a column below does not write into a slice.
    current_to_map = current_to_map[current_to_map[parti] != 0].copy()
    # Agreement is the yes share when the party said yes (1), else the no share
    yes_share = current_to_map['Oui en %']
    current_to_map['Agreement'] = yes_share.where(current_to_map[parti] == 1, 100 - yes_share)

    # Average only 'Agreement'; the other columns are not needed on the map
    current_to_map = current_to_map.groupby('Commune', as_index=False)['Agreement'].mean()

    map1 = folium.Map(location=switzerland_coord, zoom_start=8)
    # NOTE(review): Map.choropleth only exists in older folium releases; newer
    # ones expect folium.Choropleth(...).add_to(map1) -- confirm the version
    # this notebook is pinned to before switching.
    map1.choropleth(geo_data = geo_json_data,
                    data = current_to_map,
                    columns = ['Commune', 'Agreement'],
                    key_on = 'feature.name',
                    fill_color = 'RdYlGn',
                    fill_opacity = 0.7,
                    line_opacity = 0.2,
                    legend_name = 'Agreement in % with ' + parti)

    map1.save('data/maps_partis/map_' + parti + '.html')
    print(map1)

Now we take 4 parties for an analysis: the 2 biggest (UDC, PS), one with a medium level of importance (PBD) and a party with very few seats in the parliament (PST) (once again we took only 4, but the same analysis can be done on the other maps).

So we will look at the ['UDC'](data/maps_partis/map_UDC.html), ['PS'](data/maps_partis/map_PS.html), ['PBD'](data/maps_partis/map_PBD.html) and ['PST'](data/maps_partis/map_PST.html) maps:    
This time the maps show a lot of differences: the PS and PST maps show a very clear distinction between the French-speaking part and the German-speaking part, which fits the Röstigraben very well. In the UDC and PBD maps, on the other hand, the differences are less important: in the UDC map the overall difference between most of the cantons is not that big, and in the PBD map the Röstigraben is nearly impossible to see.

# Conclusion

In the end we have seen obvious differences between the French-speaking part and the German-speaking part, but those differences seem to vary and do not apply everywhere, and even where they exist we can see that they are far from uniform.     
As things stand, it is difficult to confirm or deny the existence of the Röstigraben.
