# Covariates choice, preparation, cleaning and analysis

### library imports

In [7]:
# import libraries
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [8]:

import sys
import os

# Append a path (resolved relative to the current working directory) to sys.path
# so the project's helper modules can be imported.
# NOTE(review): the appended path ends in 'source/utils', but the imports below use
# the 'utils.' package prefix, which normally requires the *parent* of 'utils' to be
# on sys.path — confirm how these imports resolve in the actual repository layout.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../source/utils')))

# Project helpers used further down in this notebook
from utils.municipality_dict import create_municipalities_dict
from utils.merge_covariates import merge_municipalities_covariates
from utils.replace_nan import replace_nan_based_on_type

## Pronovo data

In [9]:
# Read the cleaned Pronovo solar-potential table and key it by municipality name
datapath = "../../data/cleaned_data/Swiss_solar_potential.csv"
Swiss_solar_potential_df = pd.read_csv(datapath, low_memory=False).set_index("mun_name")

## Covariate data

In [10]:
# Business creation (2021): keep the id, name and value columns, give them the
# notebook's standard names and index the table by municipality name.
business_creation = (
    pd.read_csv("../../data/raw_data/création_entreprises_2021.csv", delimiter=";")
    [["GEO_ID", "GEO_NAME", "VALUE"]]
    .rename(columns={
        "GEO_ID": "MunicipalityNumber",
        "GEO_NAME": "MunicipalityName",
        "VALUE": "new business",
    })
    .set_index("MunicipalityName")
)
business_creation.index

Index(['Aeugst am Albis', 'Affoltern am Albis', 'Bonstetten',
       'Hausen am Albis', 'Hedingen', 'Kappel am Albis', 'Knonau',
       'Maschwanden', 'Mettmenstetten', 'Obfelden',
       ...
       'Fahy', 'Fontenais', 'Grandfontaine', 'Lugnez', 'Porrentruy',
       'Vendlincourt', 'Basse-Allaine', 'Clos du Doubs', 'Haute-Ajoie',
       'La Baroche'],
      dtype='object', name='MunicipalityName', length=2163)

In [11]:
# Share of the population living in individual households, per municipality.
share_col = "share of people in individual households"
individual_houselhold_people_share = (
    pd.read_csv("../../data/raw_data/share_of_people_in_individual_households.csv", delimiter=";")
    [["GEO_ID", "GEO_NAME", "VALUE"]]
    .rename(columns={
        "GEO_ID": "MunicipalityNumber",
        "GEO_NAME": "MunicipalityName",
        "VALUE": share_col,
    })
    .set_index("MunicipalityName")
)
# The raw value is divided by 100 (percentage -> fraction, as the displayed values suggest)
individual_houselhold_people_share[share_col] = individual_houselhold_people_share[share_col] / 100
individual_houselhold_people_share

Unnamed: 0_level_0,MunicipalityNumber,share of people in individual households
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1
Aeugst am Albis,1,0.493
Affoltern am Albis,2,0.209
Bonstetten,3,0.354
Hausen am Albis,4,0.474
Hedingen,5,0.432
...,...,...
Basse-Allaine,6807,0.702
Clos du Doubs,6808,0.462
Haute-Ajoie,6809,0.643
La Baroche,6810,0.577


In [12]:
# Municipal population table: skip the spreadsheet's header/footer rows,
# apply the notebook's standard column names and index by municipality name.
population_df = (
    pd.read_excel('../../data/raw_data/Municipal_populations_2023.xlsx', skiprows=5, skipfooter=11)
    .set_axis(["MunicipalityNumber", "MunicipalityName", "Population"], axis="columns")
    .set_index("MunicipalityName")
)
population_df.head()

Unnamed: 0_level_0,MunicipalityNumber,Population
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1
Aeugst am Albis,1,1998
Affoltern am Albis,2,12859
Bonstetten,3,5678
Hausen am Albis,4,3974
Hedingen,5,3963


In [13]:
# Import the 9-category urbanisation typology and translate its labels to English.
urbanization_typo_df = pd.read_excel('../../data/raw_data/Municipalities_typologies_9.xlsx', skiprows=4, skipfooter=11)
urbanization_typo_df.columns = ["MunicipalityNumber", "MunicipalityName", "urban_typo"]
urbanization_typo_df.set_index("MunicipalityName", inplace=True)

# French labels and their English translations, paired by position.
french_cat = [
    'Commune périurbaine de faible densité (23)',
    'Commune urbaine d’une grande agglomération (11)',
    'Commune périurbaine de moyenne densité (22)',
    'Commune périurbaine de forte densité (21)',
    "Commune urbaine d'une agglomération moyenne (12)",
    'Commune urbaine d’une petite ou hors agglomération (13)',
    'Commune rurale en situation centrale (32)',
    'Commune d’un centre rural (31)',
    'Commune rurale périphérique (33)'
]
english_cat = [
    'Low-density peri-urban municipality (23)',
    'Urban municipality in a large agglomeration (11)',
    'Medium-density peri-urban municipality (22)',
    'High-density peri-urban municipality (21)',
    'Urban municipality in a medium-sized agglomeration (12)',
    'Urban municipality in a small or non-agglomerated area (13)',
    'Centrally located rural municipality (32)',
    'Rural center municipality (31)',
    'Peripheral rural municipality (33)'
]


def _normalize_apostrophes(label):
    """Return `label` with typographic apostrophes (’) replaced by straight ones (').

    Non-string values (e.g. NaN) are returned unchanged.
    """
    return label.replace("’", "'") if isinstance(label, str) else label


# The French labels above mix curly and straight apostrophes. An exact-match
# replace would silently skip any label whose apostrophe style differs from the
# spreadsheet, so the lookup is done on apostrophe-normalised text. Values with
# no translation are kept exactly as they are.
french_to_english = {
    _normalize_apostrophes(french): english
    for french, english in zip(french_cat, english_cat)
}
urbanization_typo_df["urban_typo"] = urbanization_typo_df["urban_typo"].map(
    lambda label: french_to_english.get(_normalize_apostrophes(label), label)
)

# 'X' cells are replaced by 0 in every column.
# NOTE(review): this puts an integer into an otherwise textual category column — confirm intended.
urbanization_typo_df = urbanization_typo_df.replace('X', 0)

# visualizes df
urbanization_typo_df.tail(10)

Unnamed: 0_level_0,MunicipalityNumber,urban_typo
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1
Fontenais,6790,Urban municipality in a small or non-agglomera...
Grandfontaine,6792,Peripheral rural municipality (33)
Porrentruy,6800,Urban municipality in a small or non-agglomera...
Vendlincourt,6806,Peripheral rural municipality (33)
Basse-Allaine,6807,Peripheral rural municipality (33)
Clos du Doubs,6808,Peripheral rural municipality (33)
Haute-Ajoie,6809,Peripheral rural municipality (33)
La Baroche,6810,Peripheral rural municipality (33)
Damphreux-Lugnez,6811,Peripheral rural municipality (33)
Basse-Vendline,6812,Peripheral rural municipality (33)


In [14]:
# Mountain-region classification per municipality, indexed by municipality name.
Mountain_region_df = (
    pd.read_excel('../../data/raw_data/régions_de_montagne.xlsx', skiprows=4, skipfooter=11)
    .set_axis(["MunicipalityNumber", "MunicipalityName", "mountain_type"], axis="columns")
    .set_index("MunicipalityName")
)
Mountain_region_df.tail()

Unnamed: 0_level_0,MunicipalityNumber,mountain_type
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1
Lugano: inconnus,5192999,Alpes (4)
Lausanne: inconnus,5586999,Moyen-pays - Rhin supérieur (1)
La Chaux-de-Fonds: inconnus,6421999,Jura (3)
Neuchâtel: inconnus,6458999,Moyen-pays - Rhin supérieur (1)
Genève: inconnus,6621999,Moyen-pays - Rhin supérieur (1)


In [15]:
# Linguistic region per municipality, with the French labels translated to English.
language_df = pd.read_excel('../../data/raw_data/language_repartition.xlsx', skiprows=4, skipfooter=11)
language_df = language_df.set_axis(["MunicipalityNumber", "MunicipalityName", "language"], axis="columns")

# Strip the common French prefix, then map the remaining adjective to English.
language_translation = {
    "allemande": "german",
    "italienne": "italian",
    "française": "french",
    "romanche": "romansh",
}
language_df["language"] = (
    language_df["language"]
    .str.replace('Région linguistique ', '', regex=False)
    .replace(language_translation)
)

language_df = language_df.set_index("MunicipalityName")
language_df.tail()

Unnamed: 0_level_0,MunicipalityNumber,language
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1
Vendlincourt,6806,french
Basse-Allaine,6807,french
Clos du Doubs,6808,french
Haute-Ajoie,6809,french
La Baroche,6810,french


In [16]:
# Energy vote results per municipality: keep canton and yes/no counts under
# English column names, indexed by municipality name.
votation_df = (
    pd.read_excel('../../data/raw_data/votations_energie.xlsx', skiprows=5, skipfooter=6)
    .rename(columns={
        "No commune": "MunicipalityNumber",
        "Commune": "MunicipalityName",
        "Oui": "Yes [nbr]",
        "Non": "No [nbr]",
    })
    [["MunicipalityName", "MunicipalityNumber", "Canton", "Yes [nbr]", "No [nbr]"]]
    .set_index("MunicipalityName")
)
votation_df.tail()

Unnamed: 0_level_0,MunicipalityNumber,Canton,Yes [nbr],No [nbr]
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AG-Ausland-CH,9190.0,Aargau,1847,691
TG-Ausland-CH,9200.0,Thurgau,856,313
VD-CH de l'étranger,9220.0,Vaud,4245,1186
VS-CH de l'étranger,9230.0,Valais / Wallis,1119,325
GE-CH de l'étranger,9250.0,Genève,5800,1510


In [17]:
# Municipal statistics: select a subset of columns, translate their names to
# English, and turn the percentage columns into absolute quantities.
Municipal_stats = pd.read_excel('../../data/raw_data/portrait_communes.xlsx', skiprows=5, skipfooter=16)
# Drop the rows labelled 0 and 1 (presumably non-municipality rows at the top of the sheet — confirm).
Municipal_stats = Municipal_stats.drop([0, 1], axis=0)
print(Municipal_stats.columns)

# Source (French) column -> notebook (English) column. The dict order is the
# resulting column order. Mapping by name rather than by position keeps the
# translation correct even if the selection is edited later.
column_translation = {
    'Commune': 'MunicipalityName',
    'Code commune': 'MunicipalityNumber',
    'Habitants': 'inhabitants',
    '0-19 ans': '0-19 years',
    '20-64 ans': '20-64 years',
    '65 ans ou plus': '65 years or older',
    'Ménages privés': 'Private households',
    "Surfaces d'habitat et d'infrastructure en %": 'Housing and infrastructure area (%)',
    'Emplois total': 'Total employment',
    'Secteur primaire': 'Primary sector',
    'Secteur secondaire': 'Secondary sector',
    'Secteur tertiaire': 'Tertiary sector',
    "Taux d'aide sociale": 'Social assistance rate',
    'PLR 2)': 'PLR',  # Swiss political party: FDP.The Liberals
    'PDC': 'PDC',  # Swiss political party: Christian Democratic People's Party
    'PS': 'PS',   # Swiss political party: Social Democratic Party
    'UDC': 'UDC',  # Swiss political party: Swiss People's Party
    'PEV/PCS': 'PEV/PCS',  # Swiss political party: Evangelical People's Party / Christian Social Party
    'PVL': 'PVL',  # Swiss political party: Green Liberal Party
    'PBD': 'PBD',  # Swiss political party: Conservative Democratic Party
    'PST/Sol.': 'PST/Sol.',  # Swiss political party: Swiss Labor Party / Solidarity
    'PES': 'PES',  # Swiss political party: Green Party of Switzerland
    'Petits partis de droite': 'Small right-wing parties',
}
Municipal_stats = Municipal_stats[list(column_translation)].rename(columns=column_translation)

# Columns that are multiplied by the number of inhabitants below.
percentage_cols = [
    '0-19 years',
    '20-64 years',
    '65 years or older',
    'Housing and infrastructure area (%)',
    'Social assistance rate',
    'PLR',
    'PDC',
    'PS',
    'UDC',
    'PEV/PCS',
    'PVL',
    'PBD',
    'PST/Sol.',
    'PES',
    'Small right-wing parties'
]

# "X" and "*" are treated as missing values. infer_objects() makes the
# object -> numeric conversion explicit instead of relying on replace()'s
# deprecated silent downcasting, so the arithmetic below sees numeric columns.
Municipal_stats = Municipal_stats.replace(["X", "*"], np.nan).infer_objects()

# Scale every percentage column by the number of inhabitants:
# value * inhabitants / 100.
# NOTE(review): the housing/infrastructure column uses a divisor of 1000 instead
# of 100, and it is a share of land area rather than of population — confirm
# that this scaling is intended.
for col in percentage_cols:
    divisor = 1000 if col == 'Housing and infrastructure area (%)' else 100
    Municipal_stats[col] = Municipal_stats[col] * Municipal_stats["inhabitants"] / divisor

# Merge the two younger age groups into a single "0-64 years" column.
Municipal_stats["0-64 years"] = Municipal_stats['0-19 years'] + Municipal_stats['20-64 years']
Municipal_stats = (
    Municipal_stats
    .drop(columns=['0-19 years', '20-64 years'])
    .set_index("MunicipalityName")
)

Index(['Code commune', 'Commune', 'Habitants', 'Variation en %',
       'Densité de la population par km²', 'Etrangers en %', '0-19 ans',
       '20-64 ans', '65 ans ou plus', 'Taux brut de nuptialité',
       'Taux brut de divortialité', 'Taux brut de natalité',
       'Taux brut de mortalité', 'Ménages privés',
       'Taille moyenne des ménages en personnes', 'Surface totale en km² 1)',
       'Surfaces d'habitat et d'infrastructure en %', 'Variation en ha',
       'Surface agricole en %', 'Variation en ha.1', 'Surface boisée en %',
       'Surface improductive en %', 'Emplois total', 'Secteur primaire',
       'Secteur secondaire', 'Secteur tertiaire', 'Etablissements total',
       'Secteur primaire.1', 'Secteur secondaire.1', 'Secteur tertiaire.1',
       'Taux de logements vacants',
       'Nouveaux logements construits pour 1000 habitants',
       'Taux d'aide sociale', 'PLR 2)', 'PDC', 'PS', 'UDC', 'PEV/PCS', 'PVL',
       'PBD', 'PST/Sol.', 'PES', 'Petits partis de droite',
 

  Municipal_stats = Municipal_stats.replace("X", np.nan)
  Municipal_stats = Municipal_stats.replace("*", np.nan)


In [18]:
# Inspect the column names of the prepared municipal statistics table
Municipal_stats.columns

Index(['MunicipalityNumber', 'inhabitants', '65 years or older',
       'Private households', 'Housing and infrastructure area (%)',
       'Total employment', 'Primary sector', 'Secondary sector',
       'Tertiary sector', 'Social assistance rate', 'PLR', 'PDC', 'PS', 'UDC',
       'PEV/PCS', 'PVL', 'PBD', 'PST/Sol.', 'PES', 'Small right-wing parties',
       '0-64 years'],
      dtype='object')

In [19]:
# Load income data, indexed by municipality name.
revenue_df = pd.read_excel('../../data/raw_data/revenu_par_contribuable.xlsx', skiprows=5, skipfooter=12)
revenue_df.columns = ["MunicipalityNumber", "MunicipalityName", "Municipal_rev [CHF]", "revenue_per_capita [CHF]"]
revenue_df = revenue_df.set_index("MunicipalityName")

# Treat "X" and "*" as missing values *before* any arithmetic: multiplying a
# column that still holds such a string would repeat the string instead of
# scaling a number, and the later replacement would no longer match it.
# infer_objects() makes the resulting dtype conversion explicit rather than
# relying on replace()'s deprecated silent downcasting.
revenue_df = revenue_df.replace(["X", "*"], np.nan).infer_objects()

# Scale the municipal revenue by 1,000,000 (presumably the source reports
# millions of CHF — confirm against the spreadsheet header).
revenue_df["Municipal_rev [CHF]"] *= 1000000

revenue_df.head()

  revenue_df = revenue_df.replace("X", np.nan)


Unnamed: 0_level_0,MunicipalityNumber,Municipal_rev [CHF],revenue_per_capita [CHF]
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aeugst am Albis,1,109921000.0,
Affoltern am Albis,2,435719400.0,80080.757214
Bonstetten,3,251102000.0,99960.987261
Hausen am Albis,4,165486900.0,100968.212325
Hedingen,5,177331200.0,106826.024096


In [20]:
# Load energy consumption data and keep the electricity-consumption columns,
# indexed by municipality name.
energy_cons_df = pd.read_csv("../../data/raw_data/energyreporter_municipality_latest.csv")
print(energy_cons_df.columns)

# Columns are selected and renamed by name (not by position) so that the id and
# name columns cannot be mislabelled if the selection changes. The former
# `energy_cons_df.isna().any(axis=1)` line was removed: its result was
# discarded, so it had no effect.
energy_cons_df = (
    energy_cons_df[[
        "bfs_nr",
        "municipality",
        "elec_consumption_households_mwh_per_year_per_capita",
        "elec_consumption_households_mwh_per_year",
        "elec_consumption_mwh_per_year_per_capita",
        "elec_consumption_mwh_per_year",
    ]]
    .rename(columns={"bfs_nr": "MunicipalityNumber", "municipality": "MunicipalityName"})
    .set_index("MunicipalityName")
)
energy_cons_df.head()

Index(['bfs_nr', 'municipality', 'canton', 'bfs_municipality_type_2012_25',
       'electric_car_share', 'electric_car_count',
       'electric_car_share_last_change', 'electric_car_charging_spot_count',
       'electric_cars_per_charging_spot',
       'electric_car_charging_spot_last_change', 'solar_potential_usage',
       'solar_power_installed_kwp', 'solar_potential_usage_last_change',
       'renewable_heating_share', 'renewable_heating_count',
       'non_renewable_heating_count', 'no_heating_count',
       'renewable_heating_share_coverage',
       'renewable_heating_share_last_change',
       'elec_consumption_mwh_per_year_per_capita',
       'elec_consumption_households_mwh_per_year_per_capita',
       'elec_consumption_mwh_per_year',
       'elec_consumption_households_mwh_per_year',
       'elec_consumption_date_from', 'elec_consumption_date_until',
       'renelec_production_mwh_per_year_per_capita',
       'renelec_production_mwh_per_year',
       'renelec_production_water

Unnamed: 0_level_0,MunicipalityNumber,elec_consumption_households_mwh_per_year_per_capita,elec_consumption_households_mwh_per_year,elec_consumption_mwh_per_year_per_capita,elec_consumption_mwh_per_year
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aeugst am Albis,1,2.9119,5811,4.4623,8905
Affoltern am Albis,2,1.6492,21056,4.184,53420
Bonstetten,3,1.7356,9827,2.8006,15857
Hausen am Albis,4,2.3004,9128,3.9176,15545
Hedingen,5,2.1157,8335,5.3155,20941


In [21]:
# Electricity price per municipality: average each operator's total price,
# attach it to the municipalities that operator serves, and save the result.
full_elec_cost_df = pd.read_csv("../../data/raw_data/electricity_prices.csv")

# Mean and standard deviation of the total price per operator
# (the raw column names start with a space).
operator_prices_df = (
    full_elec_cost_df
    .groupby(" operatorLabel")[" total (cts./kWh)"]
    .agg(["mean", "std"])
    .reset_index()
)
operator_prices_df.columns = ["operatorLabel", "mean_price (cts/kWh)", "std_price (cts/kWh)"]

df_mun = pd.read_csv("../../data/raw_data/electricity_municipality.csv")
print(df_mun.columns)

# Default (inner) merge: municipalities whose operator has no price entry are dropped.
merged_df = df_mun.merge(operator_prices_df, left_on='operator', right_on='operatorLabel')
operator_detail_columns = ["website", "operatorPostalCode", "operatorLabel", "operatorAddress", "operator", "canton"]
elec_cost_df = merged_df.drop(columns=operator_detail_columns)

print(len(elec_cost_df["municipalityNumber"].unique()))

# One row per municipality: when a municipality appears several times, only
# its first row is kept.
# NOTE(review): for municipalities served by several operators this keeps one
# operator's price rather than combining them — confirm this is intended.
elec_cost_df = (
    elec_cost_df
    .drop_duplicates(subset=["municipalityNumber"])
    .rename(columns={
        "municipalityNumber": "MunicipalityNumber",
        "municipalityName": "MunicipalityName",
    })
    .set_index("MunicipalityName")
)

elec_cost_df.to_csv("../../data/cleaned_data/mun_cost_elec.csv")
elec_cost_df.head()

Index(['operator', 'website', 'municipalityNumber', 'municipalityName',
       'operatorAddress', 'operatorPostalCode', 'canton'],
      dtype='object')
2079


Unnamed: 0_level_0,MunicipalityNumber,mean_price (cts/kWh),std_price (cts/kWh)
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aeugst am Albis,1,26.231762,2.464933
Affoltern am Albis,2,26.231762,2.464933
Bonstetten,3,26.231762,2.464933
Hausen am Albis,4,26.231762,2.464933
Hedingen,5,26.231762,2.464933


In [22]:
# Buildings data: fold the medium-age and old-age counts into one
# "building_old" column and index by municipality name.
buildings_df = (
    pd.read_csv("../../data/raw_data/buildings_2024.csv")
    .set_index("MunicipalityName")
    .assign(building_old=lambda df: df["building_medium_age"] + df["building_old_age"])
    .drop(columns=["building_medium_age", "building_old_age"])
)
buildings_df.head()

Unnamed: 0_level_0,MunicipalityNumber,building_new_age,building_old
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aeugst am Albis,1,998,6628
Affoltern am Albis,2,2145,24610
Bonstetten,3,1404,13942
Hausen am Albis,4,1325,13395
Hedingen,5,1219,10773


In [23]:
# All covariate dataframes loaded above, in a fixed order ...
covariates_dataframes_list = [
    population_df,
    urbanization_typo_df,
    Mountain_region_df,
    language_df,
    votation_df,
    Municipal_stats,
    revenue_df,
    energy_cons_df,
    elec_cost_df,
    buildings_df,
    individual_houselhold_people_share,
    business_creation,
]
# ... and the label used for each of them (same order) when printing reports
df_names_list = [
    "Pop_mun_df_2024",
    "urbanization_typo_df",
    "Mountain_region_df",
    "language_df",
    "votation_df",
    "Municipal_stats",
    "revenue_df",
    "energy_cons_df",
    "elec_cost_df",
    "buildings_df",
    "individual_houselhold_people_share",
    "business_creation",
]

In [24]:
# For every covariate table, report how many municipality names it does not
# share with the solar-potential table, in both directions.
solar_index = Swiss_solar_potential_df.index
for i, (df_name, cov_df) in enumerate(zip(df_names_list, covariates_dataframes_list)):
    print(i)

    # names present in the covariate table only
    missing_mun = list(cov_df.index.difference(solar_index))
    print(f"number of elements that are in {df_name} and not in Swiss_solar_potential_df: {len(missing_mun)}")

    # names present in the solar-potential table only
    new_mun = list(solar_index.difference(cov_df.index))
    print(f"elements in Swiss_solar_potential_df that aren't in {df_name}: {len(new_mun)}")
    print("*******************************")

0
number of elements that are in Pop_mun_df_2024 and not in Swiss_solar_potential_df: 6
elements in Swiss_solar_potential_df that aren't in Pop_mun_df_2024: 1
*******************************
1
number of elements that are in urbanization_typo_df and not in Swiss_solar_potential_df: 0
elements in Swiss_solar_potential_df that aren't in urbanization_typo_df: 0
*******************************
2
number of elements that are in Mountain_region_df and not in Swiss_solar_potential_df: 1077
elements in Swiss_solar_potential_df that aren't in Mountain_region_df: 128
*******************************
3
number of elements that are in language_df and not in Swiss_solar_potential_df: 19
elements in Swiss_solar_potential_df that aren't in language_df: 5
*******************************
4
number of elements that are in votation_df and not in Swiss_solar_potential_df: 19
elements in Swiss_solar_potential_df that aren't in votation_df: 5
*******************************
5
number of elements that are in Munic

### Scraping a Wikipedia page that contains information on municipality mergers over the years

In [25]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

# Wikipedia page listing former Swiss municipalities and what became of them
url = 'https://en.wikipedia.org/wiki/List_of_former_municipalities_of_Switzerland'

# Fetch the page. The timeout prevents the cell from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# parsing an error page further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the page content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Only the first table with class "wikitable" is used
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    # Without this guard, str(None) would be handed to pandas and fail with an
    # unrelated-looking parsing error.
    raise ValueError(f"No table with class 'wikitable' found at {url}")

# pandas.read_html expects a file-like object, so wrap the table's HTML
table_html = str(table)
table_io = StringIO(table_html)
df_wiki = pd.read_html(table_io)[0]

# Order chronologically, keep the changes from the year 2000 onwards, and save them
df_wiki = df_wiki.sort_values(by="Year", ascending=True)
df_wiki = df_wiki[df_wiki.Year >= 2000]
df_wiki.to_csv("../../data/cleaned_data/mun_change.csv", index=False)

# Spot check: rows whose "Fate" mentions Lugano
df_wiki[df_wiki.Fate.str.contains('Lugano', case=False, na=False)]

Unnamed: 0,Name,Canton[3],Fate,Resulting municipality,Year
303,Cureggia,Ticino,Incorporated into Lugano,Lugano,2004
309,Davesco-Soragno,Ticino,Incorporated into Lugano,Lugano,2004
421,Gandria,Ticino,Incorporated into Lugano,Lugano,2004
909,Pazzallo,Ticino,Incorporated into Lugano,Lugano,2004
140,Breganzona,Ticino,Incorporated into Lugano,Lugano,2004
949,Pregassona,Ticino,Incorporated into Lugano,Lugano,2004
902,Pambio-Noranco,Ticino,Incorporated into Lugano,Lugano,2004
1253,Viganello,Ticino,Incorporated into Lugano,Lugano,2004
1256,Villa Luganese,Ticino,Incorporated into Lugano,Lugano,2008
198,Carabbia,Ticino,Incorporated into Lugano,Lugano,2008


In [26]:
# Build the dictionary of merged municipalities from the Wikipedia table with the
# project helper. Judging by the lookup below, a key is a resulting municipality
# name and its value is a list of former municipality names — confirm against
# create_municipalities_dict.
mun_dict = create_municipalities_dict(df_wiki)
mun_dict["Basse-Vendline"]

['Bonfol', 'Beurnevésin']

In [27]:
# Look up the ids (mun_id) of the municipalities that resulted from a merge.

new_muns = list(mun_dict.keys())
# Match whole municipality names, case-insensitively. Joining the names into a
# regex for str.contains() matched substrings (a short key such as 'Rue' would
# select every name containing "rue") and misread regex metacharacters in
# names (parentheses, dots), so unrelated municipalities ended up in the lookup.
wanted_names = {name.lower() for name in new_muns}
mask = Swiss_solar_potential_df.index.str.lower().isin(wanted_names)
new_mun_ids_dict = Swiss_solar_potential_df.loc[mask].mun_id.to_dict()
new_mun_ids_dict["Basse-Vendline"]

6812

In [28]:
# Accumulator for the cleaned covariate dataframes; the following cells append to it one by one.
all_cleaned_dataframes = []

In [29]:
# Run the project merge helper on the mountain-region table (passing the
# merged-municipalities dictionary and the id lookup) and collect the result.
# NOTE(review): the append is not idempotent — re-running this cell adds the dataframe to the list again.
Mountain_region_2024_df = merge_municipalities_covariates(Mountain_region_df,mun_dict,new_mun_ids_dict)
all_cleaned_dataframes.append(Mountain_region_2024_df)

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': ['Cordast', 'Guschelmuth', 'Liebistorf', 'Wallenbuch']
Numeric sum. Series([], dtype: float64)
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': ['Mannens-Grandsivaz']
Numeric sum. Series([], dtype: float64)
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': ['Auressio', 'Berzona', 'Loco']
Numeric sum. Series([], dtype: float64)
Existing rows for target 'Avry': ['Avry-sur-Matran', 'Corjolens']
Numeric sum. Series([], dtype: float64)
Existing rows for target 'Capriasca': ['Bidogno', 'Cagiallo', 'Corticiasca', 'Lopagno', 'Lugaggia', 'Sala Capriasca', 'Tesserete', 'Vaglio']
Numeric sum. Series([], dtype: float64)
Existing rows for target 'Ursy': ['Bionnens', 'Mossel', 'Vauderens', 'Vuarmarens']
Numeric sum. Series([], dtype: float64)
Existing rows for target 'Villorsonnens': ['Chavannes-sous-Orsonnens', 'Orsonnens

In [30]:
# Spot check: row of the municipality "Basse-Vendline" in the processed mountain-region table
Mountain_region_2024_df.loc["Basse-Vendline"]

MunicipalityNumber                   6812.0
mountain_type         pas de classification
Name: Basse-Vendline, dtype: object

In [31]:
# Run the project merge helper on the population table and collect the result.
# NOTE(review): the append is not idempotent — re-running this cell adds the dataframe to the list again.
population_2024_df = merge_municipalities_covariates(population_df,mun_dict,new_mun_ids_dict)
all_cleaned_dataframes.append(population_2024_df)
# Spot check: the row for "Chur"
population_2024_df[population_2024_df.index=="Chur"]

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

Unnamed: 0_level_0,MunicipalityNumber,Population
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1
Chur,3901.0,38949.0


In [32]:
# Run the project merge helper on the urbanisation-typology table and collect the result.
# NOTE(review): the append is not idempotent — re-running this cell adds the dataframe to the list again.
urbanization_typo_2024_df =  merge_municipalities_covariates(urbanization_typo_df,mun_dict,new_mun_ids_dict)
all_cleaned_dataframes.append(urbanization_typo_2024_df)

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

In [33]:
# Run the project merge helper on the language-region table and collect the result.
# NOTE(review): the append is not idempotent — re-running this cell adds the dataframe to the list again.
language_df_2024_df =  merge_municipalities_covariates(language_df,mun_dict,new_mun_ids_dict)
all_cleaned_dataframes.append(language_df_2024_df)

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

In [34]:
# Run the project merge helper on the vote-results table and collect the result.
# NOTE(review): the append is not idempotent — re-running this cell adds the dataframe to the list again.
votations_2024_df =  merge_municipalities_covariates(votation_df,mun_dict,new_mun_ids_dict)
all_cleaned_dataframes.append(votations_2024_df)

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

In [35]:
# Run the project merge helper on the municipal statistics table and collect the result.
# This is the only call in this section that passes numeric_only=True; its exact
# effect is defined in merge_municipalities_covariates (not shown here).
# NOTE(review): the append is not idempotent — re-running this cell adds the dataframe to the list again.
municipal_stats_2024_df =  merge_municipalities_covariates(Municipal_stats,mun_dict,new_mun_ids_dict, numeric_only=True)
all_cleaned_dataframes.append(municipal_stats_2024_df)

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

In [36]:
# Run the project merge helper on the income table and collect the result.
# NOTE(review): the append is not idempotent — re-running this cell adds the dataframe to the list again.
revenue_2024_df =  merge_municipalities_covariates(revenue_df,mun_dict,new_mun_ids_dict)
all_cleaned_dataframes.append(revenue_2024_df)

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

In [37]:
# The energy-consumption table is copied as-is (no call to the merge helper,
# unlike the other covariates) and collected.
# NOTE(review): the append is not idempotent — re-running this cell adds the dataframe to the list again.
energy_cons_2024_df = energy_cons_df.copy()
all_cleaned_dataframes.append(energy_cons_2024_df)

In [38]:
# Run the project merge helper on the electricity-price table and collect the result.
# NOTE(review): the append is not idempotent — re-running this cell adds the dataframe to the list again.
elec_cost_2024_df = merge_municipalities_covariates(elec_cost_df,mun_dict,new_mun_ids_dict)
all_cleaned_dataframes.append(elec_cost_2024_df)

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

In [39]:
# Run the buildings table through the municipality-merge helper, then register
# the result for the index comparison.
buildings_2024_df = merge_municipalities_covariates(
    buildings_df,
    mun_dict,
    new_mun_ids_dict,
)
all_cleaned_dataframes.extend([buildings_2024_df])

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

In [40]:
# Run the "share of people in individual households" table through the
# municipality-merge helper, then register the result for the index comparison.
individual_houselhold_people_share_2024_df = merge_municipalities_covariates(
    individual_houselhold_people_share,
    mun_dict,
    new_mun_ids_dict,
)
all_cleaned_dataframes.extend([individual_houselhold_people_share_2024_df])

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

In [41]:
# Run the business-creation table through the municipality-merge helper, then
# register the result for the index comparison.
business_creation_2024_df = merge_municipalities_covariates(
    business_creation,
    mun_dict,
    new_mun_ids_dict,
)
all_cleaned_dataframes.extend([business_creation_2024_df])

Existing rows for target 'Guttet-Feschel': []
Existing rows for target 'Gurmels': []
Existing rows for target 'Grolley': []
Existing rows for target 'Grafschaft': []
Existing rows for target 'Montagny': []
Existing rows for target 'Diessenhofen': []
Existing rows for target 'Isorno': []
Existing rows for target 'Avry': []
Existing rows for target 'Capriasca': []
Existing rows for target 'Ursy': []
Existing rows for target 'Villorsonnens': []
Existing rows for target 'Sâles': []
Existing rows for target 'Hauterive': []
Existing rows for target 'La Brillaz': []
Existing rows for target 'Rue': []
Existing rows for target 'Marsens': []
Existing rows for target 'Suraua': []
Existing rows for target 'Cudrefin': []
Existing rows for target 'Zofingen': []
Existing rows for target 'Haut-Intyamon': []
Existing rows for target 'Vuisternens-devant-Romont': []
Existing rows for target 'Le Mouret': []
Existing rows for target 'Pont-en-Ogoz': []
Existing rows for target 'Le Glèbe': []
Existing rows f

In [42]:
# Spot check: show the mountain-region rows whose municipality name contains
# "Basse-Vendline" (case-insensitive; missing names count as no match).
name_matches = Mountain_region_2024_df.index.str.contains("Basse-Vendline", case=False, na=False)
Mountain_region_2024_df.loc[name_matches]

Unnamed: 0_level_0,MunicipalityNumber,mountain_type
MunicipalityName,Unnamed: 1_level_1,Unnamed: 2_level_1
Basse-Vendline,6812.0,pas de classification


In [43]:
# Compare the index of every registered covariate table with the index of the
# solar-potential table, in both directions, and print what does not match.
solar_index = Swiss_solar_potential_df.index
for i, cov_df in enumerate(all_cleaned_dataframes):
    print(i)
    cov_label = df_names_list[i]
    cov_index = cov_df.index

    # Labels present in the covariate table only (count reported, not the labels).
    missing_mun = list(cov_index.difference(solar_index))
    print(f"number of elements that are in {cov_label} and not in Swiss_solar_potential_df: {len(missing_mun)}")

    # Labels present in the solar-potential table only (listed in full).
    new_mun = list(solar_index.difference(cov_index))
    print(f"elements in Swiss_solar_potential_df that aren't in {cov_label}: {new_mun}")
    print(len(new_mun))
    print("*******************************")

0
number of elements that are in Pop_mun_df_2024 and not in Swiss_solar_potential_df: 283
elements in Swiss_solar_potential_df that aren't in Pop_mun_df_2024: ['Baden', 'Erlinsbach (SO)', 'Hauterive (NE)', 'Nesslau', 'Wald (BE)', 'Wangen an der Aare']
6
*******************************
1
number of elements that are in urbanization_typo_df and not in Swiss_solar_potential_df: 2
elements in Swiss_solar_potential_df that aren't in urbanization_typo_df: ['Nesslau']
1
*******************************
2
number of elements that are in Mountain_region_df and not in Swiss_solar_potential_df: 1
elements in Swiss_solar_potential_df that aren't in Mountain_region_df: ['Nesslau']
1
*******************************
3
number of elements that are in language_df and not in Swiss_solar_potential_df: 6
elements in Swiss_solar_potential_df that aren't in language_df: ['Nesslau']
1
*******************************
4
number of elements that are in votation_df and not in Swiss_solar_potential_df: 15
elements in 

Aggregation rules applied when several municipalities are combined into one:

- Age groups: average the percentages, weighted by population.
- Private households: sum.
- Employment: sum.
- Housing: simple (unweighted) mean.
- Social assistance: weight by population.
- Political parties: weight by population.

# Merging dataframes

In [44]:
def _index_by_municipality_number(frame, keep_name=False):
    """Return `frame` re-indexed by its "MunicipalityNumber" column.

    The "MunicipalityName" index level is first moved back to a regular
    column. Unless `keep_name` is true that column is then dropped, so the
    name is not repeated by every merge.
    """
    by_number = frame.reset_index("MunicipalityName").set_index("MunicipalityNumber")
    if keep_name:
        return by_number
    return by_number.drop("MunicipalityName", axis=1)


# Covariate tables joined, in this order, onto the urbanization/mountain base.
remaining_df_to_merge = [
    population_2024_df,
    language_df_2024_df,
    votations_2024_df,
    municipal_stats_2024_df,
    revenue_2024_df,
    energy_cons_2024_df,
    elec_cost_2024_df,
    buildings_2024_df,
    individual_houselhold_people_share_2024_df,
    business_creation_2024_df,
]

# The urbanization typology is the left-most table: it is the only one that
# keeps "MunicipalityName" as a column.
urba_final = _index_by_municipality_number(urbanization_typo_df, keep_name=True)
Mountain_region_final = _index_by_municipality_number(Mountain_region_2024_df)

# Left joins on the MunicipalityNumber index keep every row of the base table.
# NOTE(review): repeated MunicipalityNumber values multiply rows in the result;
# the duplicate-index check that follows this cell deals with them.
Covariates_df = pd.merge(urba_final, Mountain_region_final, left_index=True, right_index=True, how='left')

for df in remaining_df_to_merge:
    df_final = _index_by_municipality_number(df)
    Covariates_df = pd.merge(Covariates_df, df_final, left_index=True, right_index=True, how='left')

Covariates_final_df = Covariates_df.drop("inhabitants", axis=1)
Covariates_final_df.columns

Index(['MunicipalityName', 'urban_typo', 'mountain_type', 'Population',
       'language', 'Canton', 'Yes [nbr]', 'No [nbr]', '65 years or older',
       'Private households', 'Housing and infrastructure area (%)',
       'Total employment', 'Primary sector', 'Secondary sector',
       'Tertiary sector', 'Social assistance rate', 'PLR', 'PDC', 'PS', 'UDC',
       'PEV/PCS', 'PVL', 'PBD', 'PST/Sol.', 'PES', 'Small right-wing parties',
       '0-64 years', 'Municipal_rev [CHF]', 'revenue_per_capita [CHF]',
       'elec_consumption_households_mwh_per_year_per_capita',
       'elec_consumption_households_mwh_per_year',
       'elec_consumption_mwh_per_year_per_capita',
       'elec_consumption_mwh_per_year', 'mean_price (cts/kWh)',
       'std_price (cts/kWh)', 'building_new_age', 'building_old',
       'share of people in individual households', 'new business'],
      dtype='object')

In [45]:
# --- Report: index labels occurring more than once (first occurrence not flagged)
repeated_mask = Covariates_final_df.index.duplicated()
duplicate_indices = Covariates_final_df.index[repeated_mask].tolist()

# All rows carrying one of those labels (kept for inspection, not printed)
rows_with_repeated_label = Covariates_final_df.index.isin(duplicate_indices)
duplicate_rows = Covariates_final_df.loc[rows_with_repeated_label]

print("Duplicate indices:", duplicate_indices)
print("********************************************\n")

# --- Clean: discard rows with a missing index, then keep the first row per label
has_index = Covariates_final_df.index.notna()
Covariates_final_df = Covariates_final_df[has_index]

is_first_occurrence = ~Covariates_final_df.index.duplicated(keep='first')
Covariates_final_df = Covariates_final_df.loc[is_first_occurrence]

# --- Re-compute the same diagnostics on the cleaned frame (not printed)
print("Data after dropping second instance of duplicated indices:")
repeated_mask = Covariates_final_df.index.duplicated()
duplicate_indices = Covariates_final_df.index[repeated_mask].tolist()
rows_with_repeated_label = Covariates_final_df.index.isin(duplicate_indices)
duplicate_rows = Covariates_final_df.loc[rows_with_repeated_label]

print("********************************************\n")

Covariates_final_df.shape

Duplicate indices: [2113.0, 2152.0, 2238.0, 2238.0, 2238.0, 3871.0, 3891.0, 3988.0, 3988.0, 3988.0, 5138.0, 5226.0, 5399.0]
********************************************

Data after dropping second instance of duplicated indices:
********************************************



(2131, 39)

In [46]:
# Final consistency check: re-key the solar-potential table by "mun_id" and
# compare its index with the index of the merged covariates, both ways.
Swiss_solar_potential_nbr_df = Swiss_solar_potential_df.reset_index().set_index("mun_id")

covariate_ids = Covariates_final_df.index
solar_ids = Swiss_solar_potential_nbr_df.index

missing_mun = list(covariate_ids.difference(solar_ids))
print(f"elements that are in Covariates_final_df and not in Swiss_solar_potential_df: {missing_mun}")

new_mun = list(solar_ids.difference(covariate_ids))
print(f"elements in Swiss_solar_potential_df that aren't in Covariates_final_df: {new_mun}")
print("*******************************")

elements that are in Covariates_final_df and not in Swiss_solar_potential_df: []
elements in Swiss_solar_potential_df that aren't in Covariates_final_df: []
*******************************


# FINAL COVARIATE DATAFRAME

In [47]:
# Number of missing values per column before any imputation.
missing_before_imputation = Covariates_final_df.isna().sum()
missing_before_imputation

MunicipalityName                                          0
urban_typo                                                0
mountain_type                                             9
Population                                                1
language                                                  1
Canton                                                    5
Yes [nbr]                                                 5
No [nbr]                                                  5
65 years or older                                         1
Private households                                        1
Housing and infrastructure area (%)                       1
Total employment                                         91
Primary sector                                           34
Secondary sector                                         55
Tertiary sector                                           7
Social assistance rate                                  460
PLR                                     

In [48]:
# Columns deliberately left out of the NaN replacement: the party shares,
# the canton and the three employment sectors.
columns_left_untouched = [
    "PLR", "PDC", "PS", "UDC", "PEV/PCS", "PVL", "PBD", "PST/Sol.", "PES",
    "Small right-wing parties",
    "Canton",
    "Primary sector", "Secondary sector", "Tertiary sector",
]
columns = Covariates_final_df.columns.drop(columns_left_untouched)
print(columns)

# Replace NaN values in the selected columns (strategy depends on column type)
updated_df = replace_nan_based_on_type(Covariates_final_df, columns)

print("\nUpdated DataFrame:")
updated_df.head()


Index(['MunicipalityName', 'urban_typo', 'mountain_type', 'Population',
       'language', 'Yes [nbr]', 'No [nbr]', '65 years or older',
       'Private households', 'Housing and infrastructure area (%)',
       'Total employment', 'Social assistance rate', '0-64 years',
       'Municipal_rev [CHF]', 'revenue_per_capita [CHF]',
       'elec_consumption_households_mwh_per_year_per_capita',
       'elec_consumption_households_mwh_per_year',
       'elec_consumption_mwh_per_year_per_capita',
       'elec_consumption_mwh_per_year', 'mean_price (cts/kWh)',
       'std_price (cts/kWh)', 'building_new_age', 'building_old',
       'share of people in individual households', 'new business'],
      dtype='object')
Replaced NaN in numeric column 'Population' with mean: 4193.719248826291
Replaced NaN in numeric column 'Yes [nbr]' with mean: 634.3786453433678
Replaced NaN in numeric column 'No [nbr]' with mean: 444.9054562558796
Replaced NaN in numeric column '65 years or older' with mean: 748.3004

Unnamed: 0_level_0,MunicipalityName,urban_typo,mountain_type,Population,language,Canton,Yes [nbr],No [nbr],65 years or older,Private households,...,elec_consumption_households_mwh_per_year_per_capita,elec_consumption_households_mwh_per_year,elec_consumption_mwh_per_year_per_capita,elec_consumption_mwh_per_year,mean_price (cts/kWh),std_price (cts/kWh),building_new_age,building_old,share of people in individual households,new business
MunicipalityNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,Aeugst am Albis,Low-density peri-urban municipality (23),Moyen-pays - Rhin supérieur (1),1998.0,german,Zürich,444.0,335.0,357.0,848.0,...,2.9119,5811,4.4623,8905,26.231762,2.464933,998.0,6628.0,0.493,9.0
2.0,Affoltern am Albis,Urban municipality in a large agglomeration (11),Moyen-pays - Rhin supérieur (1),12859.0,german,Zürich,1884.0,1260.0,2235.0,5412.0,...,1.6492,21056,4.184,53420,26.231762,2.464933,2145.0,24610.0,0.209,41.0
3.0,Bonstetten,Medium-density peri-urban municipality (22),Moyen-pays - Rhin supérieur (1),5678.0,german,Zürich,1197.0,706.0,921.0,2327.0,...,1.7356,9827,2.8006,15857,26.231762,2.464933,1404.0,13942.0,0.354,21.0
4.0,Hausen am Albis,Medium-density peri-urban municipality (22),Moyen-pays - Rhin supérieur (1),3974.0,german,Zürich,764.0,555.0,675.0,1552.0,...,2.3004,9128,3.9176,15545,26.231762,2.464933,1325.0,13395.0,0.474,16.0
5.0,Hedingen,Urban municipality in a large agglomeration (11),Moyen-pays - Rhin supérieur (1),3963.0,german,Zürich,809.0,580.0,632.0,1580.0,...,2.1157,8335,5.3155,20941,26.231762,2.464933,1219.0,10773.0,0.432,14.0


In [49]:
# Number of missing values per column after the NaN replacement.
missing_after_imputation = updated_df.isna().sum()
missing_after_imputation

MunicipalityName                                          0
urban_typo                                                0
mountain_type                                             0
Population                                                0
language                                                  0
Canton                                                    5
Yes [nbr]                                                 0
No [nbr]                                                  0
65 years or older                                         0
Private households                                        0
Housing and infrastructure area (%)                       0
Total employment                                          0
Primary sector                                           34
Secondary sector                                         55
Tertiary sector                                           7
Social assistance rate                                    0
PLR                                     

In [50]:
# Distinct canton labels, including NaN if any is still present.
updated_df["Canton"].unique()

array(['Zürich', 'Bern / Berne', nan, 'Luzern', 'Uri', 'Schwyz',
       'Obwalden', 'Nidwalden', 'Glarus', 'Zug', 'Fribourg / Freiburg',
       'Solothurn', 'Basel-Stadt', 'Basel-Landschaft', 'Schaffhausen',
       'Appenzell Ausserrhoden', 'Appenzell Innerrhoden', 'St. Gallen',
       'Graubünden / Grigioni / Grischun', 'Aargau', 'Thurgau', 'Ticino',
       'Vaud', 'Valais / Wallis', 'Neuchâtel', 'Genève', 'Jura'],
      dtype=object)

In [51]:
# Back-fill the missing cantons by municipality name instead of by row position.
# A positional list only works while exactly five rows are missing and appear in
# the expected order; the mapping below is order-independent and can be re-run
# safely once no canton is missing any more.
missing_canton_by_municipality = {
    "Meienried": "Bern / Berne",
    "Hellsau": "Bern / Berne",
    "Deisswil bei Münchenbuchsee": "Bern / Berne",
    "Niedermuhlern": "Bern / Berne",
    "Nesslau": "St. Gallen",
}
canton_is_missing = updated_df["Canton"].isna()
# Municipalities absent from the mapping stay NaN rather than receiving a wrong canton.
updated_df.loc[canton_is_missing, "Canton"] = (
    updated_df.loc[canton_is_missing, "MunicipalityName"].map(missing_canton_by_municipality)
)


In [52]:
# Show the canton now stored for the five municipalities that were back-filled.
mun_check = [
    "Meienried",
    "Hellsau",
    "Deisswil bei Münchenbuchsee",
    "Niedermuhlern",
    "Nesslau",
]
is_checked_municipality = updated_df["MunicipalityName"].isin(mun_check)
updated_df.loc[is_checked_municipality, "Canton"]

MunicipalityNumber
389.0     Bern / Berne
408.0     Bern / Berne
535.0     Bern / Berne
877.0     Bern / Berne
3360.0      St. Gallen
Name: Canton, dtype: object

In [53]:
# Column labels of the covariate frame that is exported below.
exported_columns = Covariates_final_df.columns
exported_columns

Index(['MunicipalityName', 'urban_typo', 'mountain_type', 'Population',
       'language', 'Canton', 'Yes [nbr]', 'No [nbr]', '65 years or older',
       'Private households', 'Housing and infrastructure area (%)',
       'Total employment', 'Primary sector', 'Secondary sector',
       'Tertiary sector', 'Social assistance rate', 'PLR', 'PDC', 'PS', 'UDC',
       'PEV/PCS', 'PVL', 'PBD', 'PST/Sol.', 'PES', 'Small right-wing parties',
       '0-64 years', 'Municipal_rev [CHF]', 'revenue_per_capita [CHF]',
       'elec_consumption_households_mwh_per_year_per_capita',
       'elec_consumption_households_mwh_per_year',
       'elec_consumption_mwh_per_year_per_capita',
       'elec_consumption_mwh_per_year', 'mean_price (cts/kWh)',
       'std_price (cts/kWh)', 'building_new_age', 'building_old',
       'share of people in individual households', 'new business'],
      dtype='object')

In [54]:
# Export the covariates, keeping the MunicipalityNumber index as the first CSV column.
# The former bare `Covariates_final_df.reset_index()` call was removed: its result
# was discarded, so it had no effect on the exported file.
# NOTE(review): the NaN replacement and the canton back-fill were applied to
# `updated_df`. If `replace_nan_based_on_type` returns a copy rather than
# modifying its input, this file does not contain those fixes. Confirm which
# frame is meant to be exported.
Covariates_final_df.to_csv("../../data/cleaned_data/Covariates.csv", index=True, encoding='utf-8')