Note: 2017 was the first year in which PKWs had a separate "Elektro" category. Before that, there was only a "sonstige" category. (Years from 2017 onward have both categories.)

Data from https://www.kba.de/DE/Statistik/Produktkatalog/produkte/Fahrzeuge/fz1_b_uebersicht.html

In [5]:
import pandas as pd

## Making dataframes from the years where we have a CSV

In [116]:
# 1-based positions of the columns we keep, counted the way the CSV files list them.
csv_col_list = [
    1, 7, 19, 21, 36, 37, 40, 41,
    44, 45, 46, 47, 48, 49, 50, 51, 52,
]
# The same positions shifted down by one to match Python's 0-based numbering.
py_col_list = [position - 1 for position in csv_col_list]

#function to import a CSV file for a year into a dataframe
def CSVtoPD(year):
    """Load the formatted CSV for ``year`` and return it as a DataFrame.

    Reads ``data/<year> formatted.csv`` keeping only the columns whose
    0-based positions are listed in the module-level ``py_col_list``.
    The row with index label 0 is dropped and the ``'Unnamed: 0'`` column
    is renamed to ``'Place'``.
    """
    csv_path = f"data/{year!s} formatted.csv"

    # keep only the wanted columns, then discard the row labelled 0
    frame = pd.read_csv(csv_path, usecols=py_col_list).drop(index=[0])

    # give the header-less first column a meaningful name
    return frame.rename(columns={'Unnamed: 0': 'Place'})

In [118]:
# Build one DataFrame per year for which a formatted CSV is loaded (2013-2017).
df2013 = CSVtoPD(2013)
df2014 = CSVtoPD(2014)
df2015 = CSVtoPD(2015)
df2016 = CSVtoPD(2016)
df2017 = CSVtoPD(2017)

## Making dataframes from the years where we have a .xlsx

In [129]:
# Identifiers the spreadsheet uses for our 10 cities, joined with "|" so the
# resulting string can be used directly as an "any of these" match pattern.
cities = "|".join((
    "11000",
    "02000",
    "09162",
    "05315",
    "06412",
    "08111",
    "05111",
    "14713",
    "05913",
    "05113",
))

#function to turn an .XLSX file for year into a df
def ExceltoDF(year):
    """Build a DataFrame for our cities from the .xlsx workbook of ``year``.

    Reads sheets ``FZ1.1`` and ``FZ1.2`` of ``data/fz1_<year>.xlsx``, places
    them side by side, keeps only the rows whose
    ``'Statistische Kennziffer und Zulassungsbezirk'`` value matches the
    module-level ``cities`` pattern, and returns a fixed selection of 17
    columns with clearer names.

    Args:
        year: Year used to build the workbook file name.

    Returns:
        pandas.DataFrame containing the filtered rows and renamed columns.
    """
    #generate filepath for the relevant year
    filepath = "data/fz1_" + str(year) + ".xlsx"

    def read_sheet(sheet_name):
        #skip the first 8 rows and every column without a header name
        sheet = pd.read_excel(filepath, sheet_name=sheet_name, skiprows=8,
                              usecols=lambda name: 'Unnamed' not in name)
        #fill the merged cells with the corresponding value
        #(DataFrame.ffill() replaces the deprecated fillna(method='ffill'))
        return sheet.ffill()

    #remove the trailing rows: they don't have relevant data and are in a different format.
    #The code trims 6 rows from sheet FZ1.1 and 5 rows from sheet FZ1.2.
    #NOTE(review): the original comment said "final 5 rows" for the first sheet - confirm 6 is intended.
    df1 = read_sheet('FZ1.1')[:-6]

    #the Excel formatting makes it hard to get the columns before Benzin but they're not needed
    df2 = read_sheet('FZ1.2')[:-5]

    #combines the dataframes from each of the sheets (aligned on the row index)
    df = pd.concat([df1, df2], axis=1)

    #filters the dataframe to only show rows with our 10 cities
    df = df.dropna(subset=['Statistische Kennziffer und Zulassungsbezirk'])
    df = df.loc[df['Statistische Kennziffer und Zulassungsbezirk'].str.contains(cities)]

    #Berlin and Hamburg are cities and Länder so this line is needed to stop them from being duplicated in the df
    #na=True makes rows with a missing "Land" count as matches, so they are dropped exactly as "== False" did
    df = df.loc[~df["Land"].str.contains('INSGESAMT', na=True)]

    #remove unnecessary columns (selection is by position, so it depends on the sheet layout)
    df = df.iloc[:, [2, 8, 19, 20, 33, 34, 35, 36, 40, 42, 43, 44, 45, 46, 47, 50, 51]]

    #renames the columns
    df.columns = df.columns.str.replace('Euro', 'PKW Emissionsgruppe Euro')
    df = df.rename(columns={
        'insgesamt.1': 'PKW insgesamt',
        'PKW-Dichte\nje 1.000 \nEinwohner': 'PKW Dichte',
        'insgesamt.2': 'LKW insgesamt',
        'insgesamt.4': 'KFZ insgesamt',
        'Kfz-Dichte\nje 1.000 \nEinwohner': 'KFZ Dichte',
        'sonstige.1': 'PKW Emissionsgruppe sonstige',
        'schadstoff-\nreduzierte \ninsgesamt': 'PKW Emissionsgruppe schadstoffreduzierte insgesamt',
    })

    return df

In [130]:
# Build one DataFrame per year from the .xlsx workbooks (2019-2022).
# 2018 is skipped: ExceltoDF(2018) does not work and the cause is not yet understood.
# NOTE(review): possibly the 2018 workbook has a different sheet/column layout - TODO confirm.
#df2018 = ExceltoDF(2018)
df2019 = ExceltoDF(2019)
df2020 = ExceltoDF(2020)
df2021 = ExceltoDF(2021)
df2022 = ExceltoDF(2022)