In [2]:
import pandas as pd
import numpy as np

from os.path import join, isfile
from os import path, scandir, listdir

In [3]:
def list_all_files(location='../input/', pattern=None, recursive=True):
    """
    This function returns a list of files at a given location (including subfolders)
    
    - location: path to the directory to be searched
    - pattern: part of the file name to be searched (ex. pattern='.csv' would return all the csv files)
    - recursive: boolean, if True the function calls itself for every subdirectory it finds
    """
    subdirectories= [f.path for f in scandir(location) if f.is_dir()]
    files = [join(location, f) for f in listdir(location) if isfile(join(location, f))]
    if recursive:
        for directory in subdirectories:
            files.extend(list_all_files(directory))
    if pattern:
        files = [f for f in files if pattern in f]
    return files

# Enexis

In [4]:
enexis = list_all_files('raw_data/', pattern='Enexis')
enexis

['raw_data/Enexis_kleinverbruiksgegevens_01012013.csv',
 'raw_data/Enexis_kleinverbruiksgegevens_01012016.csv',
 'raw_data/Enexis_kleinverbruiksgegevens_01012018.csv',
 'raw_data/Enexis_kleinverbruiksgegevens_01012015.csv',
 'raw_data/Enexis_kleinverbruiksgegevens_01012012.csv',
 'raw_data/Enexis_kleinverbruiksgegevens_01012017.csv',
 'raw_data/Enexis_kleinverbruiksgegevens_01012011.csv',
 'raw_data/Enexis_kleinverbruiksgegevens_01012010.csv',
 'raw_data/Enexis_kleinverbruiksgegevens_01012014.csv']

In [15]:
renaming = {'NETBEHEERDER' : 'net_manager', 'NETGEBIED': 'purchase_area', 
            'LEVERINGSRICHTING_PERC': 'delivery_perc', 
            'AANSLUITINGEN_AANTAL': 'num_connections', 
           'SOORT_AANSLUITING_PERC': 'type_conn_perc', 'SJV_GEMIDDELD' : 'annual_consume',
           'SJV_LAAG_TARIEF_PERC' : 'annual_consume_lowtarif_perc',
           'SLIMME_METER_PERC' : 'smartmeter_perc',
           'STRAATNAAM': "street",
           'POSTCODE_VAN': 'zipcode_from',
           'POSTCODE_TOT': 'zipcode_to',
           'WOONPLAATS': 'city',
           'PRODUCTSOORT': 'type_of_product',
           'SOORT_AANSLUITING': 'type_of_connection',
           'FYSIEKE_STATUS_PERC': 'perc_of_active_connections'}

In [16]:
for file in enexis:
    yr = file.split('_')[-1]
    print(yr)
    if '2017' in file:
        df = pd.read_csv(file, encoding = "ISO-8859-1", sep = ";", thousands='.')
    else:
        df = pd.read_csv(file, encoding = "ISO-8859-1", sep = ";")
    
    try:
        df['SJV_GEMIDDELD'] = df['SJV_GEMIDDELD'].str.replace(",", ".")
        df['LEVERINGSRICHTING_PERC'] = df['LEVERINGSRICHTING_PERC'].str.replace(",", ".")
        df['FYSIEKE_STATUS_PERC'] = df['FYSIEKE_STATUS_PERC'].str.replace(",", ".")
        df['SJV_LAAG_TARIEF_PERC'] = df['SJV_LAAG_TARIEF_PERC'].str.replace(",", ".")
        df['SLIMME_METER_PERC'] = df['SLIMME_METER_PERC'].str.replace(",", ".")
    except AttributeError as e: # some of them are already numeric
        print(e)
        pass
    
    if '2018' in file:
        df['SJV_GEMIDDELD'] = df['SJV_GEMIDDELD'].str.replace(",", ".")
        df['LEVERINGSRICHTING_PERC'] = df['LEVERINGSRICHTING_PERC'].str.replace(",", ".")
        df['SJV_LAAG_TARIEF_PERC'] = df['SJV_LAAG_TARIEF_PERC'].str.replace(",", ".")
        df['SLIMME_METER_PERC'] = df['SLIMME_METER_PERC'].str.replace(",", ".")
    
    df['LEVERINGSRICHTING_PERC'] = pd.to_numeric(df['LEVERINGSRICHTING_PERC'])
    df['FYSIEKE_STATUS_PERC'] = pd.to_numeric(df['FYSIEKE_STATUS_PERC'])
    df['SJV_LAAG_TARIEF_PERC'] = pd.to_numeric(df['SJV_LAAG_TARIEF_PERC'])
    df['SLIMME_METER_PERC'] = pd.to_numeric(df['SLIMME_METER_PERC'])
    df['SJV_GEMIDDELD'] = pd.to_numeric(df['SJV_GEMIDDELD'])
    
    df.POSTCODE_VAN = df.POSTCODE_VAN.str.replace(" ", "")
    df.POSTCODE_TOT = df.POSTCODE_TOT.str.replace(" ", "")
    
    df = df.rename(columns=renaming)
    
    del df['LANDCODE'] # always NL
    del df['VERBRUIKSSEGMENT'] # always KVB
    df_el = df[df.type_of_product == 'ELK'].copy()
    df_gas = df[df.type_of_product == 'GAS'].copy()
    del df_el['type_of_product']
    del df_gas['type_of_product']
    # del df_gas['smartmeter_perc'] # all null
    print(df_el.shape)
    print(df_gas.shape)
    df_el.to_csv("cleaned_data/Electricity/enexis_" + "electricity_" + yr, encoding='utf-8', index=False)
    df_gas.to_csv("cleaned_data/Gas/enexis_" + "gas_" + yr, encoding='utf-8', index=False)

01012013.csv
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(113248, 14)
(94423, 14)
01012016.csv
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(110577, 14)
(84429, 14)
01012018.csv
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(124163, 14)
(110407, 14)
01012015.csv
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(113989, 14)
(94945, 14)
01012012.csv
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(109808, 14)
(91509, 14)
01012017.csv
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(115576, 14)
(102661, 14)
01012011.csv
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(108299, 14)
(83518, 14)
01012010.csv
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(107512, 14)
(82966, 14)
01012014.csv
Can only 

# Liander

In [17]:
liander = list_all_files('raw_data/', pattern='Liander')
liander

['raw_data/Liander_kleinverbruiksgegevens_01012010.csv',
 'raw_data/Liander_kleinverbruiksgegevens_01012015.csv',
 'raw_data/LianderKV01012018.xlsx',
 'raw_data/Liander_kleinverbruiksgegevens_01012017.xlsx',
 'raw_data/Liander_kleinverbruiksgegevens_01012009.csv',
 'raw_data/Liander_kleinverbruiksgegevens_01012012.csv',
 'raw_data/Liander_kleinverbruiksgegevens_01012011.csv',
 'raw_data/Liander_kleinverbruiksgegevens_01012016.csv',
 'raw_data/Liander_kleinverbruiksgegevens_01012014.csv',
 'raw_data/Liander_kleinverbruiksgegevens_01012013.csv']

In [18]:
li_rename = {'%Leveringsrichting': 'LEVERINGSRICHTING_PERC',
             'Aantal Aansluitingen': 'AANSLUITINGEN_AANTAL',
             '%Fysieke status': 'FYSIEKE_STATUS_PERC',
             '%Soort aansluiting':'SOORT_AANSLUITING_PERC',
             'Soort aansluiting Naam': 'SOORT_AANSLUITING',
             'SJV': 'SJV_GEMIDDELD',
             '%SJV laag tarief': 'SJV_LAAG_TARIEF_PERC',
             '%Slimme Meter' : 'SLIMME_METER_PERC'}

In [19]:
for file in liander:
    yr = file.split('_')[-1].split('/')[-1]
    print(yr)
    if file.endswith('csv'):
        if '2010.csv' in file:
            df = pd.read_csv(file, encoding = "ISO-8859-1", sep=';')
        else:
            df = pd.read_csv(file, encoding = "ISO-8859-1", sep='\t', low_memory=False)
    else:
        df = pd.read_excel(file)
        yr = yr.split('.')[0] + '.csv'
        if 'Liander' in yr:
            yr = yr.split('KV')[-1]
    df = df.rename(columns=li_rename)
    
    try:
        df['LEVERINGSRICHTING_PERC'] = df['LEVERINGSRICHTING_PERC'].str.replace(",", ".")
        df['FYSIEKE_STATUS_PERC'] = df['FYSIEKE_STATUS_PERC'].str.replace(",", ".")
        df['SJV_LAAG_TARIEF_PERC'] = df['SJV_LAAG_TARIEF_PERC'].str.replace(",", ".")
        df['SLIMME_METER_PERC'] = df['SLIMME_METER_PERC'].str.replace(",", ".")
        df['Gemiddeld aantal telwielen'] = df['Gemiddeld aantal telwielen'].str.replace(",", ".")
    except AttributeError as e: # some of them are already numeric
        print(e)
        pass
    
    df['LEVERINGSRICHTING_PERC'] = pd.to_numeric(df['LEVERINGSRICHTING_PERC'])
    df['FYSIEKE_STATUS_PERC'] = pd.to_numeric(df['FYSIEKE_STATUS_PERC'])
    df['SJV_LAAG_TARIEF_PERC'] = pd.to_numeric(df['SJV_LAAG_TARIEF_PERC'])
    df['Gemiddeld aantal telwielen'] = pd.to_numeric(df['Gemiddeld aantal telwielen'])
    df['SLIMME_METER_PERC'] = pd.to_numeric(df['SLIMME_METER_PERC'])
    
    df.POSTCODE_VAN = df.POSTCODE_VAN.str.replace(" ", "")
    df.POSTCODE_TOT = df.POSTCODE_TOT.str.replace(" ", "")
    
    df = df.rename(columns=renaming)
    
    del df['%Defintieve aansl (NRM)'] # always 100
    del df['LANDCODE'] # always NL
    del df['VERBRUIKSSEGMENT'] # always KVB
    del df['MEETVERANTWOORDELIJKE'] # always Liander
    del df['Gemiddeld aantal telwielen'] # not clear
    df_el = df[df.type_of_product == 'ELK'].copy()
    df_gas = df[df.type_of_product == 'GAS'].copy()
    del df_el['type_of_product']
    del df_gas['type_of_product']
    print(df_el.shape)
    print(df_gas.shape)
    df_el.to_csv("cleaned_data/Electricity/liander_" + "electricity_" + yr, encoding='utf-8', index=False)
    df_gas.to_csv("cleaned_data/Gas/liander_" + "gas_" + yr, encoding='utf-8', index=False)

01012010.csv
(133554, 14)
(101398, 14)
01012015.csv
(136238, 14)
(106015, 14)
LianderKV01012018.xlsx
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(142265, 14)
(118199, 14)
01012017.xlsx
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(141484, 14)
(117735, 14)
01012009.csv
(132655, 14)
(100839, 14)
01012012.csv
(135181, 14)
(104936, 14)
01012011.csv
(134371, 14)
(104509, 14)
01012016.csv
(140639, 14)
(117250, 14)
01012014.csv
Can only use .str accessor with string values, which use np.object_ dtype in pandas
(135737, 14)
(105671, 14)
01012013.csv
(136040, 14)
(105332, 14)


# Stedin

In [20]:
stedin = list_all_files('raw_data/', pattern='Open')
stedin

['raw_data/20160119_OpenData_KV_Verbruiksdata_2013.csv',
 'raw_data/20160114_OpenData_KV_Verbruiksdata_2011_Final.csv',
 'raw_data/20160114_OpenData_KV_Verbruiksdata_2010_Final.csv',
 'raw_data/20160114_OpenData_KV_Verbruiksdata_2009_Final.csv',
 'raw_data/20180618_OpenData_KV_Verbruiksdata_2017.CSV',
 'raw_data/20160114_OpenData_KV_Verbruiksdata_2012_Final.csv',
 'raw_data/20160119_OpenData_KV_Verbruiksdata_2014.csv',
 'raw_data/20180129_OpenData_KV_Verbruiksdata_2018.csv',
 'raw_data/20160119_OpenData_KV_Verbruiksdata_2015.csv',
 'raw_data/20160119_OpenData_KV_Verbruiksdata_2016.csv']

In [21]:
for file in stedin:
    yr = file.split('_Verbruiksdata_')[-1]
    if 'Final' in yr:
            yr = yr.replace('_Final', '')
    yr = yr.lower()
    print(yr)
    df = pd.read_csv(file, encoding = "ISO-8859-1", sep='\t', low_memory=False)
    
    try:
        df['LEVERINGSRICHTING_PERC'] = df['LEVERINGSRICHTING_PERC'].str.replace(",", ".")
        df['FYSIEKE_STATUS_PERC'] = df['FYSIEKE_STATUS_PERC'].str.replace(",", ".")
        df['SJV_LAAG_TARIEF_PERC'] = df['SJV_LAAG_TARIEF_PERC'].str.replace(",", ".")
        df['SLIMME_METER_PERC'] = df['SLIMME_METER_PERC'].str.replace(",", ".")
    except AttributeError as e: # some of them are already numeric
        print(e)
        pass
    
    df['LEVERINGSRICHTING_PERC'] = pd.to_numeric(df['LEVERINGSRICHTING_PERC'])
    df['FYSIEKE_STATUS_PERC'] = pd.to_numeric(df['FYSIEKE_STATUS_PERC'])
    df['SJV_LAAG_TARIEF_PERC'] = pd.to_numeric(df['SJV_LAAG_TARIEF_PERC'])
    df['SLIMME_METER_PERC'] = pd.to_numeric(df['SLIMME_METER_PERC'])
    
    df.POSTCODE_VAN = df.POSTCODE_VAN.str.replace(" ", "")
    df.POSTCODE_TOT = df.POSTCODE_TOT.str.replace(" ", "")
    
    df = df.rename(columns=renaming)
    
    del df['LANDCODE'] # always NL
    del df['VERBRUIKSSEGMENT'] # always KVB
    
    df_el = df[df.type_of_product == 'ELK'].copy()
    df_gas = df[df.type_of_product == 'GAS'].copy()
    del df_el['type_of_product']
    del df_gas['type_of_product']
    print(df_el.shape)
    print(df_gas.shape)
    
    df_el.to_csv("cleaned_data/Electricity/stedin_" + "electricity_" + yr, encoding='utf-8', index=False)
    df_gas.to_csv("cleaned_data/Gas/stedin_" + "gas_" + yr, encoding='utf-8', index=False)

2013.csv
(82687, 14)
(85083, 14)
2011.csv
(82549, 14)
(84533, 14)
2010.csv
(82074, 14)
(84225, 14)
2009.csv
(81514, 14)
(83875, 14)
2017.csv
(84190, 14)
(86221, 14)
2012.csv
(83000, 14)
(84869, 14)
2014.csv
(83004, 14)
(85313, 14)
2018.csv
(84695, 14)
(85841, 14)
2015.csv
(83388, 14)
(85573, 14)
2016.csv
(83791, 14)
(85904, 14)
