In [301]:
import pandas as pd
import numpy as np
import re

Merge the different runs into a single file

In [302]:
run1 = pd.read_csv("FullRun_311023.csv")
run2 = pd.read_csv("FullRun_011123.csv")
run3 = pd.read_csv("FullRun_011123_2.csv")
combined_df = pd.concat([run1, run2, run3])

In order to find out which country council is in, we define welsh ones.
If Scotland were to be added, this part of the code would have to change.

In [303]:
Wales = [
"Blaenau Gwent County Borough Council",
"Bridgend County Borough Council",
"Caerphilly County Borough Council",
"Cardiff Council",
"Carmarthenshire County Council",
"Ceredigion County Council",
"Conwy County Borough Council",
"Denbighshire County Council",
"Flintshire County Council",
"Gwynedd County Council",
"Isle of Anglesey County Council",
"Merthyr Tydfil County Borough Council",
"Monmouthshire County Council",
"Neath Port Talbot County Borough Council",
"Newport City Council",
"Pembrokeshire County Council",
"Powys County Council",
"Rhondda Cynon Taf County Borough Council",
"Swansea City and Borough Council",
"Vale of Glamorgan Council",
"Torfaen County Borough Council",
"Wrexham County Borough Council"]
England = list(set([item for item in combined_df['Local Authority'].tolist() if item not in Wales]))

In [304]:
def determine_country(local_authority):
    if local_authority in England:
        return "England"
    elif local_authority in Wales:
        return "Wales"
    else:
        return "error"
    
def extract_website_from_string(input_string):
    # Check for None and return None
    if input_string is None:
        return None
    else:
        # Define a regular expression pattern to match URLs inside brackets
        url_pattern = r'\[.*\]\(([^)]+)\)'
    
        # Search for the URL in the input string
        if isinstance(input_string, str):  # Check if it's a string
            match = re.search(url_pattern, input_string)
            if match:
                # Extract and return the URL
                return match.group(1)
    
    # Return None if no URL is found or if the input is not a string
    return None


In [305]:
combined_df['Scheme ID'] = range(1,len(combined_df['Name of Grant'])+1)
combined_df['Location type'] = 'LA'
combined_df['Postcodes'] = pd.NA
combined_df['Grant URL'] = combined_df['Grant URL'].apply(extract_website_from_string)
country = []

combined_df['Country'] = combined_df["Local Authority"].apply(determine_country)
combined_df_reordered = combined_df[['Scheme ID',
                                     'Grant Type',
                                     'Name of Grant',
                                     'Grant Description',
                                     'Grant URL',
                                     'Measures',
                                     'Location type',
                                     'Country',
                                     'Local Authority',
                                     'Postcodes',
                                     'Start Date',
                                     'End Date',
                                     'Last updated',
                                     'Website',
                                     'Conditions for Eligibility',
                                     'Other Notes']]
print(combined_df_reordered)

    Scheme ID              Grant Type  \
0           1                   Other   
1           2                   Other   
2           3                   Other   
3           4                   Other   
4           5  Home Energy Efficiency   
..        ...                     ...   
65       2960                   Other   
66       2961  Home Energy Efficiency   
67       2962  Home Energy Efficiency   
68       2963  Home Energy Efficiency   
69       2964  Home Energy Efficiency   

                                        Name of Grant  \
0                 Mandatory Disabled Facilities Grant   
1      Discretionary Disabled Facilities Top-Up Grant   
2           Discretionary Disability Relocation Grant   
3           Discretionary Disability Grant Assistance   
4   Green Homes Grant Scheme Local Authority Deliv...   
..                                                ...   
65              Wolverhampton Home Improvement Agency   
66                         Property Improvement Loa

In [306]:
combined_df_reordered.to_csv("measurestacked_df.csv", index=False)

Works the same as the R equivalent of this function. New row is created for each measure.

In [307]:
def separate_longer_delim(df, col, delim):
    split_data = df[col].str.split(delim, expand=True)
    
    # Find rows without the delimiter
    no_delim_mask = ~df[col].str.contains(delim, na=False)

    # Split data for rows with the delimiter
    result = pd.concat([df.drop(col, axis=1), split_data], axis=1).melt(id_vars=df.columns.difference([col]), value_name=col)
    result = result.dropna(subset=[col]).drop(columns=['variable']).sort_index(axis=1)

    # Create a DataFrame for rows without the delimiter
    no_delim_df = df[no_delim_mask].copy()

    # Concatenate the two DataFrames
    result = pd.concat([result, no_delim_df], ignore_index=True)

    return result

In [308]:
print(separate_longer_delim(combined_df_reordered,'Measures',', '))

                             Conditions for Eligibility  Country  \
0     Applications will be assessed on a case-by-cas...  England   
1     Homeowners (owner occupier) or private rental ...  England   
2     Household income of less than £30,000, Home ha...  England   
3     Must be receiving one of several specified ben...  England   
4                                                   NaN  England   
...                                                 ...      ...   
7105                                                NaN  England   
7106  To be eligible, one must currently live in the...  England   
7107  To be eligible, one must currently live in the...  England   
7108  Homeowners must be elderly, disabled, or vulne...  England   
7109  Any resident in Wolverhampton who has had reco...  England   

                                               End Date  \
0                                                   NaN   
1                                                   NaN   
2     

In [309]:
separated_df = separate_longer_delim(combined_df_reordered,'Measures',', ')
separated_df_reorder = separated_df[['Scheme ID',
                                     'Grant Type',
                                     'Name of Grant',
                                     'Grant Description',
                                     'Grant URL',
                                     'Measures',
                                     'Location type',
                                     'Country',
                                     'Local Authority',
                                     'Postcodes',
                                     'Start Date',
                                     'End Date',
                                     'Last updated',
                                     'Website',
                                     'Conditions for Eligibility',
                                     'Other Notes']].sort_values(by='Scheme ID')

Check for DE measures as these have to match perfectly in order for Solstice code to run

In [310]:
DE_measures = [
"Draft proof your external doors",
"Draft proof your windows",
"Install a biomass boiler (wood pellets)",
"Install a gas combi boiler",
"Install a gas condensing boiler",
"Install a ground source heat pump",
"Install a log stove",
"Install a LPG combi boiler",
"Install a LPG condensing boiler",
"Install a new hot water tank",
"Install a solar hot water system",
"Install additional thermostatic controls, warm air systems",
"Install an air source heat pump",
"Install an oil combi boiler",
"Install an oil combi boiler (plus oil storage tank)",
"Install an oil condensing boiler",
"Install an oil condensing boiler (plus oil storage tank)",
"Install A-rated glazing (uPVC)",
"Install cavity wall insulation",
"Install hot water tank insulation",
"Install hot water tank insulation and new controls",
"Install improved hot water controls",
"Install insulation for flat roofing",
"Install loft insulation",
"Install modern storage heaters",
"Install new insulated uPVC external doors",
"Install new radiators and distribution system",
"Install party wall insulation",
"Install room-in roof insulation",
"Install secondary glazing",
"Install solar PV panels",
"Install solid floor insulation",
"Install solid wall insulation",
"Install storage heater Celect type controls",
"Install suspended wooden floor insulation",
"Install thermostatic radiator valves",
"Install underfloor heating",
"Time and temperature zone control",
"Top-up your loft insulation",
"All heating measures",
"All insulation",
"All renewables"]

def is_DE(measure):
    if measure in DE_measures:
        return True
    else:
        return False

grant_types = ["Home Renewables", "Home Energy Efficiency", "Other"]
def is_category(grant_type):
    if grant_type in grant_types:
        return True
    else:
        return False
    

These extra columns are here to facilitate manual checks. This approach should be changed in the future.

In [311]:
separated_df_reorder['Is measure in DE?'] = separated_df_reorder['Measures'].apply(is_DE)
separated_df_reorder['Is grant type OK?'] = separated_df_reorder['Grant Type'].apply(is_category)
separated_df_reorder['Is the record OK overall?'] = separated_df_reorder['Is measure in DE?'] & separated_df_reorder['Is grant type OK?']

In [312]:
separated_df_reorder.to_csv("combined_df.csv", index=False)