In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

# Load the semicolon-separated building table and coerce the column dtypes.
file = "Buildings_LST_final.csv"
df = pd.read_csv(file, sep=";")

# These columns arrive as strings written with a decimal comma, so the comma
# is swapped for a decimal point before casting to float.
for decimal_comma_column in (
    "geometry_area_m2",
    "LST_Celsius_13",
    "LST_Celsius_12",
    "LST_Celsius_mean",
):
    df[decimal_comma_column] = (
        df[decimal_comma_column].str.replace(",", ".").astype(float)
    )

# Both efficiency columns are cast to integers.
for integer_column in ("CURRENT_ENERGY_EFFICIENCY", "ROOF_ENERGY_EFF_SCORE"):
    df[integer_column] = df[integer_column].astype(int)

# define function which calculates regression and creates plot
def ols_regression_and_plot(df, LST, ENERGY_EFFICIENCY, name_energy_effiency, LST_title, Effiency_title, name_fig, ax_limit=None, Filter_title=None, Raeumliche_Ebene=None, ):
    """Fit a simple OLS regression of an efficiency column on an LST column and plot it.

    The regression summary (intercept, slope, Pearson r, r², p-value of the
    slope and number of observations) is printed as a one-row DataFrame. A
    scatter plot with the fitted line and a 90 % confidence band is then
    drawn, saved to ``name_fig`` and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns named by ``LST`` and ``ENERGY_EFFICIENCY``.
    LST : str
        Name of the explanatory column (plotted on the x-axis).
    ENERGY_EFFICIENCY : str
        Name of the dependent column (plotted on the y-axis).
    name_energy_effiency : str
        Label of the y-axis.
    LST_title : str
        Text shown after "LST:" in the plot title.
    Effiency_title : str
        Text shown after "Energieeffizienz:" in the plot title.
    name_fig : str
        Path the figure is written to.
    ax_limit : float, optional
        Upper limit of the y-axis (the lower limit is then fixed at 0).
        When omitted (or falsy) the automatic limits are kept.
    Filter_title : str, optional
        Description of an applied filter; adds a "Filterung:" line and the
        object count to the title. Takes precedence over ``Raeumliche_Ebene``.
    Raeumliche_Ebene : str, optional
        Description of the spatial aggregation level; adds a
        "Räumliche Ebene:" line and the object count to the title.

    Returns
    -------
    None
    """
    # calculate linear regression (ordinary least square regression = ols).
    # missing="drop" discards rows with NaN in either column so the fit uses
    # the same observations that seaborn and Series.corr silently keep.
    X = sm.add_constant(df[LST])
    ols_mod = sm.OLS(df[ENERGY_EFFICIENCY], X, missing="drop").fit()

    r_squared = ols_mod.rsquared
    n_obs = int(ols_mod.nobs)  # observations actually used in the fit

    reg = pd.DataFrame([{
        "intercept": ols_mod.params["const"],
        "slope": ols_mod.params[LST],
        "pearson_r": df[LST].corr(df[ENERGY_EFFICIENCY]),
        "r_squared": r_squared,
        "p_value_slope": ols_mod.pvalues[LST],
        "count": n_obs,
    }])
    print(reg)

    # plot results: scatter of the raw points ...
    g = sns.relplot(
        data=df,
        x=LST,
        y=ENERGY_EFFICIENCY,
        marker="o",
        color="#008B8B",
        facet_kws=dict(sharex=False, sharey=False),
        s=6,
    )

    # ... overlaid with the regression line on the same axes
    ax = g.ax
    sns.regplot(
        data=df,
        x=LST,
        y=ENERGY_EFFICIENCY,
        scatter=False,
        line_kws={"color": "red"},
        ci=90,  # 90 % confidence interval (explorative analyses)
        ax=ax,
    )

    g.set_axis_labels("Land Surface Temperature (LST) in °C", name_energy_effiency, fontweight='bold')
    if ax_limit:
        ax.set_ylim(0, ax_limit)

    # Assemble the title line by line; the optional context line (filter or
    # spatial level) also switches on the object count at the end.
    title_lines = [f"LST: {LST_title}", f"Energieeffizienz: {Effiency_title}"]
    if Filter_title:
        title_lines.append(f"Filterung: {Filter_title}")
    elif Raeumliche_Ebene:
        title_lines.append(f"Räumliche Ebene: {Raeumliche_Ebene}")
    title_lines.extend(["", f"r²: {r_squared:.3f}"])
    if Filter_title or Raeumliche_Ebene:
        title_lines.append(f"Objektanzahl: {n_obs}")
    ax.set_title("\n".join(title_lines), loc="left", fontweight='bold')

    # Save through the explicit figure handle instead of the implicit
    # "current figure", then release it so repeated calls do not pile up
    # open figures.
    fig = ax.figure
    fig.savefig(name_fig, bbox_inches='tight')
    plt.show()
    plt.close(fig)
    
# Run the regression for every LST variant, first against the building
# energy efficiency and then against the roof energy efficiency.
# Each row: (LST column, efficiency column, y-axis label, LST title,
#            efficiency title, output file, upper y-axis limit)
full_table_runs = (
    ("LST_Celsius_12", "CURRENT_ENERGY_EFFICIENCY", "Energieeffizienz der Gebäude", "Werte vom 12.11.2022", "Gebäude (1 bis 100)", "Reg_Geb_Eff_LST12.png", 110),
    ("LST_Celsius_13", "CURRENT_ENERGY_EFFICIENCY", "Energieeffizienz der Gebäude", "Werte vom 13.11.2022", "Gebäude (1 bis 100)", "Reg_Geb_Eff_LST13.png", 110),
    ("LST_Celsius_mean", "CURRENT_ENERGY_EFFICIENCY", "Energieeffizienz der Gebäude", "Mittelwert beider Tage", "Gebäude (1 bis 100)", "Reg_Geb_Eff_LSTmean.png", 110),
    ("LST_Celsius_12", "ROOF_ENERGY_EFF_SCORE", "Energieeffizienz der Dächer", "Werte vom 12.11.2022", "Dächer (1 bis 5)", "Reg_Dach_Eff_LST12.png", 6),
    ("LST_Celsius_13", "ROOF_ENERGY_EFF_SCORE", "Energieeffizienz der Dächer", "Werte vom 13.11.2022", "Dächer (1 bis 5)", "Reg_Dach_Eff_LST13.png", 6),
    ("LST_Celsius_mean", "ROOF_ENERGY_EFF_SCORE", "Energieeffizienz der Dächer", "Mittelwert beider Tage", "Dächer (1 bis 5)", "Reg_Dach_Eff_LSTmean.png", 6),
)
for run_index, (lst_column, eff_column, y_label, lst_label, eff_label, figure_file, y_max) in enumerate(full_table_runs):
    if run_index:
        print()  # blank line between consecutive regression tables
    ols_regression_and_plot(df, lst_column, eff_column, y_label, lst_label, eff_label, figure_file, ax_limit=y_max)

# Build subsets that keep only buildings above a minimum area (in m²).
building_area = df["geometry_area_m2"]
df_area_50 = df[building_area > 50]
df_area_60 = df[building_area > 60]
df_area_70 = df[building_area > 70]
df_area_80 = df[building_area > 80]
df_area_100 = df[building_area > 100]
df_area_120 = df[building_area > 120]

# Roof-efficiency regression against the mean LST for each filtered subset.
# Each row: (subset, output file, filter description shown in the title)
area_filter_runs = (
    (df_area_50, "Reg_Flaeche050_Dach_Eff_LSTmean.png", "Gebäudefläche > 50 m²"),
    (df_area_60, "Reg_Flaeche060_Dach_Eff_LSTmean.png", "Gebäudefläche > 60 m²"),
    (df_area_70, "Reg_Flaeche070_Dach_Eff_LSTmean.png", "Gebäudefläche > 70 m²"),
    (df_area_80, "Reg_Flaeche080_Dach_Eff_LSTmean.png", "Gebäudefläche > 80 m²"),
    (df_area_100, "Reg_Flaeche100_Dach_Eff_LSTmean.png", "Gebäudefläche > 100 m²"),
    (df_area_120, "Reg_Flaeche120_Dach_Eff_LSTmean.png", "Gebäudefläche > 120 m²"),
)
for area_subset, figure_file, filter_label in area_filter_runs:
    ols_regression_and_plot(area_subset, "LST_Celsius_mean", "ROOF_ENERGY_EFF_SCORE", "Energieeffizienz der Dächer", "Mittelwert beider Tage", "Dächer (1 bis 5)", figure_file, ax_limit=6, Filter_title=filter_label)
    print()  # blank line after every regression table

# Average the LST and both energy efficiency values per spatial unit, for
# several spatial levels.
def _mean_per_spatial_unit(frame, unit_column, min_count):
    """Return per-unit means of LST and efficiencies for sufficiently populated units.

    ``frame`` is grouped by ``unit_column``; each group yields the mean of
    ``LST_Celsius_mean``, ``ROOF_ENERGY_EFF_SCORE`` and
    ``CURRENT_ENERGY_EFFICIENCY`` plus the group size as ``count``. Only
    groups with strictly more than ``min_count`` rows are returned.
    """
    per_unit = frame.groupby(unit_column).agg(
        LST_grouped=("LST_Celsius_mean", "mean"),
        ROOF_ENERGY_EFFICIENCY_grouped=("ROOF_ENERGY_EFF_SCORE", "mean"),
        ENERGY_EFFICIENCY_grouped=("CURRENT_ENERGY_EFFICIENCY", "mean"),
        count=(unit_column, "size"),
    ).reset_index()
    return per_unit[per_unit["count"] > min_count]


# The minimum group size grows with the size of the spatial unit.
df_grouped_100 = _mean_per_spatial_unit(df, "GRID_ID_100", 3)
df_grouped_200 = _mean_per_spatial_unit(df, "GRID_ID_200", 5)
df_grouped_300 = _mean_per_spatial_unit(df, "GRID_ID_300", 10)
df_grouped_500 = _mean_per_spatial_unit(df, "GRID_ID_500", 20)
df_grouped_1026 = _mean_per_spatial_unit(df, "GRID_ID_1026", 20)
df_grouped_ward = _mean_per_spatial_unit(df, "Ward_Entity", 20)

# Roof-efficiency regression against the averaged LST for every spatial level.
# Each row: (aggregated table, output file, spatial level shown in the title)
spatial_level_runs = (
    (df_grouped_100, "Reg_Raeumlich0100_Dach_Eff_LSTmean.png", "Hexagonraster (100 m Seitenlänge)"),
    (df_grouped_200, "Reg_Raeumlich0200_Dach_Eff_LSTmean.png", "Hexagonraster (200 m Seitenlänge)"),
    (df_grouped_300, "Reg_Raeumlich0300_Dach_Eff_LSTmean.png", "Hexagonraster (300 m Seitenlänge)"),
    (df_grouped_500, "Reg_Raeumlich0500_Dach_Eff_LSTmean.png", "Hexagonraster (500 m Seitenlänge)"),
    (df_grouped_1026, "Reg_Raeumlich1026_Dach_Eff_LSTmean.png", "Hexagonraster (1026 m Seitenlänge)"),
    (df_grouped_ward, "Reg_RaeumlichWards_Dach_Eff_LSTmean.png", "Wards"),
)
for level_index, (aggregated, figure_file, level_label) in enumerate(spatial_level_runs):
    if level_index:
        print()  # blank line between consecutive regression tables
    ols_regression_and_plot(aggregated, "LST_grouped", "ROOF_ENERGY_EFFICIENCY_grouped", "Energieeffizienz der Dächer", "Mittelwert beider Tage", "Dächer (1 bis 5)", figure_file, Raeumliche_Ebene=level_label)