In [None]:
import pandas as pd
import altair as alt
import numpy as np
from pyproj import Transformer


## models
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score



Load Data

In [None]:
# Location of the Glasgow SIMD 2020 distance table (wide format) on GitHub.
url = (
    "https://raw.githubusercontent.com/mariagrahamx/mariagrahamx.github.io/"
    "main/project/glasgow_simd_2020_distance_wide.csv"
)

# Read the CSV straight from the URL; the trailing expression displays the frame.
simd_distances_2020 = pd.read_csv(filepath_or_buffer=url)
simd_distances_2020

Unnamed: 0,Datazone,dist_km,Income_Domain_Rank,Employment_Domain_Rank,Education_Domain_Rank,Health_Domain_Rank,Crime_Domain_Rank,Access_Domain_Rank
0,S01009758,0.571303,5861.5,4396.0,4903,3273,5523.0,3216
1,S01009759,0.289445,23.0,168.0,1291,255,791.0,6001
2,S01009760,0.159333,1083.0,971.0,715,881,2079.0,5706
3,S01009761,0.362295,2784.0,3348.0,2530,2782,5459.0,3720
4,S01009762,0.317019,830.0,1946.0,547,1499,1904.0,6870
...,...,...,...,...,...,...,...,...
741,S01010499,0.158552,41.0,75.0,64,78,80.0,6363
742,S01010500,0.093344,311.0,618.0,633,179,2280.0,6172
743,S01010501,0.177333,1024.0,990.5,1005,471,3308.0,5528
744,S01010502,0.116682,2026.0,2008.0,2322,1550,3410.0,5171


Explore Relationship

In [None]:
# Working handle for the analysis. This binds a second name to the same
# DataFrame object (no copy is made), so any column assigned through `data`
# also appears on `simd_distances_2020`.
data = simd_distances_2020

In [None]:
# SIMD domain-rank columns used as the x variable of the exploratory scatter plots.
feature_vars = [
    'Income_Domain_Rank',
    'Employment_Domain_Rank',
    'Education_Domain_Rank',
    'Health_Domain_Rank',
    'Crime_Domain_Rank',
    'Access_Domain_Rank',
]

In [None]:
# Build one scatter plot per domain-rank column, each plotted against the
# distance column, so the individual relationships can be compared side by side.
charts = []
for variable in feature_vars:
    # X channel: the rank column, on a fixed 0-7500 domain shared by every panel.
    x_axis = (
        alt.X(f'{variable}:Q')
        .axis(format='0.0f')
        .title(variable)
        .scale(domain=[0, 7500])
    )
    # Y channel: only one .scale() call, because a repeated call replaces the
    # channel's whole scale definition instead of merging with the previous one.
    y_axis = (
        alt.Y('dist_km:Q')
        .title('Distance From Derelict or Vacant Land Site')
        .scale(domain=[0, 1.6])
        # Place the y title horizontally above the axis rather than rotated.
        .axis(titleAngle=0, titleY=-2, titleAlign='left', titleX=1)
    )
    chart = (
        alt.Chart(data)
        .mark_point(color='rgba(128,0,0,.8)')
        .encode(x=x_axis, y=y_axis)
        .properties(width=250, height=250)
    )
    charts.append(chart)

# horizontal concatenation of all charts
alt.hconcat(*charts)

Prepare Training Data

In [None]:
# Predictor columns for the multiple regression.
features = [
    'Income_Domain_Rank',
    'Employment_Domain_Rank',
    'Education_Domain_Rank',
    'Health_Domain_Rank',
    'Crime_Domain_Rank',
    'Access_Domain_Rank',
]

# Target: the distance column; predictors: one column per entry in `features`.
y = data['dist_km']
X_multi = data[features]

In [None]:
# Display the selected feature columns as a visual check of the model inputs.
X_multi

Unnamed: 0,Income_Domain_Rank,Employment_Domain_Rank,Education_Domain_Rank,Health_Domain_Rank,Crime_Domain_Rank,Access_Domain_Rank
0,5861.5,4396.0,4903,3273,5523.0,3216
1,23.0,168.0,1291,255,791.0,6001
2,1083.0,971.0,715,881,2079.0,5706
3,2784.0,3348.0,2530,2782,5459.0,3720
4,830.0,1946.0,547,1499,1904.0,6870
...,...,...,...,...,...,...
741,41.0,75.0,64,78,80.0,6363
742,311.0,618.0,633,179,2280.0,6172
743,1024.0,990.5,1005,471,3308.0,5528
744,2026.0,2008.0,2322,1550,3410.0,5171


Scale Features

In [None]:
# Standardise the features: learn the scaling parameters from X_multi ...
scaler = StandardScaler().fit(X_multi)

# ... then apply them to the same data to get the matrix used for fitting.
X_scaled = scaler.transform(X_multi)

Fit The Model

In [None]:
# Linear regression of distance on the standardised features.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the trailing expression keeps the estimator displayed as the cell output.
model_multi = linear_model.LinearRegression().fit(X_scaled, y)
model_multi

Inspect Results

In [None]:
# Report the fitted parameters and R-squared, scored on the same data the
# model was fitted to. One line per entry.
fit_summary = (
    f"Coefficients: {model_multi.coef_}",
    f"Intercept: {model_multi.intercept_}",
    f"R-squared: {model_multi.score(X_scaled, y):.3f}",
)
print(*fit_summary, sep="\n")

Coefficients: [-0.09402052 -0.00112199  0.10324312  0.07477605  0.05560682  0.00878833]
Intercept: 0.41831259105822133
R-squared: 0.202


In [None]:
# Print each feature name alongside the coefficient at the same position.
for i, feature_name in enumerate(features):
    print(f"  {feature_name:25s}: {model_multi.coef_[i]:8.2f}")

  Income_Domain_Rank       :    -0.09
  Employment_Domain_Rank   :    -0.00
  Education_Domain_Rank    :     0.10
  Health_Domain_Rank       :     0.07
  Crime_Domain_Rank        :     0.06
  Access_Domain_Rank       :     0.01


Visualisation

In [None]:
# NOTE(review): LinearRegression is not used in this cell; the import is kept
# in case later cells rely on the name. Consider moving it to the import cell.
from sklearn.linear_model import LinearRegression

# Predictions on the same data the model was fitted to, stored on the frame so
# the chart (and its tooltips) can read them as a column.
pred_multi = model_multi.predict(X_scaled)
data['pred_multi'] = pred_multi

# R-squared on the fitting data. It feeds the subtitle below so the reported
# figure always matches the fitted model instead of a hand-typed number.
r2 = model_multi.score(X_scaled, y)

# Both axes and the reference line share one domain, so the dashed y = x line
# runs corner to corner and points on it are exact predictions.
distance_domain = [0, 1.6]

title_params = alt.TitleParams(
    text="Predicting Distance to Derelict or Vacant Land Using Deprivation Indicators",
    subtitle=[
        # \u00b2 is the superscript two; written as an escape so the label
        # survives file re-encoding.
        f"Multiple Variable Model, with R\u00b2 = {r2:.1%}",
        "Source: datamap-scotland.co.uk (SIMD 2020)",
    ],
    anchor="middle",
    offset=15,
    color="#1f1b5e",
    font="Montserrat-Bold, sans-serif",
    fontSize=16,
    subtitleFontSize=14,
    subtitleColor="#616373",
    subtitlePadding=6,
    subtitleFontStyle="italic",
)

# Shared axis styling for both the x and y axes.
axis_style = alt.Axis(
    labelFontSize=11,
    titleFontSize=12,
    titleColor="#1f1b5e",
)

# Scatter of predicted (x) against actual (y) distance, one point per row.
multi_scatter = alt.Chart(data).mark_point(color="darkred", opacity=0.6).encode(
    x=alt.X(
        "pred_multi:Q",
        title="Predicted Distance to Derelict or Vacant Site",
        scale=alt.Scale(domain=distance_domain),
        axis=axis_style,
    ),
    y=alt.Y(
        "dist_km:Q",
        title="Actual Distance to Derelict or Vacant Site",
        scale=alt.Scale(domain=distance_domain),
        axis=axis_style,
    ),
    tooltip=[
        "Datazone:N",
        alt.Tooltip("dist_km:Q", format=".3f", title="Actual Distance (km)"),
        alt.Tooltip("pred_multi:Q", format=".3f", title="Predicted Distance (km)"),
    ],
)

# Dashed y = x reference line: where points would fall if predictions were exact.
perfect_line_data = pd.DataFrame({"x": distance_domain, "y": distance_domain})
perfect_line = alt.Chart(perfect_line_data).mark_line(
    strokeDash=[5, 5], color="gray"
).encode(
    x="x:Q",
    y="y:Q",
)

layered_chart = (
    (multi_scatter + perfect_line)
    .properties(
        title=title_params,
        width=550,
        height=350,
        background="#EFEFF6",
    )
    # configure() replaces the chart's whole config object, so it has to come
    # before configure_view(); in the opposite order the view setting is lost.
    .configure(font="Montserrat, sans-serif")
    .configure_view(stroke="transparent")
)

layered_chart