In [1]:
from data_common.notebook import *
from data_common.dataset import get_dataset_df

# Composite IMD calculation for UK Constituencies

Combine LSOA-level deprivation scores into a constituency-wide deprivation score.

In [2]:
# Load the UK-wide IMD table, keep only the `UK_IMD_E_pop_quintile` column
# (renamed to `quintile`) and collapse it into three deprivation bands:
# quintile 1 is high, quintiles 2-3 are medium, quintiles 4-5 are low.

imd = (
    pd.read_csv(Path("data", "packages", "uk_index", "UK_IMD_E.csv"))
    .set_index("lsoa")[["UK_IMD_E_pop_quintile"]]
    .rename(columns={"UK_IMD_E_pop_quintile": "quintile"})
    .assign(
        group=lambda frame: frame["quintile"].map(
            {
                1: "High deprivation",
                2: "Medium deprivation",
                3: "Medium deprivation",
                4: "Low deprivation",
                5: "Low deprivation",
            }
        )
    )
)

# LSOA populations are taken from the `pop` column of the uk_ruc dataset's
# composite_ruc.csv.
ruc = get_dataset_df("uk_ruc", "uk_ruc", "latest", "composite_ruc.csv")
ruc = ruc.set_index("lsoa")[["pop"]]

# Index-aligned (left) join: every IMD row gains its population.
imd = imd.join(ruc)
imd

Unnamed: 0_level_0,quintile,group,pop
lsoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
95ZZ06W1,1,High deprivation,1662
95GG47S2,1,High deprivation,1758
95GG35S2,1,High deprivation,1399
95MM12S2,1,High deprivation,1211
95MM27S1,1,High deprivation,1825
...,...,...,...
E01017787,5,Low deprivation,1588
S01008861,5,Low deprivation,769
S01006581,5,Low deprivation,940
S01008066,5,Low deprivation,912


In [3]:
# Build a sheet with one row per constituency and one column per deprivation
# band, each holding that band's share of the constituency's population.

# Attach the LSOA -> constituency lookup to the banded IMD table.
# NOTE(review): if pcon_lsoa.csv has one row per postcode, dropping `pcds`
# leaves repeated lsoa11 rows and the join repeats each LSOA - confirm that
# weighting is intended.
df = imd.join(
    pd.read_csv(Path("data", "raw", "pcon_lsoa.csv"))
    .drop(columns=["pcds"])
    .set_index("lsoa11"),
    how="outer",
)

# Sum population per constituency and band, convert rows to shares and
# normalise the band names into column-friendly slugs.
# `common.row_percentages` is a project accessor - presumably it rescales each
# row to sum to 1 (the displayed output is consistent with that); TODO confirm.
pt = (
    df.pivot_table(values="pop", index="pcon", columns=["group"], aggfunc="sum")
    .fillna(0)
    .common.row_percentages()
    .reset_index()
    .rename(columns=lambda name: name.lower().replace(" ", "-"))
)

# Fix the column order and keep a handle for the final merge.
pt = pt.loc[:, ["pcon", "low-deprivation", "medium-deprivation", "high-deprivation"]]
split_percent = pt
pt

group,pcon,low-deprivation,medium-deprivation,high-deprivation
0,E14000530,0.58,0.37,0.05
1,E14000531,0.48,0.39,0.14
2,E14000532,0.85,0.10,0.05
3,E14000533,0.33,0.53,0.14
4,E14000534,0.74,0.26,0.00
...,...,...,...,...
645,W07000076,0.30,0.46,0.24
646,W07000077,0.17,0.72,0.11
647,W07000078,0.55,0.29,0.16
648,W07000079,0.39,0.23,0.38


In [4]:
# This follows methodology in https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/833947/IoD2019_Research_Report.pdf p. 69

# LSOA -> constituency (pcon) lookup.
# NOTE(review): if pcon_lsoa.csv has one row per postcode, dropping `pcds`
# leaves repeated lsoa11 rows and the join below repeats each LSOA - confirm
# that weighting is intended.
lookup = pd.read_csv(Path("data", "raw", "pcon_lsoa.csv"))
lookup = lookup.drop(columns=["pcds"]).set_index("lsoa11")

# LSOA populations.
pop = get_dataset_df("uk_ruc", "uk_ruc", "latest", "composite_ruc.csv")
pop = pop.set_index("lsoa")[["pop"]]

# IMD scores per LSOA, with the constituency and population joined on, plus a
# population-weighted score (score * population).
df = (
    pd.read_csv(Path("data", "packages", "uk_index", "UK_IMD_E.csv"))
    .set_index("lsoa")
    .join(lookup)
    .join(pop)
    .assign(pop_score=lambda lsoas: lsoas["UK_IMD_E_score"] * lsoas["pop"])
)

# Sum the weighted score and the population up to constituency level.
pt = df.pivot_table(values=["pop_score", "pop"], index="pcon", aggfunc="sum").reset_index()  # type: ignore

# Constituency score = summed weighted score / summed population.
pt["pcon-deprivation-score"] = pt["pop_score"] / pt["pop"]

# Rank constituencies from highest to lowest score and cut the cumulative
# population into five and ten equal-population bands (1 = highest scores).
df = pt.sort_values("pcon-deprivation-score", ascending=False)
df["cum_pop"] = df["pop"].astype("int").cumsum()
cum_share = df["cum_pop"] / sum(df["pop"])
df["pcon-imd-pop-quintile"] = np.ceil(cum_share * 5).astype(int)
df["pcon-imd-pop-decile"] = np.ceil(cum_share * 10).astype(int)

# The working columns are no longer needed.
df = df.drop(columns=["pop", "pop_score", "cum_pop"])
df

Unnamed: 0,pcon,pcon-deprivation-score,pcon-imd-pop-quintile,pcon-imd-pop-decile
536,N06000004,71.90,1,1
534,N06000002,65.39,1,1
540,N06000008,63.10,1,1
264,E14000794,55.82,1,1
550,N06000018,54.88,1,1
...,...,...,...,...
378,E14000908,7.17,5,10
532,E14001062,7.11,5,10
101,E14000631,6.98,5,10
314,E14000844,6.23,5,10


In [5]:
# get the quintiles as labels


def num_to_ith(num: Union[float, int]) -> str:
    """
    Append the English ordinal suffix to a number.

    1 becomes "1st", 2 becomes "2nd", 3 becomes "3rd", 4 becomes "4th";
    numbers whose tens digit is 1 (11, 12, 13, 111, ...) always take "th".

    The number is rendered with ``str()`` and the suffix is chosen from its
    final characters, so pass an ``int`` for a clean label (a float such as
    ``1.0`` is rendered as ``"1.0th"``).
    """
    value = str(num)
    last_digit = value[-1]
    # The tens digit decides the "teens" exception; single-character values
    # have no tens digit.
    before_last_digit = value[-2] if len(value) > 1 else ""
    if before_last_digit == "1":
        return value + "th"
    suffix = {"1": "st", "2": "nd", "3": "rd"}.get(last_digit, "th")
    return value + suffix


def label_quintile(v: float) -> str:
    """Return the display label for quintile number `v`, e.g. "1st IMD quintile"."""
    # Truncate to int first so the ordinal is built from a whole number.
    ordinal = num_to_ith(int(v))
    return ordinal + " IMD quintile"


# Add the human-readable quintile label to every constituency row.
df = df.assign(label=df["pcon-imd-pop-quintile"].map(label_quintile))

# Write the labelled table with its columns in their current order (no
# reordering happens in this step) and without the row index.
df.to_csv(Path("data") / "pcon_labels.csv", index=False)
df

Unnamed: 0,pcon,pcon-deprivation-score,pcon-imd-pop-quintile,pcon-imd-pop-decile,label
536,N06000004,71.90,1,1,1st IMD quintile
534,N06000002,65.39,1,1,1st IMD quintile
540,N06000008,63.10,1,1,1st IMD quintile
264,E14000794,55.82,1,1,1st IMD quintile
550,N06000018,54.88,1,1,1st IMD quintile
...,...,...,...,...,...
378,E14000908,7.17,5,10,5th IMD quintile
532,E14001062,7.11,5,10,5th IMD quintile
101,E14000631,6.98,5,10,5th IMD quintile
314,E14000844,6.23,5,10,5th IMD quintile


In [6]:
# Constituency names, re-keyed so the code column is called `pcon` (matching
# the other tables) and the name column is called `constituency-name`.
name_lookup = get_dataset_df(
    "uk_westminster_constituency_names_and_codes",
    "uk_westminster_constituency_names_and_codes",
    "latest",
    "constituencies_and_codes.csv",
)
name_lookup = name_lookup.drop(columns=["country", "mapit-id", "parliament-id"]).rename(
    columns={"gss-code": "pcon", "name": "constituency-name"}
)

In [7]:
# make the label description

# Key each description on the quintile number instead of assigning a list by
# position: positional assignment only works while `df["label"].unique()`
# happens to come out in 1st..5th order with exactly five values.
quintile_desc = {
    1: "Constituencies in most deprived quintile (20%)",
    2: "Constituencies in second most deprived quintile (20%)",
    3: "Constituencies in middle deprivation quintile (20%)",
    4: "Constituencies in second least deprived quintile (20%)",
    5: "Constituencies in least deprived quintile (20%)",
}

# One row per (quintile, label) pair present in the data, in quintile order,
# reduced to the two columns the merge needs.
labels = (
    df[["pcon-imd-pop-quintile", "label"]]
    .drop_duplicates()
    .sort_values("pcon-imd-pop-quintile")
    .assign(desc=lambda pairs: pairs["pcon-imd-pop-quintile"].map(quintile_desc))
    .loc[:, ["label", "desc"]]
    .reset_index(drop=True)
)

# Attach descriptions, band shares and constituency names, then expose the
# constituency code under its published column name.
final = (
    df.merge(labels, on="label")
    .merge(split_percent, on="pcon")
    .merge(name_lookup, on="pcon")
    .rename(columns={"pcon": "gss-code"})
)

# move raw columns to the end to preserve versioning; identifiers go first
end_columns = ["pcon-imd-pop-quintile", "pcon-imd-pop-decile"]
start_columns = ["gss-code", "constituency-name"]
cols = [x for x in final.columns if x not in end_columns + start_columns]
final = final[start_columns + cols + end_columns]

final.to_csv(Path("data", "packages", "uk_index", "constituency_imd.csv"), index=False)
final.head()

Unnamed: 0,gss-code,constituency-name,pcon-deprivation-score,label,desc,low-deprivation,medium-deprivation,high-deprivation,pcon-imd-pop-quintile,pcon-imd-pop-decile
0,N06000004,Belfast West,71.9,1st IMD quintile,Constituencies in most deprived quintile (20%),0.0,0.05,0.95,1,1
1,N06000002,Belfast North,65.39,1st IMD quintile,Constituencies in most deprived quintile (20%),0.0,0.18,0.82,1,1
2,N06000008,Foyle,63.1,1st IMD quintile,Constituencies in most deprived quintile (20%),0.0,0.2,0.8,1,1
3,E14000794,"Liverpool, Walton",55.82,1st IMD quintile,Constituencies in most deprived quintile (20%),0.02,0.07,0.91,1,1
4,N06000018,West Tyrone,54.88,1st IMD quintile,Constituencies in most deprived quintile (20%),0.0,0.03,0.97,1,1
