In [30]:
from data_common.notebook import *
from data_common.dataset import get_dataset_df

# Composite IMD calculation for UK Constituencies (2025)

Combine LSOA level deprivation scores into constituency wide deprivation score for the new 2025 constituencies.

In [31]:
# bring in the uk-wide imd info

# split into three dimensions, decile 1 is high, 2,3 is medium, 4,5 is low.

imd = pd.read_csv(Path("data", "packages", "uk_index", "UK_IMD_E.csv")).set_index(
    "lsoa"
)
imd = imd[["UK_IMD_E_pop_quintile"]].rename(
    columns={"UK_IMD_E_pop_quintile": "quintile"}
)
imd["group"] = imd["quintile"].map(
    {
        1: "High deprivation",
        2: "Medium deprivation",
        3: "Medium deprivation",
        4: "Low deprivation",
        5: "Low deprivation",
    }
)

ruc = get_dataset_df("uk_ruc", "uk_ruc", "latest", "composite_ruc.csv",).set_index(
    "lsoa"
)[["pop"]]

imd = imd.join(ruc)
imd

Unnamed: 0_level_0,quintile,group,pop
lsoa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
95ZZ06W1,1,High deprivation,1662
95GG47S2,1,High deprivation,1758
95GG35S2,1,High deprivation,1399
95MM12S2,1,High deprivation,1211
95MM27S1,1,High deprivation,1825
...,...,...,...
E01017787,5,Low deprivation,1588
S01008861,5,Low deprivation,769
S01006581,5,Low deprivation,940
S01008066,5,Low deprivation,912


In [32]:
# get this into the form of a sheet where the percentage of each constituency in different deprivation bands is in different columns


# this gets a file of all the overlaps lsoas and constituencies
df = (
    get_dataset_df(
        repo="2025-constituencies",
        package="geographic_overlaps",
        version="latest",
        file="LSOA11_PARL25_combo_overlap.csv",
    )
    .sort_values("percentage_overlap_pop", ascending=False)
    .drop_duplicates(subset=["LSOA11", "PARL25"], keep="first")
    .set_index("LSOA11")["PARL25"]
)


# we need to reduce this so there's just one row per lsoa
# we'll go by highest percentage_overlap_pop
# sort with highest first
# then get first unique for for LSOA11 and PARL25


df = imd.join(df, how="outer")
pt = df.pivot_table("pop", index="PARL25", columns=["group"], aggfunc="sum").fillna(0)


pt = pt.common.row_percentages().reset_index()
pt = pt.rename(columns=lambda x: x.lower().replace(" ", "-"))


pt = pt[["parl25", "low-deprivation", "medium-deprivation", "high-deprivation"]]
split_percent = pt
pt

group,parl25,low-deprivation,medium-deprivation,high-deprivation
0,UKPARL.2025.AAD,0.54,0.35,0.11
1,UKPARL.2025.AAG,0.39,0.43,0.18
2,UKPARL.2025.AAS,0.24,0.43,0.33
3,UKPARL.2025.ABF,0.54,0.29,0.17
4,UKPARL.2025.ABH,0.53,0.35,0.12
...,...,...,...,...
645,UKPARL.2025.WYT,0.27,0.22,0.51
646,UKPARL.2025.YEO,0.35,0.55,0.10
647,UKPARL.2025.YNM,0.28,0.60,0.11
648,UKPARL.2025.YOC,0.61,0.32,0.07


In [33]:
# This follows methodology in https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/833947/IoD2019_Research_Report.pdf p. 69

lookup = (
    get_dataset_df(
        repo="2025-constituencies",
        package="geographic_overlaps",
        version="latest",
        file="LSOA11_PARL25_combo_overlap.csv",
    )
    .sort_values("percentage_overlap_pop", ascending=False)
    .drop_duplicates(subset=["LSOA11", "PARL25"], keep="first")
    .set_index("LSOA11")["PARL25"]
)

# merge lsoa to parl25 lookup column
df = pd.read_csv(Path("data", "packages", "uk_index", "UK_IMD_E.csv")).set_index("lsoa")
df = df.join(lookup)

# merge lsoa population in
pop = get_dataset_df("uk_ruc", "uk_ruc", "latest", "composite_ruc.csv").set_index(
    "lsoa"
)[["pop"]]
df = df.join(pop)

# create a population adjusted score
df["pop_score"] = df["UK_IMD_E_score"] * df["pop"]

# pivot up to the constituency
pt = df.pivot_table(["pop_score", "pop"], index="PARL25", aggfunc="sum")  # type: ignore

# get back our constituency code column
pt = pt.reset_index()

# calculate a new score, dividing the summed score by the summed population
pt["parl25-deprivation-score"] = pt["pop_score"] / pt["pop"]

# at this point we're calculating population quintiles for constituencies
df = pt.sort_values("parl25-deprivation-score", ascending=False)
df["cum_pop"] = df["pop"].astype("int").cumsum()
df["parl25-imd-pop-quintile"] = np.ceil(df["cum_pop"] / sum(df["pop"]) * 5).astype(int)
df["parl25-imd-pop-decile"] = np.ceil(df["cum_pop"] / sum(df["pop"]) * 10).astype(int)
df = df.drop(columns=["pop", "pop_score", "cum_pop"]).rename(
    columns={"PARL25": "parl25"}
)
df

Unnamed: 0,parl25,parl25-deprivation-score,parl25-imd-pop-quintile,parl25-imd-pop-decile
93,UKPARL.2025.BTW,72.52,1,1
229,UKPARL.2025.FOY,63.66,1,1
90,UKPARL.2025.BTN,60.52,1,1
640,UKPARL.2025.WTY,53.57,1,1
392,UKPARL.2025.NAR,50.74,1,1
...,...,...,...,...
473,UKPARL.2025.RCF,7.17,5,10
105,UKPARL.2025.CAA,6.82,5,10
257,UKPARL.2025.HAB,6.60,5,10
408,UKPARL.2025.NEH,6.05,5,10


In [34]:
# get the quintiles as labels


def num_to_ith(num: Union[float, int]) -> str:
    """1 becomes 1st, 2 becomes 2nd, etc."""
    value = str(num)
    before_last_digit = 0
    last_digit = value[-1]
    if len(value) > 1 and before_last_digit == "1":
        return value + "th"
    if last_digit == "1":
        return value + "st"
    if last_digit == "2":
        return value + "nd"
    if last_digit == "3":
        return value + "rd"
    return value + "th"


def label_quintile(v: float) -> str:
    label = f"{num_to_ith(int(v))} IMD quintile"
    return label


df["label"] = df["parl25-imd-pop-quintile"].apply(label_quintile)

# move raw columns to end to preserve versioning

df.to_csv(Path("data", "parl25_labels.csv"), index=False)
df

Unnamed: 0,parl25,parl25-deprivation-score,parl25-imd-pop-quintile,parl25-imd-pop-decile,label
93,UKPARL.2025.BTW,72.52,1,1,1st IMD quintile
229,UKPARL.2025.FOY,63.66,1,1,1st IMD quintile
90,UKPARL.2025.BTN,60.52,1,1,1st IMD quintile
640,UKPARL.2025.WTY,53.57,1,1,1st IMD quintile
392,UKPARL.2025.NAR,50.74,1,1,1st IMD quintile
...,...,...,...,...,...
473,UKPARL.2025.RCF,7.17,5,10,5th IMD quintile
105,UKPARL.2025.CAA,6.82,5,10,5th IMD quintile
257,UKPARL.2025.HAB,6.60,5,10,5th IMD quintile
408,UKPARL.2025.NEH,6.05,5,10,5th IMD quintile


In [35]:
name_lookup = get_dataset_df(
    repo="2025-constituencies",
    package="parliament_con_2025",
    version="latest",
    file="parl_constituencies_2025.csv",
).rename(columns={"short_code": "parl25", "name": "constituency-name"})[
    ["parl25", "constituency-name"]
]

In [36]:
# make the label description

labels = pd.Series(df["label"].unique()).to_frame().rename(columns={0: "label"})

labels["desc"] = [
    "Constituencies in most deprived quintile (20%)",
    "Constituencies in second most deprived quintile (20%)",
    "Constituencies in middle deprivation quintile (20%)",
    "Constituencies in second least deprived quintile (20%)",
    "Constituencies in least deprived quintile (20%)",
]
final = (
    df.merge(labels, on="label")
    .merge(split_percent, on="parl25")
    .merge(name_lookup, on="parl25")
)

end_columns = ["parl25-imd-pop-quintile", "parl25-imd-pop-decile"]
start_columns = ["parl25", "constituency-name"]
cols = [x for x in final.columns if x not in end_columns + start_columns]
final = final[start_columns + cols + end_columns]

final.to_csv(Path("data", "packages", "uk_index", "parl25_imd.csv"), index=False)
final.head()

Unnamed: 0,parl25,constituency-name,parl25-deprivation-score,label,desc,low-deprivation,medium-deprivation,high-deprivation,parl25-imd-pop-quintile,parl25-imd-pop-decile
0,UKPARL.2025.BTW,Belfast West,72.52,1st IMD quintile,Constituencies in most deprived quintile (20%),0.0,0.04,0.96,1,1
1,UKPARL.2025.FOY,Foyle,63.66,1st IMD quintile,Constituencies in most deprived quintile (20%),0.0,0.21,0.79,1,1
2,UKPARL.2025.BTN,Belfast North,60.52,1st IMD quintile,Constituencies in most deprived quintile (20%),0.0,0.24,0.76,1,1
3,UKPARL.2025.WTY,West Tyrone,53.57,1st IMD quintile,Constituencies in most deprived quintile (20%),0.0,0.06,0.94,1,1
4,UKPARL.2025.NAR,Newry and Armagh,50.74,1st IMD quintile,Constituencies in most deprived quintile (20%),0.0,0.18,0.82,1,1
