In [1]:
from data_common.notebook import *

from functools import reduce
from itertools import product

# Composite distance dataset

This notebook creates a new set of distance calculations based on all the individual distances.

Rather than taking the average distance between two councils across the different measures, the overall distance treats each measure of distance as a separate dimension, and calculates the distance between the councils in the resulting multi-dimensional space.

All measures are currently assumed to be equal and are not weighted to make any one more important than the others. As there is some overlap in results between them, in general this approach should push councils that are slightly similar on multiple approaches further up. Generally this composite measure has greater agreement with its components than any single measure does with the others.

In [2]:
# Map each distance measure to the package folder that holds its outputs.
# Note the "geographic" measure lives in the "physical_distance" package.
packages = {
    measure: Path("data", "packages", folder_name)
    for measure, folder_name in (
        ("geographic", "physical_distance"),
        ("emissions", "emissions_distance"),
        ("imd", "imd_distance"),
        ("ruc", "ruc_distance"),
    )
}

# Load every measure's distance map, keyed by the measure name.
files = {
    measure: pd.read_csv(folder / "distance_map.csv")
    for measure, folder in packages.items()
}

In [3]:
# Destination for the composite distance map.
output_path = Path("data", "packages", "composite_distance", "distance_map.csv")

# Build the composite table one stage at a time. The `space` / `joint_space`
# accessors are project helpers; the stage notes below are inferred from the
# method names and the columns in the output -- confirm against their source.
df = pd.DataFrame().space.join_distance(files)  # combine the per-measure distance maps
df = df.joint_space.composite_distance(normalize=False)  # single distance across all measures
df = df.space.match_distance()  # presumably adds the `match` column
df = df.space.local_rankings()  # presumably adds the `position` column

df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,local-authority-code_A,local-authority-code_B,distance,match,position
0,ABC,MEA,5.59,64.7,2.0
1,ABC,ANN,5.58,64.7,1.0
2,ABC,RCC,7.91,50.0,3.0
3,ABC,HIG,8.05,49.1,4.0
4,ABC,DRS,8.47,46.4,5.0


In [4]:
# create composite labels

# Build one frame per package, indexed by council code, with its "label"
# column renamed to the package key so the columns stay distinct when joined.
files = []
for pkg_name, pkg_dir in packages.items():
    pkg_labels = pd.read_csv(pkg_dir / "la_labels.csv")
    pkg_labels = pkg_labels.set_index("local-authority-code")
    files.append(pkg_labels.rename(columns={"label": pkg_name}))

# Join the frames left to right on the council-code index.
df = reduce(lambda left, right: left.join(right), files)

# Mark the emissions label so it is recognisable inside the combined label.
df = df.assign(emissions="Emissions: " + df["emissions"])


def f(x):
    """Join an iterable of label strings into one "; "-separated string."""
    separator = "; "
    return separator.join(x)


# Collapse each council's row of per-package labels into a single composite
# label, then move the council code from the index back into a column.
composite_labels = df.apply(f, axis="columns")
ndf = composite_labels.to_frame(name="label").reset_index()

ndf.to_csv(
    Path("data", "packages", "composite_distance", "la_labels.csv"), index=False
)
ndf.head()

Unnamed: 0,local-authority-code,label
0,ABC,Northern Ireland; Emissions: Industry/Commeric...
1,ABD,Scotland; Emissions: Agriculture; 5th IMD quin...
2,ABE,Scotland; Emissions: Public sector; 4th IMD qu...
3,ADU,South East; Emissions: Urban Mainstream; 4th I...
4,AGB,Scotland; Emissions: Agriculture; 4th IMD quin...


In [5]:
# Load each package's label descriptions, keyed by measure name.
files = {
    measure: pd.read_csv(folder / "label_desc.csv")
    for measure, folder in packages.items()
}

# Prefix the emissions labels with "Emissions: ".
emissions_desc = files["emissions"]
emissions_desc["label"] = "Emissions: " + emissions_desc["label"]

# Blank the geographic descriptions so they drop out of the combined text.
files["geographic"]["desc"] = None

# One list of labels and one list of descriptions per measure, in the same
# measure order so the two products line up row for row.
frames = list(files.values())
labels = [frame["label"].tolist() for frame in frames]
descriptions = [frame["desc"].tolist() for frame in frames]

# Every combination of one label per measure, joined into a composite label.
labels = pd.Series(product(*labels)).apply(lambda combo: "; ".join(combo))


def process_description(descriptions: List[str]) -> str:
    """
    Merge the per-measure descriptions for one label combination into a string.

    Entries that are not non-empty strings are skipped. That covers ``None``
    (the geographic descriptions are set to ``None`` before this is called),
    empty strings, and the ``NaN`` float pandas yields for an empty CSV cell.
    The remaining descriptions are joined with "; " and every occurrence of
    "Councils " is removed from the result.

    Args:
        descriptions: One description per measure (any iterable, e.g. a tuple
            produced by ``itertools.product``); entries may be missing.

    Returns:
        The combined description, or "" when nothing usable was supplied.
    """
    # NaN is a truthy float, so a plain truthiness filter would hand it to
    # str.join and raise TypeError; require an actual non-empty string.
    present = [x for x in descriptions if isinstance(x, str) and x]
    value = "; ".join(present)
    value = value.replace("Councils ", "")
    return value


# Combine the descriptions over the same product of measures used for the
# labels, so each combined description lines up with its composite label.
description_combos = pd.Series(product(*descriptions))
descriptions = description_combos.apply(process_description)

df = labels.to_frame(name="label").assign(desc=descriptions)
df.to_csv(
    Path("data", "packages", "composite_distance", "label_desc.csv"), index=False
)
df.head()

Unnamed: 0,label,desc
0,Northern Ireland; Emissions: Urban Mainstream;...,Below average for all emissions scores; in mos...
1,Northern Ireland; Emissions: Urban Mainstream;...,Below average for all emissions scores; in mos...
2,Northern Ireland; Emissions: Urban Mainstream;...,Below average for all emissions scores; in mos...
3,Northern Ireland; Emissions: Urban Mainstream;...,Below average for all emissions scores; in mos...
4,Northern Ireland; Emissions: Urban Mainstream;...,Below average for all emissions scores; in sec...
