In [None]:
import csv
import numpy

In [None]:
def _make_unique(ls):
    """Ensure the strings within a list are unique by
    appending an integer
    """
    return [f"{s}:{ls[:i].count(s)}" for (i, s) in enumerate(ls)]


def _sanitize(dollar):
    dollar = dollar.replace("$", "").replace(",", "")
    return float(dollar)


def _transpose(ls):
    """Efficiently transpose a list of lists
    """
    return list(map(list, zip(*ls)))


def outer_join(ds1, ds2):
    """Perform an outer join on two datasets along their first column.

    Both datasets are column-oriented dicts mapping a header to the list of
    that column's values, and both must use the same header for their first
    column.  Entries of that column are made unique with ``_make_unique``
    (``"name:k"``) before matching, so repeated names are paired up by
    order of appearance; the suffix is still present on the returned rows.

    Layout of every returned row (the first row is the header row):

    * the first three columns of ``ds2`` (of ``ds1`` for rows found only
      there),
    * the ``ds1`` columns at positions 3 and 4 (zero-based), whose headers
      are prefixed with ``"2019 "``,
    * the ``ds2`` columns at positions 3 to 5, whose headers are prefixed
      with ``"2020 "``,
    * any remaining ``ds2`` columns, headers unchanged.

    Rows found only in ``ds1`` come first (in ``ds1`` order), followed by
    one row per ``ds2`` entry (in ``ds2`` order).  Cells without a source
    value hold ``""``; the amount of padding is derived from the column
    counts so that every row has the same width as the header.

    Neither input dataset is modified.

    Raises ``AssertionError`` if the first column headers differ.
    """
    cols1 = list(ds1.keys())
    cols2 = list(ds2.keys())
    key = cols2[0]  # match along the first key
    assert key == cols1[0]  # consistency check
    # ensure name entries are unique; work on new lists so the callers'
    # datasets are left untouched
    names1 = _make_unique(ds1[key])
    names2 = _make_unique(ds2[key])
    # parse rows from each dataset (only the first five columns of ds1 are used)
    rows1 = _transpose([names1] + [ds1[col] for col in cols1[1:5]])
    rows2 = _transpose([names2] + [ds2[col] for col in cols2[1:]])
    # constant-time lookups instead of repeated list scans
    position1 = {name: j for (j, name) in enumerate(names1)}
    present2 = set(names2)
    # blanks standing in for the columns contributed by the other dataset
    blanks_for_ds1 = [""] * len(cols1[3:5])
    blanks_for_ds2 = [""] * len(cols2[3:])
    # populate entries in the first set, but not the second
    joined = [row[:5] + blanks_for_ds2 for row in rows1
              if row[0] not in present2]
    # join datasets along rows
    for row in rows2:
        j = position1.get(row[0])
        if j is not None:  # entry is in both sets
            joined.append(row[:3] + rows1[j][3:5] + row[3:])
        else:  # entry is only in set 2
            joined.append(row[:3] + blanks_for_ds1 + row[3:])
    # prepare new column headers
    headers = (
        cols2[:3]
        + [f"2019 {s}" for s in cols1[3:5]]
        + [f"2020 {s}" for s in cols2[3:6]]
        + cols2[6:]
    )
    # return the joined dataset
    return [headers] + joined

In [None]:
data = {}

for file in (
    "city-of-milwaukee-salaries-2019.tsv",
    "city-of-milwaukee-salaries-2020.tsv",
):
    # the year is the last dash-separated token of the file name, minus
    # the extension
    year = file.split("-")[-1].split(".")[0]
    # read in data from the tab-separated file; the csv module asks for
    # newline="" so that it can handle line endings inside fields itself
    with open(file, "r", newline="") as fileobj:
        contents = csv.reader(fileobj, delimiter="\t")
        # turn the rows into columns keyed by their header:
        # {header: [value, value, ...]}
        data[year] = {x[0]: x[1::] for x in _transpose(contents)}

In [None]:
# consistency check: report how many entries were loaded for each year
for year in ("2019", "2020"):
    print(len(data[year]["Name"]))

In [None]:
# go after differences in salary
# outer_join may rewrite the key column of the dicts it is given, so hand it
# shallow copies; this keeps `data` pristine and the cell safe to re-run
joined = outer_join(dict(data["2019"]), dict(data["2020"]))

# clean up introduced artifacts
for row in joined[1::]:
    # remove only the trailing ":<n>" counter that was appended to make the
    # names unique; splitting on every ":" would also cut off names that
    # contain a colon themselves
    row[0] = row[0].rsplit(":", 1)[0]

In [None]:
# prepare the output file
filename = "city-of-milwaukee-salaries.csv"

# write the data to file; newline="" lets the csv writer control the line
# endings itself (otherwise each row is followed by a blank line on Windows)
with open(filename, "w", newline="") as fileobj:
    writer = csv.writer(fileobj)
    writer.writerows(joined)