# Analysis of outputs

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple
import math
import numpy as np
from textwrap import wrap
import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")
from analysis_data_processing import (
    create_population_df,
    redact_and_round_column,
    redact_and_round_df,
    code_specific_analysis,
    further_redaction,
    further_redaction_all,
    redact_to_five_and_round,
    produce_plot,
)

In [None]:
# Code -> term lookups for the three homecare code families.
# Oximetry terms come from the codelist csv; blood pressure and proactive
# care terms are listed by hand. Keys are SNOMED codes.
oximetry_codes_df = pd.read_csv("../../codelists/opensafely-pulse-oximetry.csv")
oximetry_codes_dict = oximetry_codes_df.set_index("code").loc[:, "term"].to_dict()
bp_codes_dict = dict(
    [
        (413606001, "Average home systolic blood pressure"),
        (314446007, "Average day interval systolic blood pressure"),
        (413605002, "Average home diastolic blood pressure"),
        (314461008, "Average day interval diastolic blood pressure"),
    ]
)
proactive_codes_dict = {934231000000106: "Provision of proactive care"}


def _as_header_lookup(codes_to_terms: dict) -> dict:
    """Re-key a code -> term mapping by the matching input csv header."""
    return {
        f"healthcare_at_home_{snomed_code}": term_text
        for snomed_code, term_text in codes_to_terms.items()
    }


# Header -> term lookups: same terms as above, keyed by the column header
# built from each code (healthcare_at_home_<code>)
oximetry_headers_dict = _as_header_lookup(oximetry_codes_dict)
bp_headers_dict = _as_header_lookup(bp_codes_dict)
proactive_headers_dict = _as_header_lookup(proactive_codes_dict)

# Names of the regions used in the analysis
region_list = [
    "North East", "North West", "Yorkshire and the Humber",
    "East Midlands", "West Midlands", "East of England",
    "London", "South East", "South West",
]


In [None]:
def create_population_df(
    homecare_type: str, dir: str = "../../output/"
) -> Tuple[pd.DataFrame, dict]:
    """Combine the weekly input csv files for one homecare type.

    Every file in ``dir`` named ``input_<homecare_type>...csv`` is read, tagged
    with the index date taken from its file name, and stacked into a single
    data frame.

    Args:
        homecare_type: Type of homecare to load. For "oximetry", "bp" and
            "proactive" the code columns are renamed to their terms using the
            module-level ``*_headers_dict`` lookups; any other value is
            loaded without renaming.
        dir: Directory holding the input csv files. A trailing path
            separator is optional.

    Returns:
        A tuple ``(population_df, cohort_size)`` where ``population_df`` is
        the concatenation of all files (with an added ``index_date`` column)
        and ``cohort_size`` maps each index date to the number of rows in
        that date's file.

    Raises:
        ValueError: If no matching input file is found in ``dir``.
    """
    # Sort the file names: os.listdir returns them in arbitrary order, and the
    # row order of the combined frame should be reproducible between runs
    filenames = sorted(
        f
        for f in os.listdir(dir)
        if f.startswith(f"input_{homecare_type}") and f.endswith(".csv")
    )
    if not filenames:
        raise ValueError(
            f"No input files found for homecare type '{homecare_type}' in '{dir}'"
        )

    # Data frames to combine, one per input file
    dfs = []
    # Number of patients in each weekly file, keyed by index date
    cohort_size = {}

    for filename in filenames:
        # Join with os.path so `dir` works with or without a trailing separator
        output = pd.read_csv(os.path.join(dir, filename))
        # Take the index date from the file name itself (third "_"-separated
        # field), never from the full path: underscores in the directory
        # would otherwise shift the field position
        index_date = pd.to_datetime(
            filename.split("_")[2].split(".csv")[0],
            dayfirst=True,
        )
        # Add the index date to the file
        output["index_date"] = index_date
        dfs.append(output)
        # Note the number of patients in the file for that index date
        cohort_size[index_date] = len(output)

    # Combine all the dataframes together
    population_df = pd.concat(dfs)
    # Replace code-based headers with their terms for the known homecare types
    if homecare_type == "oximetry":
        population_df.rename(columns=oximetry_headers_dict, inplace=True)
    elif homecare_type == "bp":
        population_df.rename(columns=bp_headers_dict, inplace=True)
    elif homecare_type == "proactive":
        population_df.rename(columns=proactive_headers_dict, inplace=True)
    return population_df, cohort_size



## Data Processing

In [None]:
homecare_type = "oximetry"

# Create population data frame which includes all weeks and dictionary of cohort size for each individual week
population_df, cohort_size = create_population_df(homecare_type, "../../output/")

# Define the ranges for each of the age categories
# (e.g. [-1, 39] will find ages such that -1<age<=39 - age is a whole number in the data frame so this is anyone aged 0 to 39 inclusive)
age_bins = [-1, 39, 49, 64, 200]
n = len(age_bins)

# Create list of labels for the different age groups.
# For all except the final age category the label is 'Age between ... and ...'.
# pd.cut includes the right edge of each bin, so the upper bound shown in the
# label is the bin edge itself (not edge + 1); this keeps neighbouring labels
# from overlapping and matches the inclusive wording of the shielding labels
# (e.g. 'Aged 50 to 64').
age_group_labels = [
    f'Age between {lower_edge + 1} and {upper_edge}'
    for lower_edge, upper_edge in zip(age_bins[:-2], age_bins[1:-1])
]
# For final age category the label is 'Age ... or over'
age_group_labels.append(f'Age {age_bins[-2]+1} or over')

# Create list of age category for each patient and insert into the population data frame
age_category = pd.cut(population_df.age, bins = age_bins, labels = age_group_labels)
population_df.insert(0, 'age_group', age_category)



# Define the ranges for each of the age categories
# (e.g. [-1, 39] will find ages such that -1<age<=39 - age is a whole number in the data frame so this is anyone aged 0 to 39 inclusive)
# NOTE(review): these bins are currently identical to age_bins, so age_group_2
# duplicates age_group - confirm whether different cut points were intended.
age_bins_2 = [-1, 39, 49, 64, 200]
n = len(age_bins_2)

# Create list of labels for the different age groups.
# For all except the final age category the label is 'Age between ... and ...'.
# pd.cut includes the right edge of each bin, so the upper bound shown in the
# label is the bin edge itself (not edge + 1); otherwise neighbouring labels
# overlap (e.g. '... and 50' followed by 'between 50 ...').
age_group_labels_2 = [
    f'Age between {lower_edge + 1} and {upper_edge}'
    for lower_edge, upper_edge in zip(age_bins_2[:-2], age_bins_2[1:-1])
]
# For final age category the label is 'Age ... or over'
age_group_labels_2.append(f'Age {age_bins_2[-2]+1} or over')

# Create list of age category for each patient and insert into the population data frame
age_category_2 = pd.cut(population_df.age, bins = age_bins_2, labels = age_group_labels_2)
population_df.insert(0, 'age_group_2', age_category_2)

# Label every patient with a combined shielding / age-band category.
# Shielding patients form one category regardless of age; the remaining
# patients are split into three age bands. Rows matching none of the
# conditions are labelled 'Not Specified'.
shielding_flag = population_df['shielding']
patient_age = population_df['age']
is_shielding = shielding_flag == 1
is_not_shielding = shielding_flag == 0

conditionlist = [
    is_shielding,
    is_not_shielding & (patient_age >= 65),
    is_not_shielding & (patient_age >= 50) & (patient_age < 65),
    is_not_shielding & (patient_age < 50),
]
choicelist = [
    '1: Shielding',
    '2: Aged 65 or over and not shielding',
    '3: Aged 50 to 64 and not shielding',
    '4: Aged 49 or under and not shielding',
]
population_df['age_and_shielding'] = np.select(
    conditionlist, choicelist, default='Not Specified'
)

# Names of the regions used in the analysis
region_list = [
    "North East",
    "North West",
    "Yorkshire and the Humber",
    "East Midlands",
    "West Midlands",
    "East of England",
    "London",
    "South East",
    "South West",
]

# Oximetry code -> term lookup, read from the pulse oximetry codelist
# (keys are SNOMED codes, values are the terms they refer to)
oximetry_codes_df = pd.read_csv('../../codelists/opensafely-pulse-oximetry.csv')
oximetry_codes_dict = oximetry_codes_df.set_index("code").loc[:, "term"].to_dict()
# Oximetry header -> term lookup: the same terms keyed by the column header
# built from each code (pulse_oximetry_<code>)
oximetry_headers_dict = dict(
    (f"pulse_oximetry_{snomed_code}", term_text)
    for snomed_code, term_text in oximetry_codes_dict.items()
)



## Create dataframe of sums of pulse oximetry codes for each index week

In [None]:
# # Create population data frame which includes all weeks and dictionary of cohort size for each individual week
# population_df, cohort_size = create_population_df("../../output/")

# # Create lists of current and required headers
# # Convert pulse oximetry codelist csv into data frame
# oximetry_codes_df = pd.read_csv('../../codelists/opensafely-pulse-oximetry.csv')
# # Extract list of SNOMED codes
# oximetry_codes_list = oximetry_codes_df['code'].tolist()
# # List of pulse oximetry headers in population dataframe
# oximetry_codes_headers = [f'pulse_oximetry_{x}' for x in oximetry_codes_list]
# # List of headers using descriptions as required
# oximetry_headers = oximetry_codes_df['term'].tolist()

# # Create dictionary for renaming oximetry headers
# oximetry_dictionary = {}
# for n in range(0,len(oximetry_codes_df)):
#     oximetry_dictionary[oximetry_codes_headers[n]] = oximetry_headers[n]

# #Create data frame of sum totals for each index date for each oximetry code
# oximetry_sum = population_df.groupby(['index_date'], as_index=False)[oximetry_codes_headers].sum()

# # Rename oximetry headers in oximetry sums data frame
# oximetry_sum.rename(columns=oximetry_dictionary,inplace=True)

# # Save the dataframe in outputs folder
# #oximetry_sum.to_csv('../../output/oximetry_sums.csv') 


## Test different possibilities for redacting and rounding functions

In [None]:
def _labels_to_redact(values: pd.Series) -> list:
    """Return the index labels of the counts to redact within one index date.

    ``values`` holds the numeric counts for a single index date; entries that
    are NaN (e.g. already redacted) or zero are never selected.
    """
    non_zero = values[values.notna() & (values != 0)]
    if non_zero.empty:
        return []
    # The whole group is small: every non-zero count is disclosive
    if non_zero.sum() <= 5:
        return list(non_zero.index)
    small_count = int((non_zero <= 5).sum())
    if small_count == 0:
        return []
    # Stable sort so ties are redacted in their original row order
    ascending = non_zero.sort_values(kind="mergesort")
    running_total = ascending.cumsum()
    # Number of smallest counts needed for the redacted total to exceed 5
    needed_for_total = int((running_total <= 5).sum()) + 1
    # Redact every small count, and enough further counts to exceed 5 in total
    redact_count = max(small_count, needed_for_total)
    return list(ascending.index[:redact_count])


def redact_to_five_and_round(counts_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Redact small counts per index date, then round the rest up to a multiple of 5.

    For each value of ``index_date``:

    * if the counts in ``column_name`` sum to 5 or less, every non-zero count
      is redacted;
    * otherwise, if any non-zero count is 5 or less, all such counts are
      redacted and the next lowest non-zero counts are redacted as well until
      the redacted counts add up to more than 5.

    Zero counts are never redacted. Once every index date has been processed,
    all remaining numeric counts are rounded up to the nearest 5.

    Args:
        counts_df: Data frame with an ``index_date`` column and the count
            column. It is modified in place. Row labels are assumed to be
            unique (e.g. a default RangeIndex).
        column_name: Name of the column holding the counts.

    Returns:
        The same ``counts_df`` object, with redacted entries replaced by the
        string "[REDACTED]" and the other counts rounded.
    """
    # Hold the column as objects so strings and integers can sit side by side
    counts_df[column_name] = counts_df[column_name].astype(object)

    for index_date in counts_df["index_date"].unique():
        in_date = counts_df["index_date"] == index_date
        # Anything non-numeric (e.g. an earlier redaction) becomes NaN
        date_values = pd.to_numeric(
            counts_df.loc[in_date, column_name], errors="coerce"
        )
        labels = _labels_to_redact(date_values)
        if labels:
            counts_df.loc[labels, column_name] = "[REDACTED]"

    # Round only after all redaction decisions have been made, so they are
    # always based on the true (unrounded) counts
    numeric_counts = pd.to_numeric(counts_df[column_name], errors="coerce")
    counts_df[column_name] = [
        original if pd.isna(number) else int(5 * math.ceil(number / 5))
        for original, number in zip(counts_df[column_name], numeric_counts)
    ]
    return counts_df

In [None]:
code = "1325191000000108"
column_name = "region"
codes_dict = oximetry_codes_dict
variable_title = "Test Title"
term = codes_dict[int(code)]

# Restrict to the patients who have the code recorded
has_code = population_df[term] == 1
codes_df = population_df.loc[has_code]
# Number of those patients per index date and per value of column_name
counts_df = (
    codes_df.groupby(["index_date", column_name])
    .size()
    .reset_index()
    .rename(columns={0: "counts"})
)

# Denominator: total number of patients with the code on each index date
patients_per_date = codes_df.groupby("index_date").size().to_dict()
counts_df["denominators"] = counts_df["index_date"].map(patients_per_date)

# Test cases: overwrite selected counts (column position 2) so that each
# block of rows exercises a different redaction scenario.
# Each entry is (row positions, replacement counts), applied in order.
test_overrides = [
    # One zero value <= 5 (rows 0-8)
    ([6, 7], [10, 0]),
    # Nothing <= 5 (rows 9-17): left untouched
    # One non-zero value <= 5 (rows 18-26)
    (21, 5),
    # Multiple values <= 5 totalling < 5, no zeroes (rows 27-35)
    ([31, 32], [5, 10]),
    # Multiple values <= 5 totalling <= 5, some zeroes (rows 36-44)
    ([36, 39, 41, 43], [1, 0, 4, 0]),
    # Multiple zero values, all others > 5 (rows 45-53)
    ([47, 49, 50], 0),
    # Sum of values <= 5, no zeroes (rows 54-62)
    ([58, 54, 62, 60], [1, 2, 1, 1]),
    # Sum of values <= 5, all zeroes (rows 63-71)
    ([64, 67, 68, 71], 0),
    # Sum of values <= 5, some zeroes (rows 72-80)
    ([72, 74, 75, 76, 77], [1, 1, 0, 1, 0]),
]
for row_positions, new_counts in test_overrides:
    counts_df.iloc[row_positions, 2] = new_counts

# Row positions noted by the author alongside these test cases:
#32,58, 78 (or 80)
# 21,25,31,35,36,41,42,54,58,59,60,62,72,74,76,78

# Apply redacting and rounding to the counts
counts_df = redact_to_five_and_round(counts_df, "counts")
#counts_df.iloc[36:45,]
