<div style="background-color: #ADD8E6; padding: 10px;">
    <h2><center>Matched Control Cohort (Using Demographic Covariates) </center></h2>
</div>

<center><b>Regression Model Using Propensity Scores to Create Matched Control Cohorts</b></center>

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# Load the dataset
#dataset_for_matched_control_cohort = pd.read_csv('Dataset for Matched Control Cohort.csv')
# NOTE(review): the load above is commented out, so this cell relies on
# `dataset_for_matched_control_cohort` already existing in the kernel - confirm
# it is defined by an earlier cell before running on a fresh kernel.

# Fixed seed so the random top-up / down-sampling below is reproducible
RANDOM_SEED = 42
# Required number of unique participants in the matched control cohort
TARGET_N_CONTROLS = 2441
# ICD10 codes that define the PH cohort
ph_codes = ['I27.0', 'I27.2', 'I27.9']

# Drop the specified columns. errors='ignore' keeps the cell re-runnable once
# the columns are already gone (the in-place version raised KeyError on re-run).
columns_to_drop = ['Date of Death', 'Death Cause Diseases', 'Death Cause Disease ICD10 Codes']
dataset_for_matched_control_cohort = dataset_for_matched_control_cohort.drop(columns=columns_to_drop, errors='ignore')

# Drop rows with NaN values in the specific columns; .copy() gives an
# independent frame so the column assignment below never warns
dataset_for_matched_control_cohort = dataset_for_matched_control_cohort.dropna(
    subset=['Combined ICD10 Codes', 'Participant ID', 'IMD_quintile']
).copy()

# Encode categorical variables
df_encoded = pd.get_dummies(dataset_for_matched_control_cohort[['Sex', 'Ethnicity', 'Age Range', 'Smoking Status', 'BMI Range', 'IMD_quintile', 'Alive / Dead']], drop_first=True)

# Logistic Regression to calculate propensity scores
X = df_encoded
# Label is 1 when any PH code occurs as a literal substring of the code string
y = dataset_for_matched_control_cohort['Combined ICD10 Codes'].apply(lambda x: 1 if any(code in str(x) for code in ph_codes) else 0)

model = LogisticRegression()
model.fit(X, y)
dataset_for_matched_control_cohort['propensity_score'] = model.predict_proba(X)[:, 1]

# Create treatment and control groups.
# The cohort reuses the model label so both use the same literal code match;
# the earlier regex pattern treated '.' as "any character" and could disagree with y.
ph_cohort_group = dataset_for_matched_control_cohort[(y == 1).to_numpy()]

# Ensure control group does not include participants in the PH cohort
control_group = dataset_for_matched_control_cohort[~dataset_for_matched_control_cohort['Participant ID'].isin(ph_cohort_group['Participant ID'])]

# Perform nearest neighbor matching (one nearest control per PH row, with replacement)
caliper = 0.05

nn = NearestNeighbors(n_neighbors=1)
nn.fit(control_group[['propensity_score']])
distances, indices = nn.kneighbors(ph_cohort_group[['propensity_score']])

# Apply caliper and select the matched controls
matched_controls_indices = [index for distance, index in zip(distances.flatten(), indices.flatten()) if distance <= caliper]

# Check if enough controls are matched within the caliper
if len(matched_controls_indices) < len(ph_cohort_group):
    print(f"Initial matching resulted in {len(matched_controls_indices)} controls. Expanding caliper.")
    
    # Expand caliper incrementally until every PH row has a match
    while len(matched_controls_indices) < len(ph_cohort_group):
        caliper += 0.01
        matched_controls_indices = [index for distance, index in zip(distances.flatten(), indices.flatten()) if distance <= caliper]
        print(f"Caliper: {caliper}, Matches: {len(matched_controls_indices)}")

# Ensure we only take the first len(ph_cohort_group) matches if we have more
matched_controls_indices = matched_controls_indices[:len(ph_cohort_group)]

# The same control can be the nearest neighbour of several PH rows, so de-duplicate
matched_controls = control_group.iloc[matched_controls_indices]
matched_controls = matched_controls.drop_duplicates(subset='Participant ID')

# Ensure the number of matched controls is the same as the PH cohort.
# The top-up draws once from participants not selected yet (one row each), so it
# cannot stall on rows whose Participant ID is already in the cohort.
# NOTE(review): top-up controls are drawn at random, not by propensity score.
deficit = len(ph_cohort_group) - matched_controls['Participant ID'].nunique()
if deficit > 0:
    unused_controls = control_group[
        ~control_group['Participant ID'].isin(matched_controls['Participant ID'])
    ].drop_duplicates(subset='Participant ID')
    # Raises ValueError if fewer than `deficit` unused participants remain
    additional_controls = unused_controls.sample(n=deficit, replace=False, random_state=RANDOM_SEED)
    matched_controls = pd.concat([matched_controls, additional_controls])

# Ensure the number of matched controls is exactly TARGET_N_CONTROLS unique participants
matched_controls = matched_controls.drop_duplicates(subset='Participant ID').sample(n=TARGET_N_CONTROLS, replace=False, random_state=RANDOM_SEED)

# Combine PH patients and matched controls
Combine_PH_and_matched_cohort = pd.concat([ph_cohort_group, matched_controls])
Combine_PH_and_matched_cohort = Combine_PH_and_matched_cohort.reset_index(drop=True)

# Display the number of unique Participant IDs in each group
unique_ph_ids = ph_cohort_group['Participant ID'].nunique()
unique_matched_control_ids = matched_controls['Participant ID'].nunique()

print(f"Number of unique Participant IDs in PH Cohort: {unique_ph_ids}")
print(f"Number of unique Participant IDs in Matched Control Cohort: {unique_matched_control_ids}")

# Display the matched cohort
print("Matched Control Cohort:\n")
print(f"Number of participants in Matched Control Cohort: {len(matched_controls)}")
display(matched_controls.head(5))
print()

# Display the combined cohort
print("Combine PH patients and matched controls:\n")
print(f"Number of participants in Combined Cohort: {len(Combine_PH_and_matched_cohort)}")
display(Combine_PH_and_matched_cohort.head(3))


In [None]:
# Check for overlaps
#common_participants = pd.merge(ph_cohort_group, matched_controls, on='Participant ID', how='inner')

#if common_participants.empty:
#    print("No overlapping participants between PH cohort and matched controls.")
#else:
#    print(f"Found {len(common_participants)} overlapping participants between PH cohort and matched controls.")


In [None]:
#### Specify the file path where you want to save the CSV file
#file_path = 'MATCHED Control Cohort Dataset.csv'

#### Use the to_csv method to save the DataFrame as a CSV file
#matched_controls.to_csv(file_path, index=False)  # Set index=False to exclude the index column

<div style="background-color: #E6E6FA; padding: 10px;">
    <h2><center>Non-PH Matched Control Cohort </center></h2>
</div>

In [None]:
import pandas as pd
import numpy as np
# Read the saved matched control cohort back from disk
matched_controls = pd.read_csv('MATCHED Control Cohort Dataset.csv')

# Per-column diagnostics, each held as a single labelled row:
# distinct values, row count, NaN cells, empty-string cells and
# 'Prefer not to answer' cells.
unique_counts_matched_control = matched_controls.nunique().to_frame('Unique Count').T
row_counts_matched_control = pd.Series(len(matched_controls), index=matched_controls.columns).to_frame('Row Count').T
nan_counts_matched_control = matched_controls.isna().sum().to_frame('NaN Count').T
empty_counts_matched_control = matched_controls.eq('').sum().to_frame('Empty Count').T
prefer_not_to_say_counts_matched_control = matched_controls.eq('Prefer not to answer').sum().to_frame('Prefer not to answer').T

# Stack the diagnostic rows into one summary table (one column per dataset column)
result_matched_controls = pd.concat([
    unique_counts_matched_control,
    row_counts_matched_control,
    nan_counts_matched_control,
    empty_counts_matched_control,
    prefer_not_to_say_counts_matched_control,
])

# Show the summary table followed by a preview of the data
print("matched_controls:")
display(result_matched_controls)
print()
print()
display(matched_controls.head(3))
print()
print()

<div style="background-color: #E6E6FA; padding: 10px;">
    <h2><center>Sex and Ethnicity Counts for Non-PH Matched Control Cohort </center></h2>
</div>

In [None]:
# Denominator for every percentage in this cell.
# NOTE(review): hard-coded cohort size - assumes the cohort holds exactly 2441 unique participants; confirm.
total_participants = 2441

# One row per participant
unique_participants = matched_controls.drop_duplicates(subset='Participant ID')

# Missing (NaN) values per column
missing_counts = unique_participants.isna().sum()

# Sex: raw counts, share of the cohort, and a combined "count (pct%)" label
sex_counts = unique_participants['Sex'].value_counts()
sex_percentages = sex_counts / total_participants * 100
sex_counts_with_percentages = sex_counts.astype(str) + " (" + sex_percentages.round(2).astype(str) + "%)"

# Ethnicity: distinct categories, then the same counts / share / label trio
unique_ethnicities = unique_participants['Ethnicity'].unique()
ethnicity_counts = unique_participants['Ethnicity'].value_counts()
ethnicity_percentages = ethnicity_counts / total_participants * 100
ethnicity_counts_with_percentages = ethnicity_counts.astype(str) + " (" + ethnicity_percentages.round(2).astype(str) + "%)"

# Report
print("Sex Counts with Percentages:")
print(sex_counts_with_percentages)
print()
print("\nEthnicity Counts with Percentages:")
print(ethnicity_counts_with_percentages)
print()
print("\nMissing Values Counts:")
print(missing_counts)

<div style="background-color: #E6E6FA; padding: 10px;">
    <h2><center>Age Groups Counts for Non-PH Matched Control Cohort </center></h2>
</div>

In [None]:
# Denominator for the percentages below.
# NOTE(review): hard-coded cohort size - assumes exactly 2441 unique participants; confirm.
total_participants = 2441

# One row per participant
unique_participants = matched_controls.drop_duplicates(subset='Participant ID')

# Participants per age range, ordered by the age-range label
age_range_counts = unique_participants['Age Range'].value_counts().sort_index()

# Share of the cohort in each age range
age_range_percentages = age_range_counts / total_participants * 100

# Print one "range : count (pct %)" line per age range; both series share the same order
for (age_range, count), percentage in zip(age_range_counts.items(), age_range_percentages):
    print(f"{age_range} : {count} ({percentage:.2f} %)")

<div style="background-color: #E6E6FA; padding: 10px;">
    <h2><center>Smoking Status Counts for Non-PH Matched Control Cohort </center></h2>
</div>

In [None]:
# Denominator for the percentages below.
# NOTE(review): hard-coded cohort size - assumes exactly 2441 unique participants; confirm.
total_participants = 2441

# One row per participant
unique_participants = matched_controls.drop_duplicates(subset='Participant ID')

# Distinct categories found in "Smoking Status"
unique_smoking_status = unique_participants['Smoking Status'].unique()
display(unique_smoking_status)
print()
print()

# Smoking status: raw counts, share of the cohort, and a combined "count (pct%)" label
smoking_status_counts = unique_participants['Smoking Status'].value_counts()
smoking_status_percentages = smoking_status_counts / total_participants * 100
smoking_status_counts_with_percentages = smoking_status_counts.astype(str) + " (" + smoking_status_percentages.round(2).astype(str) + "%)"

# Report
print("Counts of unique values in the 'Smoking Status' column:")
display(smoking_status_counts_with_percentages)
print()

<div style="background-color: #E6E6FA; padding: 10px;">
    <h2><center>BMI Counts for Non-PH Matched Control Cohort </center></h2>
</div>

In [None]:
# Denominator for the percentages below.
# NOTE(review): hard-coded cohort size - assumes exactly 2441 unique participants; confirm.
total_participants = 2441

# One row per participant
unique_participants = matched_controls.drop_duplicates(subset='Participant ID')

# Distinct categories found in "BMI Range"
unique_BMI_status = unique_participants['BMI Range'].unique()
display(unique_BMI_status)
print()
print()

# BMI range: raw counts, share of the cohort, and a combined "count (pct%)" label
BMI_status_counts = unique_participants['BMI Range'].value_counts()
BMI_status_percentages = BMI_status_counts / total_participants * 100
BMI_status_counts_with_percentages = BMI_status_counts.astype(str) + " (" + BMI_status_percentages.round(2).astype(str) + "%)"

# Report
print("Counts of unique values in the 'BMI Range' column:")
display(BMI_status_counts_with_percentages)
print()

<div style="background-color: #E6E6FA; padding: 10px;">
    <h2><center>IMD Counts for Non-PH Matched Control Cohort </center></h2>
</div>

In [None]:
# Denominator for the percentages below.
# NOTE(review): hard-coded cohort size - assumes exactly 2441 unique participants; confirm.
total_participants = 2441

# One row per participant
unique_participants = matched_controls.drop_duplicates(subset='Participant ID')

# Distinct categories found in "IMD_quintile"
unique_IMD_status = unique_participants['IMD_quintile'].unique()
display(unique_IMD_status)
print()
print()

# IMD quintile: raw counts, share of the cohort, and a combined "count (pct%)" label
IMD_status_counts = unique_participants['IMD_quintile'].value_counts()
IMD_status_percentages = IMD_status_counts / total_participants * 100
IMD_status_counts_with_percentages = IMD_status_counts.astype(str) + " (" + IMD_status_percentages.round(2).astype(str) + "%)"

# Report
print("Counts of unique values in the 'IMD_quintile' column:")
display(IMD_status_counts_with_percentages)
print()

<div style="background-color: #E6E6FA; padding: 10px;">
    <h2><center>Age Specific Mortality Counts for Non-PH Matched Control Cohort </center></h2>
</div>

In [None]:
# Denominator for every percentage in this cell.
# NOTE(review): hard-coded cohort size - assumes exactly 2441 unique participants; confirm.
total_participants = 2441

# One row per participant
unique_participants = matched_controls.drop_duplicates(subset='Participant ID')

# Overall vital status: raw counts, share of the cohort, and a "count (pct%)" label
total_alive_dead_counts = unique_participants['Alive / Dead'].value_counts()
total_alive_dead_percentages = total_alive_dead_counts / total_participants * 100
total_alive_dead_counts_with_percentages = total_alive_dead_counts.astype(str) + " (" + total_alive_dead_percentages.round(2).astype(str) + "%)"

print("Counts of unique values in the 'Alive / Dead' column:")
display(total_alive_dead_counts_with_percentages)
print()
print()

# Age-range bin edges and their labels.
# NOTE(review): neither name is read again in this cell; kept in case a later cell uses them.
bins = [30, 40, 50, 60, 70, 80, 90, 100]
labels = ["30 - 40", "41 - 50", "51 - 60", "61 - 70", "71 - 80", "81 - 90", "91 - 100"]

# Participants per age range, split by vital status and ordered by age-range label
alive_counts = unique_participants.loc[unique_participants['Alive / Dead'] == 'Alive', 'Age Range'].value_counts().sort_index()
dead_counts = unique_participants.loc[unique_participants['Alive / Dead'] == 'Dead', 'Age Range'].value_counts().sort_index()

# Shares are relative to the whole cohort, not to the alive/dead subgroup
alive_percentages = alive_counts / total_participants * 100
dead_percentages = dead_counts / total_participants * 100

# Combined "count (pct%)" labels
alive_counts_with_percentages = alive_counts.astype(str) + " (" + alive_percentages.round(2).astype(str) + "%)"
dead_counts_with_percentages = dead_counts.astype(str) + " (" + dead_percentages.round(2).astype(str) + "%)"

# Report: alive first, then dead
print("Counts and percentages for 'Alive' in each age range:")
display(alive_counts_with_percentages)
print()
print()

print("Counts and percentages for 'Dead' in each age range:")
display(dead_counts_with_percentages)
print()

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Heart Failure Matched Control Cohort   </center></h2>
</div>


This file generates the Heart Failure matched control cohort (ICD10 codes **I50.0** and **I50.1**) for comparison with the PH cohort in the forest plot, sunburst plot, table, tracer plot and Cox hazard ratio plots.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors



# Read the full source dataset from disk
matched_controls = pd.read_csv('Dataset for Matched Control Cohort.csv')

# Per-column diagnostics, each held as a single labelled row:
# distinct values, row count, NaN cells, empty-string cells and
# 'Prefer not to answer' cells.
unique_counts_matched_control = matched_controls.nunique().to_frame('Unique Count').T
row_counts_matched_control = pd.Series(len(matched_controls), index=matched_controls.columns).to_frame('Row Count').T
nan_counts_matched_control = matched_controls.isna().sum().to_frame('NaN Count').T
empty_counts_matched_control = matched_controls.eq('').sum().to_frame('Empty Count').T
prefer_not_to_say_counts_matched_control = matched_controls.eq('Prefer not to answer').sum().to_frame('Prefer not to answer').T

# Stack the diagnostic rows into one summary table (one column per dataset column)
result_matched_controls = pd.concat([
    unique_counts_matched_control,
    row_counts_matched_control,
    nan_counts_matched_control,
    empty_counts_matched_control,
    prefer_not_to_say_counts_matched_control,
])

# Show the summary table
print("matched_controls_heart_failure:")
display(result_matched_controls)
print()
print()

# Second name for the same frame (no copy is made), used by the matching cells that follow
dataset_for_matched_control_cohort = matched_controls
display(dataset_for_matched_control_cohort.head(3))
print()

In [None]:
# Drop the specified columns. errors='ignore' keeps the cell re-runnable once
# the columns are already gone (the in-place version raised KeyError on re-run
# and also modified the frame still referenced as `matched_controls`).
columns_to_drop = ['Death Cause Diseases', 'Death Cause Disease ICD10 Codes']
dataset_for_matched_control_cohort = dataset_for_matched_control_cohort.drop(columns=columns_to_drop, errors='ignore')

# Drop rows with NaN values in the specific columns; .copy() gives an
# independent frame so the column assignments below never warn
dataset_for_matched_control_cohort = dataset_for_matched_control_cohort.dropna(
    subset=['Combined ICD10 Codes', 'Participant ID', 'IMD_quintile']
).copy()

# Encode categorical variables
df_encoded = pd.get_dummies(dataset_for_matched_control_cohort[['Sex', 'Ethnicity', 'Age Range', 'Smoking Status', 'BMI Range', 'IMD_quintile', 'Alive / Dead']], drop_first=True)

# Identify PH and Heart Failure cohorts
ph_codes = ['I27.0', 'I27.2', 'I27.9']
heart_failure_codes = ['I50.0', 'I50.1']

# Flag is 1 when any of the cohort's codes occurs as a literal substring of the code string
dataset_for_matched_control_cohort['PH'] = dataset_for_matched_control_cohort['Combined ICD10 Codes'].apply(lambda x: 1 if any(code in str(x) for code in ph_codes) else 0)
dataset_for_matched_control_cohort['Heart_Failure'] = dataset_for_matched_control_cohort['Combined ICD10 Codes'].apply(lambda x: 1 if any(code in str(x) for code in heart_failure_codes) else 0)
dataset_for_matched_control_cohort.head(2)

In [None]:
# Fixed seed so the random top-up below is reproducible
RANDOM_SEED = 42
# Required number of unique participants in the matched control cohort
TARGET_N_CONTROLS = 2441

# Logistic Regression for propensity score calculation with Heart Failure as the outcome
# NOTE(review): the score models Heart_Failure membership rather than PH membership -
# confirm this is the intended propensity target.
model = LogisticRegression(max_iter=1000)  # Increase max_iter to ensure convergence
model.fit(df_encoded, dataset_for_matched_control_cohort['Heart_Failure'])
dataset_for_matched_control_cohort['propensity_score'] = model.predict_proba(df_encoded)[:, 1]

# PH cohort as treatment group
ph_cohort_group = dataset_for_matched_control_cohort[dataset_for_matched_control_cohort['PH'] == 1]

# Heart Failure cohort as control group
# Ensure control group does not include participants in the PH cohort
heart_failure_control_group = dataset_for_matched_control_cohort[(dataset_for_matched_control_cohort['Heart_Failure'] == 1) & (~dataset_for_matched_control_cohort['Participant ID'].isin(ph_cohort_group['Participant ID']))]

# Perform nearest neighbor matching (one nearest control per PH row, with replacement).
# The neighbour search does not depend on the caliper, so it is computed once.
caliper = 0.05
nn = NearestNeighbors(n_neighbors=1)
nn.fit(heart_failure_control_group[['propensity_score']])
distances, indices = nn.kneighbors(ph_cohort_group[['propensity_score']])

# Apply caliper and select the matched controls
matched_controls_indices = [index for distance, index in zip(distances.flatten(), indices.flatten()) if distance <= caliper]

# 1-NN matching yields at most one match per PH row, so the loop target is capped
# at the PH cohort size; an uncapped target of 2441 never terminates when the
# PH cohort has fewer rows than that.
required_matches = min(TARGET_N_CONTROLS, len(ph_cohort_group))

# Expand caliper incrementally until we have enough matches
if len(matched_controls_indices) < required_matches:
    print(f"Initial matching resulted in {len(matched_controls_indices)} controls. Expanding caliper.")
    
    while len(matched_controls_indices) < required_matches:
        caliper += 0.01
        matched_controls_indices = [index for distance, index in zip(distances.flatten(), indices.flatten()) if distance <= caliper]
        print(f"Caliper: {caliper}, Matches: {len(matched_controls_indices)}")

# Ensure we only take the first TARGET_N_CONTROLS matches if we have more
matched_controls_indices = matched_controls_indices[:TARGET_N_CONTROLS]

# The same control can be the nearest neighbour of several PH rows, so de-duplicate
matched_controls = heart_failure_control_group.iloc[matched_controls_indices]
matched_controls = matched_controls.drop_duplicates(subset='Participant ID')

# If the number of unique matched controls is below the target, top up once from
# Heart Failure participants not selected yet (one row each, without replacement),
# so the step cannot stall on participants that are already in the cohort.
# NOTE(review): top-up controls are drawn at random, not by propensity score.
deficit = TARGET_N_CONTROLS - matched_controls['Participant ID'].nunique()
if deficit > 0:
    unused_controls = heart_failure_control_group[
        ~heart_failure_control_group['Participant ID'].isin(matched_controls['Participant ID'])
    ].drop_duplicates(subset='Participant ID')
    # Raises ValueError if fewer than `deficit` unused participants remain
    additional_controls = unused_controls.sample(n=deficit, replace=False, random_state=RANDOM_SEED)
    matched_controls = pd.concat([matched_controls, additional_controls])

# Combine PH patients and matched controls
Combine_PH_and_matched_control_cohort = pd.concat([ph_cohort_group, matched_controls])
Combine_PH_and_matched_control_cohort = Combine_PH_and_matched_control_cohort.reset_index(drop=True)

# Display the number of unique Participant IDs in each group
unique_ph_ids = ph_cohort_group['Participant ID'].nunique()
unique_matched_control_ids = matched_controls['Participant ID'].nunique()

print(f"Number of unique Participant IDs in PH Cohort: {unique_ph_ids}")
print(f"Number of unique Participant IDs in Heart Failure Matched Control Cohort: {unique_matched_control_ids}")

In [None]:
# Check for overlaps: any Participant ID present in both the PH cohort and the matched controls
common_participants = pd.merge(ph_cohort_group, matched_controls, on='Participant ID', how='inner')

# The inner merge yields one row per (PH row, control row) pair, so count distinct
# IDs rather than rows to report the number of overlapping participants
overlapping_participant_count = common_participants['Participant ID'].nunique()

if overlapping_participant_count == 0:
    print("No overlapping participants between PH cohort and matched controls.")
else:
    print(f"Found {overlapping_participant_count} overlapping participants between PH cohort and matched controls.")

In [None]:
#### Specify the file path where you want to save the CSV file
#file_path = []
#file_path = 'Heart Failure matched Control Cohort Dataset.csv'

#### Use the to_csv method to save the DataFrame as a CSV file
#matched_controls.to_csv(file_path, index=False)  # Set index=False to exclude the index column

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Sex and Ethnicity Counts for Heart Failure Matched Control Cohort </center></h2>
</div>

In [None]:
import pandas as pd
import numpy as np
# Read the saved Heart Failure matched control cohort back from disk
matched_controls = pd.read_csv('Heart Failure matched Control Cohort Dataset.csv')

# Per-column diagnostics, each held as a single labelled row:
# distinct values, row count, NaN cells, empty-string cells and
# 'Prefer not to answer' cells.
unique_counts_matched_control = matched_controls.nunique().to_frame('Unique Count').T
row_counts_matched_control = pd.Series(len(matched_controls), index=matched_controls.columns).to_frame('Row Count').T
nan_counts_matched_control = matched_controls.isna().sum().to_frame('NaN Count').T
empty_counts_matched_control = matched_controls.eq('').sum().to_frame('Empty Count').T
prefer_not_to_say_counts_matched_control = matched_controls.eq('Prefer not to answer').sum().to_frame('Prefer not to answer').T

# Stack the diagnostic rows into one summary table (one column per dataset column)
result_matched_controls = pd.concat([
    unique_counts_matched_control,
    row_counts_matched_control,
    nan_counts_matched_control,
    empty_counts_matched_control,
    prefer_not_to_say_counts_matched_control,
])

# Second name for the same frame (no copy is made), used by the Heart Failure count cells
matched_controls_heart_failure = matched_controls

# Show the summary table followed by a preview of the data
print("matched_controls:")
display(result_matched_controls)
print()
print()
display(matched_controls_heart_failure.head(2))
print()

In [None]:
# Denominator for every percentage in this cell.
# NOTE(review): hard-coded cohort size - assumes the cohort holds exactly 2441 unique participants; confirm.
total_participants = 2441

# One row per participant
unique_participants = matched_controls_heart_failure.drop_duplicates(subset='Participant ID')

# Missing (NaN) values per column
missing_counts = unique_participants.isna().sum()

# Sex: raw counts, share of the cohort, and a combined "count (pct%)" label
sex_counts = unique_participants['Sex'].value_counts()
sex_percentages = sex_counts / total_participants * 100
sex_counts_with_percentages = sex_counts.astype(str) + " (" + sex_percentages.round(2).astype(str) + "%)"

# Ethnicity: distinct categories, then the same counts / share / label trio
unique_ethnicities = unique_participants['Ethnicity'].unique()
ethnicity_counts = unique_participants['Ethnicity'].value_counts()
ethnicity_percentages = ethnicity_counts / total_participants * 100
ethnicity_counts_with_percentages = ethnicity_counts.astype(str) + " (" + ethnicity_percentages.round(2).astype(str) + "%)"

# Report
print("Sex Counts with Percentages:")
print(sex_counts_with_percentages)
print()
print("\nEthnicity Counts with Percentages:")
print(ethnicity_counts_with_percentages)
print()
print("\nMissing Values Counts:")
print(missing_counts)

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Age Groups Counts for Heart Failure Matched Control Cohort </center></h2>
</div>

In [None]:
# Denominator for the percentages below.
# NOTE(review): hard-coded cohort size - assumes exactly 2441 unique participants; confirm.
total_participants = 2441

# One row per participant
unique_participants = matched_controls_heart_failure.drop_duplicates(subset='Participant ID')

# Participants per age range, ordered by the age-range label
age_range_counts = unique_participants['Age Range'].value_counts().sort_index()

# Share of the cohort in each age range
age_range_percentages = age_range_counts / total_participants * 100

# Print one "range : count (pct %)" line per age range; both series share the same order
for (age_range, count), percentage in zip(age_range_counts.items(), age_range_percentages):
    print(f"{age_range} : {count} ({percentage:.2f} %)")

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Smoking Status Counts for Heart Failure Matched Control Cohort </center></h2>
</div>

In [None]:
# Denominator for the percentages below.
# NOTE(review): hard-coded cohort size - assumes exactly 2441 unique participants; confirm.
total_participants = 2441

# One row per participant
unique_participants = matched_controls_heart_failure.drop_duplicates(subset='Participant ID')

# Distinct categories found in "Smoking Status"
unique_smoking_status = unique_participants['Smoking Status'].unique()
display(unique_smoking_status)
print()
print()

# Smoking status: raw counts, share of the cohort, and a combined "count (pct%)" label
smoking_status_counts = unique_participants['Smoking Status'].value_counts()
smoking_status_percentages = smoking_status_counts / total_participants * 100
smoking_status_counts_with_percentages = smoking_status_counts.astype(str) + " (" + smoking_status_percentages.round(2).astype(str) + "%)"

# Report
print("Counts of unique values in the 'Smoking Status' column:")
display(smoking_status_counts_with_percentages)
print()

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>BMI Counts for Heart Failure Matched Control Cohort </center></h2>
</div>

In [None]:
# Fixed denominator used for every percentage reported in this cell.
# NOTE(review): hard-coded cohort size - confirm it equals the number of unique participants.
total_participants = 2441

# Keep a single row per participant so nobody is counted twice.
unique_participants = matched_controls_heart_failure.drop_duplicates(subset=['Participant ID'])

# Distinct 'BMI Range' values and how many participants fall into each one.
bmi_range_column = unique_participants['BMI Range']
unique_BMI_status = bmi_range_column.unique()
BMI_status_counts = bmi_range_column.value_counts()

# Express every BMI-range tally as a percentage of the fixed cohort size.
BMI_status_percentages = BMI_status_counts.div(total_participants).mul(100)

# Render each category as "<count> (<percentage>%)", percentage rounded to 2 decimals.
bmi_count_text = BMI_status_counts.astype(str)
bmi_percent_text = BMI_status_percentages.round(2).astype(str)
BMI_status_counts_with_percentages = bmi_count_text + " (" + bmi_percent_text + "%)"

# Output 1: the distinct values, followed by two blank lines.
display(unique_BMI_status)
print()
print()

# Output 2: the formatted tally.
print("Counts of unique values in the 'BMI Range' column:")
display(BMI_status_counts_with_percentages)
print()

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>IMD Counts for Heart Failure Matched Control Cohort </center></h2>
</div>

In [None]:
# Fixed denominator used for every percentage reported in this cell.
# NOTE(review): hard-coded cohort size - confirm it equals the number of unique participants.
total_participants = 2441

# Keep a single row per participant so nobody is counted twice.
unique_participants = matched_controls_heart_failure.drop_duplicates(subset=['Participant ID'])

# Distinct 'IMD_quintile' values and how many participants fall into each one.
imd_quintile_column = unique_participants['IMD_quintile']
unique_IMD_status = imd_quintile_column.unique()
IMD_status_counts = imd_quintile_column.value_counts()

# Express every IMD-quintile tally as a percentage of the fixed cohort size.
IMD_status_percentages = IMD_status_counts.div(total_participants).mul(100)

# Render each category as "<count> (<percentage>%)", percentage rounded to 2 decimals.
imd_count_text = IMD_status_counts.astype(str)
imd_percent_text = IMD_status_percentages.round(2).astype(str)
IMD_status_counts_with_percentages = imd_count_text + " (" + imd_percent_text + "%)"

# Output 1: the distinct values, followed by two blank lines.
display(unique_IMD_status)
print()
print()

# Output 2: the formatted tally.
print("Counts of unique values in the 'IMD_quintile' column:")
display(IMD_status_counts_with_percentages)
print()

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Age Specific Mortality Counts for Heart Failure Matched Control Cohort </center></h2>
</div>

In [None]:
# Fixed denominator used for every percentage reported in this cell.
# NOTE(review): hard-coded cohort size - confirm it equals the number of unique participants.
total_participants = 2441

# One row per participant of the heart failure matched control cohort.
# Both the overall and the per-age-range tallies below are derived from this
# same frame, so the two tables always describe the same cohort.
unique_participants = matched_controls_heart_failure.drop_duplicates(subset=['Participant ID'])

# --- Overall vital status -------------------------------------------------
# Number of participants per 'Alive / Dead' value and their share of the cohort.
total_alive_dead_counts = unique_participants['Alive / Dead'].value_counts()
total_alive_dead_percentages = (total_alive_dead_counts / total_participants) * 100

# Render each status as "<count> (<percentage>%)", percentage rounded to 2 decimals.
total_alive_dead_counts_with_percentages = (
    total_alive_dead_counts.astype(str) + " (" + total_alive_dead_percentages.round(2).astype(str) + "%)"
)

print("Counts of unique values in the 'Alive / Dead' column:")
display(total_alive_dead_counts_with_percentages)
print()
print()

# --- Vital status per age range -------------------------------------------
# Age-range bin edges and their labels. They are not used by the tallies in
# this cell (the 'Age Range' column is read as-is); kept as notebook-level names.
bins = [30, 40, 50, 60, 70, 80, 90, 100]
labels = ["30 - 40", "41 - 50", "51 - 60", "61 - 70", "71 - 80", "81 - 90", "91 - 100"]

# Split the cohort by vital status and tally each part per age range,
# ordered by the age-range label.
vital_status = unique_participants['Alive / Dead']
alive_counts = unique_participants.loc[vital_status == 'Alive', 'Age Range'].value_counts().sort_index()
dead_counts = unique_participants.loc[vital_status == 'Dead', 'Age Range'].value_counts().sort_index()

# Percentages are relative to the whole cohort, not to the alive/dead subgroup.
alive_percentages = (alive_counts / total_participants) * 100
dead_percentages = (dead_counts / total_participants) * 100

# Render each age range as "<count> (<percentage>%)", percentage rounded to 2 decimals.
alive_counts_with_percentages = alive_counts.astype(str) + " (" + alive_percentages.round(2).astype(str) + "%)"
dead_counts_with_percentages = dead_counts.astype(str) + " (" + dead_percentages.round(2).astype(str) + "%)"

print("Counts and percentages for 'Alive' in each age range:")
display(alive_counts_with_percentages)
print()
print()

print("Counts and percentages for 'Dead' in each age range:")
display(dead_counts_with_percentages)
print()

**<center> This code creates the Process Mining for the Heart Failure Matched Control Cohort Versus the PH Cohort**

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Process Mining of Heart Failure Matched Control Cohort </center></h2>
</div>

Heart Failure Matched Control Using ICD10 Codes **I50.0** and **I50.1**

In [None]:
import pandas as pd
import numpy as np
# Read the heart failure matched control cohort from disk.
matched_controls = pd.read_csv('Heart Failure matched Control Cohort Dataset.csv')

# Per-column data-quality summary, one row per metric and one column per
# dataset column:
#   Unique Count         - number of distinct non-null values
#   Row Count            - total number of rows
#   NaN Count            - number of missing (NaN) cells
#   Empty Count          - number of cells equal to the empty string
#   Prefer not to answer - number of cells equal to 'Prefer not to answer'
result_matched_controls = pd.DataFrame({
    'Unique Count': matched_controls.nunique(),
    'Row Count': pd.Series(len(matched_controls), index=matched_controls.columns),
    'NaN Count': matched_controls.isna().sum(),
    'Empty Count': matched_controls.eq('').sum(),
    'Prefer not to answer': matched_controls.eq('Prefer not to answer').sum(),
}).T

# Second name bound to the same DataFrame object (no copy is made); the
# cohort-specific cells refer to the data through this name.
matched_controls_heart_failure = matched_controls

# Show the summary table, then a two-row preview of the data.
print("matched_controls:")
display(result_matched_controls)
print()
print()
display(matched_controls_heart_failure.head(2))
print()

In [None]:
# Import necessary libraries
import pandas as pd


# Distinct participant identifiers present in the heart failure matched
# control cohort (returned as an array, duplicates removed).
unique_participant_ids = pd.unique(matched_controls_heart_failure['Participant ID'])

# Show the identifiers.
print("Unique Participant IDs:")
display(unique_participant_ids)

In [None]:
import pandas as pd
# Read the main dataset (ICD10 codes, disease names and death records) from disk.
Dataset_with_ICD10_and_Diseases_Names_and_Death_Records = pd.read_csv('Dataset with ICD10 and Diseases Names and Death Records.csv')

# Per-column data-quality summary, one row per metric and one column per
# dataset column:
#   Unique Count - number of distinct non-null values
#   Row Count    - total number of rows
#   NaN Count    - number of missing (NaN) cells
#   Empty Count  - number of cells equal to the empty string
result_Final_Dataset2 = pd.DataFrame({
    'Unique Count': Dataset_with_ICD10_and_Diseases_Names_and_Death_Records.nunique(),
    'Row Count': pd.Series(
        len(Dataset_with_ICD10_and_Diseases_Names_and_Death_Records),
        index=Dataset_with_ICD10_and_Diseases_Names_and_Death_Records.columns,
    ),
    'NaN Count': Dataset_with_ICD10_and_Diseases_Names_and_Death_Records.isna().sum(),
    'Empty Count': Dataset_with_ICD10_and_Diseases_Names_and_Death_Records.eq('').sum(),
}).T

# Show the summary table; the two-row preview is the cell's final expression.
print("Dataset_with_ICD10_and_Diseases_Names_and_Death_Records:")
print()
display(result_Final_Dataset2)
print()
print()
Dataset_with_ICD10_and_Diseases_Names_and_Death_Records.head(2)

### <center> Extract Heart Failure Patients Data from the Main DataFrame

In [None]:
# Keep only the rows of the main dataset whose 'Participant ID' belongs to the
# heart failure matched control cohort.
in_matched_cohort = Dataset_with_ICD10_and_Diseases_Names_and_Death_Records['Participant ID'].isin(unique_participant_ids)
heart_failure_Matched_cohort_with_Comorbidities = Dataset_with_ICD10_and_Diseases_Names_and_Death_Records[in_matched_cohort]

# Per-column data-quality summary, one row per metric and one column per
# dataset column:
#   Unique Count - number of distinct non-null values
#   Row Count    - total number of rows
#   NaN Count    - number of missing (NaN) cells
#   Empty Count  - number of cells equal to the empty string
result_heart_failure_Matched_cohort_with_Comorbidities = pd.DataFrame({
    'Unique Count': heart_failure_Matched_cohort_with_Comorbidities.nunique(),
    'Row Count': pd.Series(
        len(heart_failure_Matched_cohort_with_Comorbidities),
        index=heart_failure_Matched_cohort_with_Comorbidities.columns,
    ),
    'NaN Count': heart_failure_Matched_cohort_with_Comorbidities.isna().sum(),
    'Empty Count': heart_failure_Matched_cohort_with_Comorbidities.eq('').sum(),
}).T

# Show the summary table, two blank lines, then a three-row preview.
display(result_heart_failure_Matched_cohort_with_Comorbidities)
print()
print()
display(heart_failure_Matched_cohort_with_Comorbidities.head(3))

### <center> Check that the Heart Failure Matched Control Cohort contains no PH rows

In [None]:
# ICD10 codes that identify PH rows; none of them is expected in this cohort.
target_icd10_values = ['I27.0', 'I27.2', 'I27.9']

# Flag the rows whose 'Combined ICD10 Codes' value is exactly one of the
# target codes (isin performs an exact match, not a substring search).
is_target_row = heart_failure_Matched_cohort_with_Comorbidities['Combined ICD10 Codes'].isin(target_icd10_values)
check = heart_failure_Matched_cohort_with_Comorbidities.loc[is_target_row]

# Display the flagged rows; an empty frame means no exact match was found.
check

### <center> Check that this is the right Heart Failure Control Cohort with Comorbidities

In [None]:
# ICD10 codes that mark a heart failure row.
heart_failure_icd10_codes = ['I50.0', 'I50.1']

# Participants with at least one row whose 'Combined ICD10 Codes' value is
# exactly one of the heart failure codes.
has_heart_failure_code = heart_failure_Matched_cohort_with_Comorbidities['Combined ICD10 Codes'].isin(heart_failure_icd10_codes)
participants_with_heart_failure = heart_failure_Matched_cohort_with_Comorbidities.loc[
    has_heart_failure_code, 'Participant ID'
].unique()

# Every participant present in the cohort, as a set for the comparisons below.
cohort_participant_ids = set(heart_failure_Matched_cohort_with_Comorbidities['Participant ID'].unique())

# True when no cohort participant is missing from the heart failure group.
all_have_heart_failure = cohort_participant_ids.issubset(set(participants_with_heart_failure))

# Report the outcome of the check.
verdict_message = (
    "All participants have at least one of the specified heart_failure ICD10 codes."
    if all_have_heart_failure
    else "Not all participants have one of the specified heart_failure ICD10 codes."
)
print(verdict_message)

# Participants who do not have any of the heart failure codes (empty when the
# check above passes); displayed as the cell's final expression.
participants_without_heart_failure = cohort_participant_ids - set(participants_with_heart_failure)
participants_without_heart_failure

### <center> Save "Heart Failure Control with Comorbidities Cohort Dataset"

In [None]:
# Specify the file path where you want to save the CSV file
#file_path = []
#file_path = 'Heart Failure Matched cohort with Comorbidities.csv'

# Use the to_csv method to save the DataFrame as a CSV file
#heart_failure_Matched_cohort_with_Comorbidities.to_csv(file_path, index=False)  # Set index=False to exclude the index column

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Read the File "Heart Failure Matched cohort with Comorbidities.csv" </center></h2>
</div>

In [None]:
import pandas as pd
# Reload the heart failure matched cohort (with comorbidities) from the CSV file.
heart_failure_Matched_cohort_with_Comorbidities = pd.read_csv('Heart Failure Matched cohort with Comorbidities.csv')

# Per-column data-quality summary, one row per metric and one column per
# dataset column:
#   Unique Count - number of distinct non-null values
#   Row Count    - total number of rows
#   NaN Count    - number of missing (NaN) cells
#   Empty Count  - number of cells equal to the empty string
result_heart_failure_Matched_cohort_with_Comorbidities = pd.DataFrame({
    'Unique Count': heart_failure_Matched_cohort_with_Comorbidities.nunique(),
    'Row Count': pd.Series(
        len(heart_failure_Matched_cohort_with_Comorbidities),
        index=heart_failure_Matched_cohort_with_Comorbidities.columns,
    ),
    'NaN Count': heart_failure_Matched_cohort_with_Comorbidities.isna().sum(),
    'Empty Count': heart_failure_Matched_cohort_with_Comorbidities.eq('').sum(),
}).T

# Show the summary table, two blank lines, then a three-row preview.
display(result_heart_failure_Matched_cohort_with_Comorbidities)
print()
print()
display(heart_failure_Matched_cohort_with_Comorbidities.head(3))

### <center>Drop the Columns

In [None]:
# Remove the two death-cause columns from the cohort frame.
# The frame is reassigned to its own name, so on a second run of this cell the
# columns are already gone; errors='ignore' makes the drop a no-op in that case
# instead of raising KeyError, which keeps the cell safe to re-run.
heart_failure_Matched_cohort_with_Comorbidities = heart_failure_Matched_cohort_with_Comorbidities.drop(
    columns=['Death Cause Diseases', 'Death Cause Disease ICD10 Codes'],
    errors='ignore',
)

# Preview the first three rows of the reduced frame.
heart_failure_Matched_cohort_with_Comorbidities.head(3)

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Create "Disease Chapters" and "Diseases Subchapters" Variables in the Dataset </center></h2>
</div>

In [None]:
def categorize_icd10(icd10_code):
    if 'A00.0' <= icd10_code <= 'A09.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A00-A09 Intestinal infectious diseases'
    elif 'A15.0' <= icd10_code <= 'A19.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A15-A19 Tuberculosis'
    elif 'A20.0' <= icd10_code <= 'A28.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A20-A28 Certain zoonotic bacterial diseases'
    elif 'A30.0' <= icd10_code <= 'A49.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A30-A49 Other bacterial diseases'
    elif 'A50.0' <= icd10_code <= 'A64.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A50-A64 Infections with a predominantly sexual mode of transmission'
    elif 'A65.0' <= icd10_code <= 'A69.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A65-A69 Other spirochaetal diseases'
    elif 'A70.0' <= icd10_code <= 'A74.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A70-A74 Other diseases caused by chlamydiae'
    elif 'A70' <= icd10_code <= 'A74':
        return 'Chapter I Certain infectious and parasitic diseases', 'A70-A74 Other diseases caused by chlamydiae'
    elif 'A75.0' <= icd10_code <= 'A79.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A75-A79 Rickettsioses'
    elif 'A80.0' <= icd10_code <= 'A89.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A80-A89 Viral infections of the central nervous system'
    elif 'A90' <= icd10_code <= 'A90':
        return 'Chapter I Certain infectious and parasitic diseases', 'A80-A89 Viral infections of the central nervous system' 
    elif 'A92.0' <= icd10_code <= 'A99.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'A92-A99 Arthropod-borne viral fevers and viral haemorrhagic fevers'
    elif 'B00.0' <= icd10_code <= 'B09.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B00-B09 Viral infections characterized by skin and mucous membrane lesions'
    elif 'B15.0' <= icd10_code <= 'B19.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B15-B19 Viral hepatitis'
    elif 'B20.0' <= icd10_code <= 'B24.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B20-B24 Human immunodeficiency virus [HIV] disease'
    elif 'B25.0' <= icd10_code <= 'B34.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B25-B34 Other viral diseases'
    elif 'B35.0' <= icd10_code <= 'B49.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B35-B49 Mycoses'
    elif 'B50.0' <= icd10_code <= 'B64.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B50-B64 Protozoal diseases'
    elif 'B65.0' <= icd10_code <= 'B83.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B65-B83 Helminthiases'
    elif 'B85.0' <= icd10_code <= 'B89.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B85-B89 Pediculosis, acariasis and other infestations'
    elif 'B90.0' <= icd10_code <= 'B94.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B90-B94 Sequelae of infectious and parasitic diseases'
    elif 'B95.0' <= icd10_code <= 'B98.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B95-B98 Bacterial, viral and other infectious agents'
    elif 'B99.0' <= icd10_code <= 'B99.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B99-B99 Other infectious diseases'
    elif 'B99.0' <= icd10_code <= 'B99.9':
        return 'Chapter I Certain infectious and parasitic diseases', 'B99-B99 Other infectious diseases'
    elif 'B99' <= icd10_code <= 'B99':
        return 'Chapter I Certain infectious and parasitic diseases', 'B99-B99 Other infectious diseases'
    
    
    
    elif 'C00.0' <= icd10_code <= 'C14.9':
        return 'Chapter II Neoplasms', 'C00-C14 Malignant neoplasms of lip, oral cavity and pharynx'
    elif 'C15.0' <= icd10_code <= 'C26.9':
        return 'Chapter II Neoplasms', 'C15-C26 Malignant neoplasms of digestive organs'
    elif 'C30.0' <= icd10_code <= 'C39.9':
        return 'Chapter II Neoplasms', 'C30-C39 Malignant neoplasms of respiratory and intrathoracic organs'
    elif 'C40.0' <= icd10_code <= 'C41.9':
        return 'Chapter II Neoplasms', 'C40-C41 Malignant neoplasms of bone and articular cartilage'
    elif 'C43.0' <= icd10_code <= 'C44.9':
        return 'Chapter II Neoplasms', 'C43-C44 Melanoma and other malignant neoplasms of skin'
    elif 'C45.0' <= icd10_code <= 'C49.9':
        return 'Chapter II Neoplasms', 'C45-C49 Malignant neoplasms of mesothelial and soft tissue'
    elif 'C50.0' <= icd10_code <= 'C50.9':
        return 'Chapter II Neoplasms', 'C50-C50 Malignant neoplasm of breast'
    elif 'C51.0' <= icd10_code <= 'C58.9':
        return 'Chapter II Neoplasms', 'C51-C58 Malignant neoplasms of female genital organs'
    elif 'C60.0' <= icd10_code <= 'C63.9':
        return 'Chapter II Neoplasms', 'C60-C63 Malignant neoplasms of male genital organs'
    elif 'C64.0' <= icd10_code <= 'C68.9':
        return 'Chapter II Neoplasms', 'C64-C68 Malignant neoplasms of urinary tract'
    elif 'C64' <= icd10_code <= 'C68':
        return 'Chapter II Neoplasms', 'C64-C68 Malignant neoplasms of urinary tract'
    elif 'C69.0' <= icd10_code <= 'C72.9':
        return 'Chapter II Neoplasms', 'C69-C72 Malignant neoplasms of eye, brain and other parts of central nervous system'
    elif 'C73.0' <= icd10_code <= 'C75.9':
        return 'Chapter II Neoplasms', 'C73-C75 Malignant neoplasms of thyroid and other endocrine glands'
    elif 'C73' <= icd10_code <= 'C75':
        return 'Chapter II Neoplasms', 'C73-C75 Malignant neoplasms of thyroid and other endocrine glands'
    elif 'C76.0' <= icd10_code <= 'C80.9':
        return 'Chapter II Neoplasms', 'C76-C80 Malignant neoplasms of ill-defined, secondary and unspecified sites'
    elif 'C81.0' <= icd10_code <= 'C96.9':
        return 'Chapter II Neoplasms', 'C81-C96 Malignant neoplasms, stated or presumed to be primary, of lymphoid, haematopoietic and related tissue'
    elif 'C97.0' <= icd10_code <= 'C97.9':
        return 'Chapter II Neoplasms', 'C97-C97 Malignant neoplasms of independent (primary) multiple sites'
    elif 'C97' <= icd10_code <= 'C97':
        return 'Chapter II Neoplasms', 'C97-C97 Malignant neoplasms of independent (primary) multiple sites'
    elif 'D00.0' <= icd10_code <= 'D09.9':
        return 'Chapter II Neoplasms', 'D00-D09 In situ neoplasms'
    elif 'D10.0' <= icd10_code <= 'D36.9':
        return 'Chapter II Neoplasms', 'D10-D36 Benign neoplasms'
    elif 'D37.0' <= icd10_code <= 'D48.9':
        return 'Chapter II Neoplasms', 'D37-D48 Neoplasms of uncertain or unknown behaviour'
    
    
    
    elif 'D50.0' <= icd10_code <= 'D53.9':
        return 'Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism', 'D50-D53 Nutritional anaemias'
    elif 'D55.0' <= icd10_code <= 'D59.9':
        return 'Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism', 'D55-D59 Haemolytic anaemias'
    elif 'D60.0' <= icd10_code <= 'D64.9':
        return 'Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism', 'D60-D64 Aplastic and other anaemias'
    elif 'D65' <= icd10_code <= 'D65':
        return 'Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism', 'D65-D69 Coagulation defects, purpura and other haemorrhagic conditions'
    elif 'D65.0' <= icd10_code <= 'D69.9':
        return 'Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism', 'D65-D69 Coagulation defects, purpura and other haemorrhagic conditions'
    elif 'D70.0' <= icd10_code <= 'D77.9':
        return 'Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism', 'D70-D77 Other diseases of blood and blood-forming organs'
    elif 'D70' <= icd10_code <= 'D77':
        return 'Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism', 'D70-D77 Other diseases of blood and blood-forming organs'
    elif 'D80.0' <= icd10_code <= 'D89.9':
        return 'Chapter III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism', 'D80-D89 Certain disorders involving the immune mechanism'
    
    
    
    
    elif 'E00.0' <= icd10_code <= 'E07.9':
        return 'Chapter IV Endocrine, nutritional and metabolic diseases', 'E00-E07 Disorders of thyroid gland'
    elif 'E10.0' <= icd10_code <= 'E14.9':
        return 'Chapter IV Endocrine, nutritional and metabolic diseases', 'E10-E14 Diabetes mellitus'
    elif 'E15.0' <= icd10_code <= 'E16.9':
        return 'Chapter IV Endocrine, nutritional and metabolic diseases', 'E15-E16 Other disorders of glucose regulation and pancreatic internal secretion'
    elif 'E20.0' <= icd10_code <= 'E35.9':
        return 'Chapter IV Endocrine, nutritional and metabolic diseases', 'E20-E35 Disorders of other endocrine glands'
    elif 'E40.0' <= icd10_code <= 'E46.9':
        return 'Chapter IV Endocrine, nutritional and metabolic diseases', 'E40-E46 Malnutrition'
    elif 'E50.0' <= icd10_code <= 'E64.9':
        return 'Chapter IV Endocrine, nutritional and metabolic diseases', 'E50-E64 Other nutritional deficiencies'
    elif 'E65.0' <= icd10_code <= 'E68.9':
        return 'Chapter IV Endocrine, nutritional and metabolic diseases', 'E65-E68 Obesity and other hyperalimentation'
    elif 'E65' <= icd10_code <= 'E68':
        return 'Chapter IV Endocrine, nutritional and metabolic diseases', 'E65-E68 Obesity and other hyperalimentation'
    elif 'E70.0' <= icd10_code <= 'E90.9':
        return 'Chapter IV Endocrine, nutritional and metabolic diseases', 'E70-E90 Metabolic disorders'
    
    
    
    
    
    elif 'F00.0' <= icd10_code <= 'F09.9':
        return 'Chapter V Mental and behavioural disorders', 'F00-F09 Organic, including symptomatic, mental disorders'
    elif 'F10.0' <= icd10_code <= 'F19.9':
        return 'Chapter V Mental and behavioural disorders', 'F10-F19 Mental and behavioural disorders due to psychoactive substance use'
    elif 'F20.0' <= icd10_code <= 'F29.9':
        return 'Chapter V Mental and behavioural disorders', 'F20-F29 Schizophrenia, schizotypal and delusional disorders'
    elif 'F30.0' <= icd10_code <= 'F39.9':
        return 'Chapter V Mental and behavioural disorders', 'F30-F39 Mood [affective] disorders'
    elif 'F40.0' <= icd10_code <= 'F48.9':
        return 'Chapter V Mental and behavioural disorders', 'F40-F48 Neurotic, stress-related and somatoform disorders'
    elif 'F50.0' <= icd10_code <= 'F59.9':
        return 'Chapter V Mental and behavioural disorders', 'F50-F59 Behavioural syndromes associated with physiological disturbances and physical factors'
    elif 'F60.0' <= icd10_code <= 'F69.9':
        return 'Chapter V Mental and behavioural disorders', 'F60-F69 Disorders of adult personality and behaviour'
    elif 'F70.0' <= icd10_code <= 'F79.9':
        return 'Chapter V Mental and behavioural disorders', 'F70-F79 Mental retardation'
    elif 'F80.0' <= icd10_code <= 'F89.9':
        return 'Chapter V Mental and behavioural disorders', 'F80-F89 Disorders of psychological development'
    elif 'F90.0' <= icd10_code <= 'F98.9':
        return 'Chapter V Mental and behavioural disorders', 'F90-F98 Behavioural and emotional disorders with onset usually occurring in childhood and adolescence'
    elif 'F99.0' <= icd10_code <= 'F99.9':
        return 'Chapter V Mental and behavioural disorders', 'F99-F99 Unspecified mental disorder'
    elif 'F99' <= icd10_code <= 'F99':
        return 'Chapter V Mental and behavioural disorders', 'F99-F99 Unspecified mental disorder'
    
    
    
    
    elif 'G00.0' <= icd10_code <= 'G09.9':
        return 'Chapter VI Diseases of the nervous system', 'G00-G09 Inflammatory diseases of the central nervous system'
    elif 'G10.0' <= icd10_code <= 'G14.9':
        return 'Chapter VI Diseases of the nervous system', 'G10-G14 Systemic atrophies primarily affecting the central nervous system'
    elif 'G10' <= icd10_code <= 'G10':
        return 'Chapter VI Diseases of the nervous system', 'G10-G14 Systemic atrophies primarily affecting the central nervous system'
    elif 'G20.0' <= icd10_code <= 'G26.9':
        return 'Chapter VI Diseases of the nervous system', 'G20-G26 Extrapyramidal and movement disorders'
    elif 'G20' <= icd10_code <= 'G26':
        return 'Chapter VI Diseases of the nervous system', 'G20-G26 Extrapyramidal and movement disorders'
    elif 'G30.0' <= icd10_code <= 'G32.9':
        return 'Chapter VI Diseases of the nervous system', 'G30-G32 Other degenerative diseases of the nervous system'
    elif 'G35.0' <= icd10_code <= 'G37.9':
        return 'Chapter VI Diseases of the nervous system', 'G35-G37 Demyelinating diseases of the central nervous system'
    elif 'G35' <= icd10_code <= 'G37':
        return 'Chapter VI Diseases of the nervous system', 'G35-G37 Demyelinating diseases of the central nervous system'
    elif 'G40.0' <= icd10_code <= 'G47.9':
        return 'Chapter VI Diseases of the nervous system', 'G40-G47 Episodic and paroxysmal disorders'
    elif 'G50.0' <= icd10_code <= 'G59.9':
        return 'Chapter VI Diseases of the nervous system', 'G50-G59 Nerve, nerve root and plexus disorders'
    elif 'G60.0' <= icd10_code <= 'G64.9':
        return 'Chapter VI Diseases of the nervous system', 'G60-G64 Polyneuropathies and other disorders of the peripheral nervous system'
    elif 'G70.0' <= icd10_code <= 'G73.9':
        return 'Chapter VI Diseases of the nervous system', 'G70-G73 Diseases of myoneural junction and muscle'
    elif 'G80.0' <= icd10_code <= 'G83.9':
        return 'Chapter VI Diseases of the nervous system', 'G80-G83 Cerebral palsy and other paralytic syndromes'
    elif 'G90.0' <= icd10_code <= 'G99.9':
        return 'Chapter VI Diseases of the nervous system', 'G90-G99 Other disorders of the nervous system'
    
    
    
    elif 'H00.0' <= icd10_code <= 'H06.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H00-H06 Disorders of eyelid, lacrimal system and orbit'
    elif 'H10.0' <= icd10_code <= 'H13.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H10-H13 Disorders of conjunctiva'
    elif 'H15.0' <= icd10_code <= 'H22.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H15-H22 Disorders of sclera, cornea, iris and ciliary body'
    elif 'H25.0' <= icd10_code <= 'H28.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H25-H28 Disorders of lens'
    elif 'H30.0' <= icd10_code <= 'H36.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H30-H36 Disorders of choroid and retina'
    elif 'H40.0' <= icd10_code <= 'H42.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H40-H42 Glaucoma'
    elif 'H43.0' <= icd10_code <= 'H45.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H43-H45 Disorders of vitreous body and globe'
    elif 'H46.0' <= icd10_code <= 'H48.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H46-H48 Disorders of optic nerve and visual pathways'
    elif 'H49.0' <= icd10_code <= 'H52.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H49-H52 Disorders of ocular muscles, binocular movement, accommodation and refraction'
    elif 'H53.0' <= icd10_code <= 'H54.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H53-H54 Visual disturbances and blindness'
    elif 'H55.0' <= icd10_code <= 'H59.9':
        return 'Chapter VII Diseases of the eye and adnexa', 'H55-H59 Other disorders of eye and adnexa'
    elif 'H55' <= icd10_code <= 'H59':
        return 'Chapter VII Diseases of the eye and adnexa', 'H55-H59 Other disorders of eye and adnexa'
    
    
    
    
    elif 'H60.0' <= icd10_code <= 'H62.9':
        return 'Chapter VIII Diseases of the ear and mastoid process', 'H60-H62 Diseases of external ear'
    elif 'H46' <= icd10_code <= 'H46':
        return 'Chapter VIII Diseases of the ear and mastoid process', 'H46-H48 Disorders of optic nerve and visual pathways'
    elif 'H65.0' <= icd10_code <= 'H75.9':
        return 'Chapter VIII Diseases of the ear and mastoid process', 'H65-H75 Diseases of middle ear and mastoid'
    elif 'H80.0' <= icd10_code <= 'H83.9':
        return 'Chapter VIII Diseases of the ear and mastoid process', 'H80-H83 Diseases of inner ear'
    elif 'H90.0' <= icd10_code <= 'H95.9':
        return 'Chapter VIII Diseases of the ear and mastoid process', 'H90-H95 Other disorders of ear'
    
    
    
    elif 'I00.0' <= icd10_code <= 'I02.9':
        return 'Chapter IX Diseases of the circulatory system', 'I00-I02 Acute rheumatic fever'
    elif 'I00' <= icd10_code <= 'I02':
        return 'Chapter IX Diseases of the circulatory system', 'I00-I02 Acute rheumatic fever'
    elif 'I05.0' <= icd10_code <= 'I09.9':
        return 'Chapter IX Diseases of the circulatory system', 'I05-I09 Chronic rheumatic heart diseases'
    elif 'I10.0' <= icd10_code <= 'I15.9':
        return 'Chapter IX Diseases of the circulatory system', 'I10-I15 Hypertensive diseases'
    elif 'I10' <= icd10_code <= 'I15':
        return 'Chapter IX Diseases of the circulatory system', 'I10-I15 Hypertensive diseases'
    elif 'I20.0' <= icd10_code <= 'I25.9':
        return 'Chapter IX Diseases of the circulatory system', 'I20-I25 Ischaemic heart diseases'
    elif 'I26.0' <= icd10_code <= 'I28.9':
        return 'Chapter IX Diseases of the circulatory system', 'I26-I28 Pulmonary heart disease and diseases of pulmonary circulation'
    elif 'I30.0' <= icd10_code <= 'I52.9':
        return 'Chapter IX Diseases of the circulatory system', 'I30-I52 Other forms of heart disease'
    elif 'I60.0' <= icd10_code <= 'I69.9':
        return 'Chapter IX Diseases of the circulatory system', 'I60-I69 Cerebrovascular diseases'
    elif 'I70.0' <= icd10_code <= 'I79.9':
        return 'Chapter IX Diseases of the circulatory system', 'I70-I79 Diseases of arteries, arterioles and capillaries'
    elif 'I80.0' <= icd10_code <= 'I89.9':
        return 'Chapter IX Diseases of the circulatory system', 'I80-I89 Diseases of veins, lymphatic vessels and lymph nodes, not elsewhere classified'
    elif 'I95.0' <= icd10_code <= 'I99.9':
        return 'Chapter IX Diseases of the circulatory system', 'I95-I99 Other and unspecified disorders of the circulatory system'
    
    
    
    
    
    elif 'J00.0' <= icd10_code <= 'J06.9':
        return 'Chapter X Diseases of the respiratory system', 'J00-J06 Acute upper respiratory infections'
    elif 'J00' <= icd10_code <= 'J06':
        return 'Chapter X Diseases of the respiratory system', 'J00-J06 Acute upper respiratory infections'
    elif 'J09.0' <= icd10_code <= 'J18.9':
        return 'Chapter X Diseases of the respiratory system', 'J09-J18 Influenza and pneumonia'
    elif 'J09' <= icd10_code <= 'J18':
        return 'Chapter X Diseases of the respiratory system', 'J09-J18 Influenza and pneumonia'
    elif 'J20.0' <= icd10_code <= 'J22.9':
        return 'Chapter X Diseases of the respiratory system', 'J20-J22 Other acute lower respiratory infections'
    elif 'J30.0' <= icd10_code <= 'J39.9':
        return 'Chapter X Diseases of the respiratory system', 'J30-J39 Other diseases of upper respiratory tract'
    elif 'J40.0' <= icd10_code <= 'J47.9':
        return 'Chapter X Diseases of the respiratory system', 'J40-J47 Chronic lower respiratory diseases'
    elif 'J40' <= icd10_code <= 'J47':
        return 'Chapter X Diseases of the respiratory system', 'J40-J47 Chronic lower respiratory diseases'
    elif 'J60.0' <= icd10_code <= 'J70.9':
        return 'Chapter X Diseases of the respiratory system', 'J60-J70 Lung diseases due to external agents'
    elif 'J60' <= icd10_code <= 'J60':
        return 'Chapter X Diseases of the respiratory system', 'J60-J70 Lung diseases due to external agents'
    elif 'J80.0' <= icd10_code <= 'J84.9':
        return 'Chapter X Diseases of the respiratory system', 'J80-J84 Other respiratory diseases principally affecting the interstitium'
    elif 'J80' <= icd10_code <= 'J84':
        return 'Chapter X Diseases of the respiratory system', 'J80-J84 Other respiratory diseases principally affecting the interstitium'
    elif 'J85.0' <= icd10_code <= 'J86.9':
        return 'Chapter X Diseases of the respiratory system', 'J85-J86 Suppurative and necrotic conditions of lower respiratory tract'
    elif 'J90' <= icd10_code <= 'J94':
        return 'Chapter X Diseases of the respiratory system', 'J90-J94 Other diseases of pleura'
    elif 'J90.0' <= icd10_code <= 'J94.9':
        return 'Chapter X Diseases of the respiratory system', 'J90-J94 Other diseases of pleura'
    elif 'J95.0' <= icd10_code <= 'J99.9':
        return 'Chapter X Diseases of the respiratory system', 'J95-J99 Other diseases of the respiratory system'
    
    
    
    
    elif 'K00.0' <= icd10_code <= 'K14.9':
        return 'Chapter XI Diseases of the digestive system', 'K00-K14 Diseases of oral cavity, salivary glands and jaws'
    elif 'K20.0' <= icd10_code <= 'K31.9':
        return 'Chapter XI Diseases of the digestive system', 'K20-K31 Diseases of oesophagus, stomach and duodenum'
    elif 'K20' <= icd10_code <= 'K31':
        return 'Chapter XI Diseases of the digestive system', 'K20-K31 Diseases of oesophagus, stomach and duodenum'
    elif 'K35.0' <= icd10_code <= 'K38.9':
        return 'Chapter XI Diseases of the digestive system', 'K35-K38 Diseases of appendix'
    elif 'K40.0' <= icd10_code <= 'K46.9':
        return 'Chapter XI Diseases of the digestive system', 'K40-K46 Hernia'
    elif 'K50.0' <= icd10_code <= 'K52.9':
        return 'Chapter XI Diseases of the digestive system', 'K50-K52 Noninfective enteritis and colitis'
    elif 'K55.0' <= icd10_code <= 'K64.9':
        return 'Chapter XI Diseases of the digestive system', 'K55-K64 Other diseases of intestines'
    elif 'K65.0' <= icd10_code <= 'K67.9':
        return 'Chapter XI Diseases of the digestive system', 'K65-K67 Diseases of peritoneum'
    elif 'K70.0' <= icd10_code <= 'K77.9':
        return 'Chapter XI Diseases of the digestive system', 'K70-K77 Diseases of liver'
    elif 'K80.0' <= icd10_code <= 'K87.9':
        return 'Chapter XI Diseases of the digestive system', 'K80-K87 Disorders of gallbladder, biliary tract and pancreas'
    elif 'K90.0' <= icd10_code <= 'K93.9':
        return 'Chapter XI Diseases of the digestive system', 'K90-K93 Other diseases of the digestive system'
    
    
    
    
    
    elif 'L00.0' <= icd10_code <= 'L08.9':
        return 'Chapter XII Diseases of the skin and subcutaneous tissue', 'L00-L08 Infections of the skin and subcutaneous tissue'
    elif 'L10.0' <= icd10_code <= 'L14.9':
        return 'Chapter XII Diseases of the skin and subcutaneous tissue', 'L10-L14 Bullous disorders'
    elif 'L20.0' <= icd10_code <= 'L30.9':
        return 'Chapter XII Diseases of the skin and subcutaneous tissue', 'L20-L30 Dermatitis and eczema'
    elif 'L40.0' <= icd10_code <= 'L45.9':
        return 'Chapter XII Diseases of the skin and subcutaneous tissue', 'L40-L45 Papulosquamous disorders'
    elif 'L50.0' <= icd10_code <= 'L54.9':
        return 'Chapter XII Diseases of the skin and subcutaneous tissue', 'L50-L54 Urticaria and erythema'
    elif 'L55.0' <= icd10_code <= 'L59.9':
        return 'Chapter XII Diseases of the skin and subcutaneous tissue', 'L55-L59 Radiation-related disorders of the skin and subcutaneous tissue'
    elif 'L60.0' <= icd10_code <= 'L75.9':
        return 'Chapter XII Diseases of the skin and subcutaneous tissue', 'L60-L75 Disorders of skin appendages'
    elif 'L80.0' <= icd10_code <= 'L99.9':
        return 'Chapter XII Diseases of the skin and subcutaneous tissue', 'L80-L99 Other disorders of the skin and subcutaneous tissue'
    elif 'L80' <= icd10_code <= 'L99':
        return 'Chapter XII Diseases of the skin and subcutaneous tissue', 'L80-L99 Other disorders of the skin and subcutaneous tissue'
    
    
    
    elif 'M00.0' <= icd10_code <= 'M03.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M00-M03 Infectious arthropathies'
    elif 'M05.0' <= icd10_code <= 'M14.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M05-M14 Inflammatory polyarthropathies'
    elif 'M15.0' <= icd10_code <= 'M19.99':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M15-M19 Arthrosis'
    elif 'M20.0' <= icd10_code <= 'M25.99':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M20-M25 Other joint disorders'
    elif 'M30.0' <= icd10_code <= 'M36.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M30-M36 Systemic connective tissue disorders'
    elif 'M40.0' <= icd10_code <= 'M43.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M40-M43 Deforming dorsopathies'
    elif 'M43.99' <= icd10_code <= 'M43.99':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M40-M43 Deforming dorsopathies'
    elif 'M45.0' <= icd10_code <= 'M49.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M45-M49 Spondylopathies'
    elif 'M45' <= icd10_code <= 'M49':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M45-M49 Spondylopathies'
    elif 'M50.0' <= icd10_code <= 'M54.99':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M50-M54 Other dorsopathies'
    elif 'M60.0' <= icd10_code <= 'M63.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M60-M63 Disorders of muscles'
    elif 'M65.0' <= icd10_code <= 'M68.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M65-M68 Disorders of synovium and tendon'
    elif 'M70.0' <= icd10_code <= 'M79.99':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M70-M79 Other soft tissue disorders'
    elif 'M80.0' <= icd10_code <= 'M85.99':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M80-M85 Disorders of bone density and structure'
    elif 'M86.0' <= icd10_code <= 'M90.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M86-M90 Other osteopathies'
    elif 'M91.0' <= icd10_code <= 'M94.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M91-M94 Chondropathies'
    elif 'M95.0' <= icd10_code <= 'M99.9':
        return 'Chapter XIII Diseases of the musculoskeletal system and connective tissue', 'M95-M99 Other disorders of the musculoskeletal system and connective tissue'
    
    
    
    
    
    elif 'N00.0' <= icd10_code <= 'N08.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N00-N08 Glomerular diseases'
    elif 'N10.0' <= icd10_code <= 'N16.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N10-N16 Renal tubulo-interstitial diseases'
    elif 'N10' <= icd10_code <= 'N16':
        return 'Chapter XIV Diseases of the genitourinary system', 'N10-N16 Renal tubulo-interstitial diseases'
    elif 'N17.0' <= icd10_code <= 'N19.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N17-N19 Renal failure'
    elif 'N20.0' <= icd10_code <= 'N23.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N20-N23 Urolithiasis'
    elif 'N25.0' <= icd10_code <= 'N29.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N25-N29 Other disorders of kidney and ureter'
    elif 'N30.0' <= icd10_code <= 'N39.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N30-N39 Other diseases of urinary system'
    elif 'N40' <= icd10_code <= 'N51':
        return 'Chapter XIV Diseases of the genitourinary system', 'N40-N51 Diseases of male genital organs'
    elif 'N40.0' <= icd10_code <= 'N51.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N40-N51 Diseases of male genital organs'
    elif 'N60.0' <= icd10_code <= 'N64.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N60-N64 Disorders of breast'
    elif 'N70.0' <= icd10_code <= 'N77.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N70-N77 Inflammatory diseases of female pelvic organs'
    elif 'N80.0' <= icd10_code <= 'N98.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N80-N98 Noninflammatory disorders of female genital tract'
    elif 'N99.0' <= icd10_code <= 'N99.9':
        return 'Chapter XIV Diseases of the genitourinary system', 'N99-N99 Other disorders of the genitourinary system'
    
    
    
    
    elif 'O00.0' <= icd10_code <= 'O08.9':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O00-O08 Pregnancy with abortive outcome'
    elif 'O10.0' <= icd10_code <= 'O16.9':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O10-O16 Oedema, proteinuria and hypertensive disorders in pregnancy, childbirth and the puerperium'
    elif 'O20.0' <= icd10_code <= 'O29.9':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O20-O29 Other maternal disorders predominantly related to pregnancy'
    elif 'O30.0' <= icd10_code <= 'O48.9':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O30-O48 Maternal care related to the fetus and amniotic cavity and possible delivery problems'
    elif 'O60.0' <= icd10_code <= 'O75.9':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O60-O75 Complications of labour and delivery'
    elif 'O60' <= icd10_code <= 'O60':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O60-O75 Complications of labour and delivery'
    elif 'O80.0' <= icd10_code <= 'O84.9':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O80-O84 Delivery'
    elif 'O85.0' <= icd10_code <= 'O92.9':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O85-O92 Complications predominantly related to the puerperium'
    elif 'O85' <= icd10_code <= 'O85':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O85-O92 Complications predominantly related to the puerperium'
    elif 'O94.0' <= icd10_code <= 'O99.9':
        return 'Chapter XV Pregnancy, childbirth and the puerperium', 'O94-O99 Other obstetric conditions, not elsewhere classified'
    
    
    
    elif 'P00.0' <= icd10_code <= 'P04.9':
        return 'Chapter XVI Certain conditions originating in the perinatal period', 'P00-P04 Fetus and newborn affected by maternal factors and by complications of pregnancy, labour and delivery'
    elif 'P05.0' <= icd10_code <= 'P08.9':
        return 'Chapter XVI Certain conditions originating in the perinatal period', 'P05-P08 Disorders related to length of gestation and fetal growth'
    elif 'P10.0' <= icd10_code <= 'P15.9':
        return 'Chapter XVI Certain conditions originating in the perinatal period', 'P10-P15 Birth trauma'
    elif 'P20.0' <= icd10_code <= 'P29.9':
        return 'Chapter XVI Certain conditions originating in the perinatal period', 'P20-P29 Respiratory and cardiovascular disorders specific to the perinatal period'
    elif 'P35.0' <= icd10_code <= 'P39.9':
        return 'Chapter XVI Certain conditions originating in the perinatal period', 'P35-P39 Infections specific to the perinatal period'
    elif 'P50.0' <= icd10_code <= 'P61.9':
        return 'Chapter XVI Certain conditions originating in the perinatal period', 'P50-P61 Haemorrhagic and haematological disorders of fetus and newborn'
    elif 'P80.0' <= icd10_code <= 'P83.9':
        return 'Chapter XVI Certain conditions originating in the perinatal period', 'P80-P83 Conditions involving the integument and temperature regulation of fetus and newborn'
    elif 'P90.0' <= icd10_code <= 'P96.9':
        return 'Chapter XVI Certain conditions originating in the perinatal period', 'P90-P96 Other disorders originating in the perinatal period'
    
    
    
    elif 'Q00.0' <= icd10_code <= 'Q07.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q00-Q07 Congenital malformations of the nervous system'
    elif 'Q10.0' <= icd10_code <= 'Q18.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q10-Q18 Congenital malformations of eye, ear, face and neck'
    elif 'Q20.0' <= icd10_code <= 'Q28.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q20-Q28 Congenital malformations of the circulatory system'
    elif 'Q30.0' <= icd10_code <= 'Q34.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q30-Q34 Congenital malformations of the respiratory system'
    elif 'Q35.0' <= icd10_code <= 'Q37.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q35-Q37 Cleft lip and cleft palate'
    elif 'Q38.0' <= icd10_code <= 'Q45.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q38-Q45 Other congenital malformations of the digestive system'
    elif 'Q50.0' <= icd10_code <= 'Q56.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q50-Q56 Congenital malformations of genital organs'
    elif 'Q60.0' <= icd10_code <= 'Q64.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q60-Q64 Congenital malformations of the urinary system'
    elif 'Q65.0' <= icd10_code <= 'Q79.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q65-Q79 Congenital malformations and deformations of the musculoskeletal system'
    elif 'Q80.0' <= icd10_code <= 'Q89.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q80-Q89 Other congenital malformations'
    elif 'Q90.0' <= icd10_code <= 'Q99.9':
        return 'Chapter XVII Congenital malformations, deformations and chromosomal abnormalities', 'Q90-Q99 Chromosomal abnormalities, not elsewhere classified'
    
    
    
    
    
    elif 'R00.0' <= icd10_code <= 'R09.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R00-R09 Symptoms and signs involving the circulatory and respiratory systems'
    elif 'R10.0' <= icd10_code <= 'R19.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R10-R19 Symptoms and signs involving the digestive system and abdomen'
    elif 'R20.0' <= icd10_code <= 'R23.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R20-R23 Symptoms and signs involving the skin and subcutaneous tissue'
    elif 'R25.0' <= icd10_code <= 'R29.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R25-R29 Symptoms and signs involving the nervous and musculoskeletal systems'
    elif 'R30.0' <= icd10_code <= 'R39.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R30-R39 Symptoms and signs involving the urinary system'
    elif 'R40.0' <= icd10_code <= 'R46.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R40-R46 Symptoms and signs involving cognition, perception, emotional state and behaviour'
    elif 'R47.0' <= icd10_code <= 'R49.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R47-R49 Symptoms and signs involving speech and voice'
    elif 'R50.0' <= icd10_code <= 'R69.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R50-R69 General symptoms and signs'
    elif 'R70.0' <= icd10_code <= 'R79.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R70-R79 Abnormal findings on examination of blood, without diagnosis'
    elif 'R80' <= icd10_code <= 'R82':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R80-R82 Abnormal findings on examination of urine, without diagnosis'
    elif 'R80.0' <= icd10_code <= 'R82.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R80-R82 Abnormal findings on examination of urine, without diagnosis'
    elif 'R83.0' <= icd10_code <= 'R89.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R83-R89 Abnormal findings on examination of other body fluids, substances and tissues, without diagnosis'
    elif 'R90.0' <= icd10_code <= 'R94.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R90-R94 Abnormal findings on diagnostic imaging and in function studies, without diagnosis'
    elif 'R95.0' <= icd10_code <= 'R99.9':
        return 'Chapter XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified', 'R95-R99 Ill-defined and unknown causes of mortality'
    
    
    
    elif 'S00.0' <= icd10_code <= 'S09.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S00-S09 Injuries to the head'
    elif 'S10.0' <= icd10_code <= 'S19.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S10-S19 Injuries to the neck'
    elif 'S20.0' <= icd10_code <= 'S29.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S20-S29 Injuries to the thorax'
    elif 'S30.0' <= icd10_code <= 'S39.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S30-S39 Injuries to the abdomen, lower back, lumbar spine and pelvis'
    elif 'S40.0' <= icd10_code <= 'S49.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S40-S49 Injuries to the shoulder and upper arm'
    elif 'S50.0' <= icd10_code <= 'S59.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S50-S59 Injuries to the elbow and forearm'
    elif 'S60.0' <= icd10_code <= 'S69.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S60-S69 Injuries to the wrist and hand'
    elif 'S70.0' <= icd10_code <= 'S79.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S70-S79 Injuries to the hip and thigh'
    elif 'S80.0' <= icd10_code <= 'S89.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S80-S89 Injuries to the knee and lower leg'
    elif 'S90.0' <= icd10_code <= 'S99.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'S90-S99 Injuries to the ankle and foot'
    elif 'T00.0' <= icd10_code <= 'T07.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T00-T07 Injuries involving multiple body regions'
    elif 'T08' <= icd10_code <= 'T14':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T08-T14 Injuries to unspecified part of trunk, limb or body region'
    elif 'T08.0' <= icd10_code <= 'T14.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T08-T14 Injuries to unspecified part of trunk, limb or body region'
    elif 'T15.0' <= icd10_code <= 'T19.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T15-T19 Effects of foreign body entering through natural orifice'
    elif 'T20.0' <= icd10_code <= 'T25.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T20-T25 Burns and corrosions of external body surface, specified by site'
    elif 'T26.0' <= icd10_code <= 'T28.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T26-T28 Burns and corrosions confined to eye and internal organs'
    elif 'T29.0' <= icd10_code <= 'T32.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T29-T32 Burns and corrosions of multiple and unspecified body regions'
    elif 'T33.0' <= icd10_code <= 'T35.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T33-T35 Frostbite'
    elif 'T36.0' <= icd10_code <= 'T50.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T36-T50 Poisoning by drugs, medicaments and biological substances'
    elif 'T51.0' <= icd10_code <= 'T65.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T51-T65 Toxic effects of substances chiefly nonmedicinal as to source'
    elif 'T66.0' <= icd10_code <= 'T78.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T66-T78 Other and unspecified effects of external causes'
    elif 'T66' <= icd10_code <= 'T66':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T66-T78 Other and unspecified effects of external causes'
    elif 'T79.0' <= icd10_code <= 'T79.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T79-T79 Certain early complications of trauma'
    elif 'T80.0' <= icd10_code <= 'T88.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T80-T88 Complications of surgical and medical care, not elsewhere classified'
    elif 'T90.0' <= icd10_code <= 'T98.9':
        return 'Chapter XIX Injury, poisoning and certain other consequences of external causes', 'T90-T98 Sequelae of injuries, of poisoning and of other consequences of external causes'    
   



    elif 'V01.0' <= icd10_code <= 'V09.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V01-V09 Pedestrian injured in transport accident'
    elif 'V10.0' <= icd10_code <= 'V19.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V10-V19 Pedal cyclist injured in transport accident'
    elif 'V20.0' <= icd10_code <= 'V29.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V20-V29 Motorcycle rider injured in transport accident'
    elif 'V30.0' <= icd10_code <= 'V39.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V30-V39 Occupant of three-wheeled motor vehicle injured in transport accident'
    elif 'V40.0' <= icd10_code <= 'V49.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V40-V49 Car occupant injured in transport accident'
    elif 'V50.0' <= icd10_code <= 'V59.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V50-V59 Occupant of pick-up truck or van injured in transport accident'
    elif 'V60.0' <= icd10_code <= 'V69.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V60-V69 Occupant of heavy transport vehicle injured in transport accident'
    elif 'V70.0' <= icd10_code <= 'V79.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V70-V79 Bus occupant injured in transport accident'
    elif 'V80.0' <= icd10_code <= 'V89.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V80-V89 Other land transport accidents'
    elif 'V90.0' <= icd10_code <= 'V94.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V90-V94 Water transport accidents'
    elif 'V95.0' <= icd10_code <= 'V97.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V95-V97 Air and space transport accidents'
    elif 'V98.0' <= icd10_code <= 'V99.9':
        return 'Chapter XX External causes of morbidity and mortality', 'V98-V99 Other and unspecified transport accidents'
    elif 'W00.0' <= icd10_code <= 'W19.9':
        return 'Chapter XX External causes of morbidity and mortality', 'W00-W19 Falls'
    elif 'W20.0' <= icd10_code <= 'W49.9':
        return 'Chapter XX External causes of morbidity and mortality', 'W20-W49 Exposure to inanimate mechanical forces'
    elif 'W50.0' <= icd10_code <= 'W64.9':
        return 'Chapter XX External causes of morbidity and mortality', 'W50-W64 Exposure to animate mechanical forces'
    elif 'W65.0' <= icd10_code <= 'W74.9':
        return 'Chapter XX External causes of morbidity and mortality', 'W65-W74 Accidental drowning and submersion'
    elif 'W75.0' <= icd10_code <= 'W84.9':
        return 'Chapter XX External causes of morbidity and mortality', 'W75-W84 Other accidental threats to breathing'
    elif 'W85.0' <= icd10_code <= 'W99.9':
        return 'Chapter XX External causes of morbidity and mortality', 'W85-W99 Exposure to electric current, radiation and extreme ambient air temperature and pressure'
    elif 'X00.0' <= icd10_code <= 'X09.9':
        return 'Chapter XX External causes of morbidity and mortality', 'X00-X09 Exposure to smoke, fire and flames'
    elif 'X10.0' <= icd10_code <= 'X19.9':
        return 'Chapter XX External causes of morbidity and mortality', 'X10-X19 Contact with heat and hot substances'
    elif 'X20.0' <= icd10_code <= 'X29.9':
        return 'Chapter XX External causes of morbidity and mortality', 'X20-X29 Contact with venomous animals and plants'
    elif 'X30.0' <= icd10_code <= 'X39.9':
        return 'Chapter XX External causes of morbidity and mortality', 'X30-X39 Exposure to forces of nature'
    elif 'X40.0' <= icd10_code <= 'X49.9':
        return 'Chapter XX External causes of morbidity and mortality', 'X40-X49 Accidental poisoning by and exposure to noxious substances'
    elif 'X50.0' <= icd10_code <= 'X57.9':
        return 'Chapter XX External causes of morbidity and mortality', 'X50-X57 Overexertion, travel and privation'
    elif 'X58.0' <= icd10_code <= 'X59.9':
        return 'Chapter XX External causes of morbidity and mortality', 'X58-X59 Accidental exposure to other and unspecified factors'
    elif 'X60.0' <= icd10_code <= 'X84.9':
        return 'Chapter XX External causes of morbidity and mortality', 'X60-X84 Intentional self-harm'
    elif 'X85.0' <= icd10_code <= 'X99.9':
        return 'Chapter XX External causes of morbidity and mortality', 'X85-X99 Assault'
    elif 'Y00.0' <= icd10_code <= 'Y09.9':
        return 'Chapter XX External causes of morbidity and mortality', 'Y00-Y09 Assault'
    elif 'Y10.0' <= icd10_code <= 'Y34.9':
        return 'Chapter XX External causes of morbidity and mortality', 'Y10-Y34 Event of undetermined intent'
    elif 'Y35.0' <= icd10_code <= 'Y36.9':
        return 'Chapter XX External causes of morbidity and mortality', 'Y35-Y36 Legal intervention and operations of war'
    elif 'Y40.0' <= icd10_code <= 'Y59.9':
        return 'Chapter XX External causes of morbidity and mortality', 'Y40-Y59 Drugs, medicaments and biological substances causing adverse effects in therapeutic use'
    elif 'Y60.0' <= icd10_code <= 'Y69.9':
        return 'Chapter XX External causes of morbidity and mortality', 'Y60-Y69 Misadventures to patients during surgical and medical care'
    elif 'Y70.0' <= icd10_code <= 'Y82.9':
        return 'Chapter XX External causes of morbidity and mortality', 'Y70-Y82 Medical devices associated with adverse incidents in diagnostic and therapeutic use'
    elif 'Y83.0' <= icd10_code <= 'Y84.9':
        return 'Chapter XX External causes of morbidity and mortality', 'Y83-Y84 Surgical and other medical procedures as the cause of abnormal reaction of the patient, or of later complication, without mention of misadventure at the time of the procedure'
    elif 'Y85.0' <= icd10_code <= 'Y89.9':
        return 'Chapter XX External causes of morbidity and mortality', 'Y85-Y89 Sequelae of external causes of morbidity and mortality'
    elif 'Y90.0' <= icd10_code <= 'Y98.9':
        return 'Chapter XX External causes of morbidity and mortality', 'Y90-Y98 Supplementary factors related to causes of morbidity and mortality classified elsewhere'
    
    
    
    
    
    elif 'Z00.0' <= icd10_code <= 'Z13.9':
        return 'Chapter XXI Factors influencing health status and contact with health services', 'Z00-Z13 Persons encountering health services for examination and investigation'
    elif 'Z20.0' <= icd10_code <= 'Z29.9':
        return 'Chapter XXI Factors influencing health status and contact with health services', 'Z20-Z29 Persons with potential health hazards related to communicable diseases'
    elif 'Z30.0' <= icd10_code <= 'Z39.9':
        return 'Chapter XXI Factors influencing health status and contact with health services', 'Z30-Z39 Persons encountering health services in circumstances related to reproduction'
    elif 'Z40.0' <= icd10_code <= 'Z54.9':
        return 'Chapter XXI Factors influencing health status and contact with health services', 'Z40-Z54 Persons encountering health services for specific procedures and health care'
    elif 'Z55.0' <= icd10_code <= 'Z65.9':
        return 'Chapter XXI Factors influencing health status and contact with health services', 'Z55-Z65 Persons with potential health hazards related to socioeconomic and psychosocial circumstances'
    elif 'Z70.0' <= icd10_code <= 'Z76.9':
        return 'Chapter XXI Factors influencing health status and contact with health services', 'Z70-Z76 Persons encountering health services in other circumstances'
    elif 'Z80.0' <= icd10_code <= 'Z99.9':
        return 'Chapter XXI Factors influencing health status and contact with health services', 'Z80-Z99 Persons with potential health hazards related to family and personal history and certain conditions influencing health status'
    
    
    
    
    elif 'U00.0' <= icd10_code <= 'U49.9':
        return 'Chapter XXII Codes for special purposes', 'U00-U49 Provisional assignment of new diseases of uncertain etiology or emergency use'
    elif 'U51' <= icd10_code <= 'U51':
        return 'Chapter XXII Codes for special purposes', 'U00-U49 Provisional assignment of new diseases of uncertain etiology or emergency use'
    elif 'U51.0' <= icd10_code <= 'U51.0':
        return 'Chapter XXII Codes for special purposes', 'U00-U49 Provisional assignment of new diseases of uncertain etiology or emergency use'
    elif 'U51.1' <= icd10_code <= 'U51.1':
        return 'Chapter XXII Codes for special purposes', 'U00-U49 Provisional assignment of new diseases of uncertain etiology or emergency use'
    elif 'U82.0' <= icd10_code <= 'U85.9':
        return 'Chapter XXII Codes for special purposes', 'U00-U49 Provisional assignment of new diseases of uncertain etiology or emergency use'
    elif 'U80.0' <= icd10_code <= 'U80.9':
        return 'Chapter XXII Codes for special purposes', 'U00-U49 Provisional assignment of new diseases of uncertain etiology or emergency use'
    elif 'U81.0' <= icd10_code <= 'U81.9':
        return 'Chapter XXII Codes for special purposes', 'U00-U49 Provisional assignment of new diseases of uncertain etiology or emergency use'
    elif 'U88' <= icd10_code <= 'U89':
        return 'Chapter XXII Codes for special purposes', 'U82-U85 Resistance to antimicrobial and antineoplastic drugs'
    elif 'U89.0' <= icd10_code <= 'U89.9':
        return 'Chapter XXII Codes for special purposes', 'U82-U85 Resistance to antimicrobial and antineoplastic drugs'
    else:
        return None, None
    
    




# Apply the categorization function to create new columns.
# categorize_icd10 returns a (chapter, sub-chapter) tuple per code, with
# (None, None) for codes that match no range. The tuples are expanded into a
# two-column frame in one step instead of building a pd.Series per row, which
# is much slower on large cohorts and fails on an empty frame.
chapter_pairs = heart_failure_Matched_cohort_with_Comorbidities['Combined ICD10 Codes'].apply(categorize_icd10)
heart_failure_Matched_cohort_with_Comorbidities[['Diseases Chapter', 'Diseases Sub-Chapter']] = pd.DataFrame(
    chapter_pairs.tolist(),
    index=chapter_pairs.index,  # keep row alignment with the cohort frame
    columns=['Diseases Chapter', 'Diseases Sub-Chapter'],
)

heart_failure_Matched_cohort_with_Comorbidities.head(3)

### <center> Check "None" or "NaN" values in either "Diseases Chapter" or "Diseases Sub-Chapter"

In [None]:
# Display rows with "None" or "NaN" values in either "Diseases Chapter" or "Diseases Sub-Chapter",
# i.e. rows whose code was not assigned a chapter/sub-chapter.
chapter_columns = heart_failure_Matched_cohort_with_Comorbidities[['Diseases Chapter', 'Diseases Sub-Chapter']]

# A cell counts as uncategorised when it is missing (NaN/None) or holds the literal string 'None'.
uncategorised_mask = (chapter_columns.isna() | chapter_columns.eq('None')).any(axis=1)
filtered_df = heart_failure_Matched_cohort_with_Comorbidities[uncategorised_mask]

filtered_df

### <center> Drop duplicates based on "Participant ID" and "Combined ICD10 Codes"

In [None]:
# Drop duplicates based on "Participant ID" and "Combined ICD10 Codes"
# (each participant keeps one row per distinct code).
heart_failure_Matched_cohort_with_Comorbidities = heart_failure_Matched_cohort_with_Comorbidities.drop_duplicates(
    subset=['Participant ID', 'Combined ICD10 Codes']
)

# Per-column data-quality summary: one row per metric, one column per cohort column.
#   Unique Count - number of distinct non-NaN values
#   Row Count    - number of rows
#   NaN Count    - number of missing values
#   Empty Count  - number of empty-string values
result_heart_failure_Matched_cohort_with_Comorbidities = pd.DataFrame(
    {
        column: [
            heart_failure_Matched_cohort_with_Comorbidities[column].nunique(),
            len(heart_failure_Matched_cohort_with_Comorbidities[column]),
            heart_failure_Matched_cohort_with_Comorbidities[column].isna().sum(),
            heart_failure_Matched_cohort_with_Comorbidities[column].eq('').sum(),
        ]
        for column in heart_failure_Matched_cohort_with_Comorbidities.columns
    },
    index=['Unique Count', 'Row Count', 'NaN Count', 'Empty Count'],
)

# Display the summary, then a preview of the de-duplicated cohort
display(result_heart_failure_Matched_cohort_with_Comorbidities)
print()
print()

display(heart_failure_Matched_cohort_with_Comorbidities.head(2))

### <center> Create "ICD10 Codes Range" Variable

In [None]:
# Pull the leading code range (e.g. "O20-O29") out of each "Diseases Sub-Chapter" label.
# Labels that do not contain such a range (including missing labels) become NaN.
sub_chapter_labels = heart_failure_Matched_cohort_with_Comorbidities['Diseases Sub-Chapter']
heart_failure_Matched_cohort_with_Comorbidities['ICD10 Codes Range'] = sub_chapter_labels.str.extract(
    r'([A-Z]\d+-[A-Z]\d+)', expand=False
)
heart_failure_Matched_cohort_with_Comorbidities.head(3)

In [None]:
# Display rows with NaN values in any of the disease, code or chapter columns
required_columns = ['Combined ICD10 Diseases', 'Combined ICD10 Codes', 'Diseases Sub-Chapter', 'Diseases Chapter']
has_missing_value = heart_failure_Matched_cohort_with_Comorbidities[required_columns].isna().any(axis=1)
nan_rows = heart_failure_Matched_cohort_with_Comorbidities[has_missing_value]
nan_rows

### <center> Combine "Combined ICD10 Codes" and "Combined ICD10 Diseases" into a new column named "ICD10 Codes Diseases"

In [None]:
# Build the "<code> - <disease>" label used to identify each diagnosis.
# A row with a missing code or disease name gets a missing label.
icd10_codes = heart_failure_Matched_cohort_with_Comorbidities['Combined ICD10 Codes']
icd10_diseases = heart_failure_Matched_cohort_with_Comorbidities['Combined ICD10 Diseases']
heart_failure_Matched_cohort_with_Comorbidities['ICD10 Codes Diseases'] = icd10_codes + ' - ' + icd10_diseases

heart_failure_Matched_cohort_with_Comorbidities.head(3)

### <center> Save the Data as csv

In [None]:
#### Specify the file path where you want to save the CSV file
#file_path = []
#file_path = 'Heart Failure Matched cohort with Comorbidities.csv'

#### Use the to_csv method to save the DataFrame as a CSV file
#### NOTE: this export is currently disabled. A later cell loads
#### 'Heart Failure Matched cohort with Comorbidities.csv' with pd.read_csv, so the file
#### must already exist on disk; re-enable the lines in this cell to regenerate it.
#heart_failure_Matched_cohort_with_Comorbidities.to_csv(file_path, index=False)  # Set index=False to exclude the index column

In [None]:
#import pandas as pd
#heart_failure_Matched_cohort_with_Comorbidities = []
#heart_failure_Matched_cohort_with_Comorbidities = pd.read_csv('Heart Failure Matched cohort with Comorbidities.csv')

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Heart Failure Sunburst Plots </center></h2>
</div>

In [None]:
# Load the control-cohort comorbidity table prepared for the sunburst plot
heart_failure_Control_with_comorbidities_model_plotting = pd.read_csv(
    'Heart Failure Control Cohort with Comorbidities Model (for Sunburst Plot).csv'
)

# Calculate the sum of the 'Record Count' column (all rows, before any filtering)
total_count = heart_failure_Control_with_comorbidities_model_plotting['Record Count'].sum()

# Display the result
print("Total Record Count:", total_count)

heart_failure_Control_with_comorbidities_model_plotting.head(3)

In [None]:
# Number of rows in the sunburst dataset
row_length = len(heart_failure_Control_with_comorbidities_model_plotting)
print(row_length)

### <center> Remove Heart Failure ICD10 Codes from Sunburst Dataset

**I50.0**, **I50.1** and all other ICD10 codes are removed; only the selected comorbidities are kept.

In [None]:
# Define the list of ICD10 codes to keep (the selected comorbidities).
# Every other code is dropped from the sunburst dataset.
codes_to_keep = ["E78.0", "I25.8", "J45.9", "I48", "I34.0", "I25.1", "I10", "E11.9"]

# The text before the first space of 'ICD10 Codes Diseases' is treated as the ICD10 code.
# Keep only rows whose code is exactly one of codes_to_keep (an exact match, not a
# prefix match: "I48" does not keep "I48.1"). Rows with a missing label are dropped.
leading_code = heart_failure_Control_with_comorbidities_model_plotting['ICD10 Codes Diseases'].str.split(' ', n=1).str[0]
heart_failure_Control_with_comorbidities_model_plotting = heart_failure_Control_with_comorbidities_model_plotting[
    leading_code.isin(codes_to_keep)
]

# Calculate the sum of the 'Record Count' column for the remaining rows
total_count = heart_failure_Control_with_comorbidities_model_plotting['Record Count'].sum()

# Display the result
print("Total Record Count:", total_count)

# Sort the DataFrame by 'Record Count' in descending order
heart_failure_Control_with_comorbidities_model_plotting = heart_failure_Control_with_comorbidities_model_plotting.sort_values(
    by='Record Count', ascending=False
)

# Display the filtered, sorted table
heart_failure_Control_with_comorbidities_model_plotting

In [None]:
# Number of rows in the sunburst dataset at this point in the notebook
row_length = len(heart_failure_Control_with_comorbidities_model_plotting)
print(row_length)

### <center> Display Sunburst plot

In [None]:
import webbrowser

import plotly.express as px

# File the interactive chart is written to and then opened from
sunburst_html_path = 'heart_failure_Control_with_comorbidities_sunburst_chart.html'

# Three-level hierarchy (chapter -> sub-chapter -> individual diagnosis),
# with each segment sized by its 'Record Count'.
fig = px.sunburst(
    heart_failure_Control_with_comorbidities_model_plotting,
    path=['Diseases Chapter', 'Diseases Sub-Chapter', 'ICD10 Codes Diseases'],
    values='Record Count',
)
fig.update_layout(width=1600, height=1200)

# Save the figure as an HTML file
fig.write_html(sunburst_html_path)

# Open the HTML file in a new tab in the default web browser (new=2 requests a new tab)
webbrowser.open(sunburst_html_path, new=2)

### <center> Create the "post_heart_failure_matched_control_comorbidities_rows" dataset

In [None]:
import pandas as pd

# Reload the matched cohort from disk. The cell that writes this CSV (to_csv) is
# commented out earlier in the notebook, so the file must already exist.
heart_failure_Matched_cohort_with_Comorbidities = pd.read_csv('Heart Failure Matched cohort with Comorbidities.csv')
heart_failure_Matched_cohort_with_Comorbidities.head(3)

In [None]:
import pandas as pd

# Work on a copy so the loaded cohort frame is left untouched
result_df = heart_failure_Matched_cohort_with_Comorbidities.copy()

# Convert "Combined ICD10 Diagnosis Date" to datetime so dates can be compared chronologically
result_df["Combined ICD10 Diagnosis Date"] = pd.to_datetime(result_df["Combined ICD10 Diagnosis Date"])

# NOTE(review): post_copd_rows is not used in the visible part of this cell; it looks like
# a leftover from a COPD version of this analysis - confirm before removing.
post_copd_rows = pd.DataFrame()

# List of heart_failure ICD10 codes to identify heart_failure diagnoses
heart_failure_icd10_codes = ['I50.0', 'I50.1']

# Function to filter rows diagnosed after the first heart_failure diagnosis
def filter_post_heart_failure_rows(group, heart_failure_codes=None):
    """Return the rows of one participant diagnosed after their first heart failure diagnosis.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single participant. Must contain "Combined ICD10 Codes" and a
        datetime-typed "Combined ICD10 Diagnosis Date" column.
    heart_failure_codes : list of str, optional
        ICD10 codes that count as heart failure. Defaults to the module-level
        ``heart_failure_icd10_codes`` list, so the function can still be passed
        directly to ``groupby(...).apply``.

    Returns
    -------
    pandas.DataFrame
        Rows whose diagnosis date is strictly later than the earliest heart
        failure diagnosis date, in their original order. The result is empty,
        but keeps the columns of ``group``, when the participant has no heart
        failure row.
    """
    if heart_failure_codes is None:
        heart_failure_codes = heart_failure_icd10_codes

    diagnosis_dates = group["Combined ICD10 Diagnosis Date"]
    is_heart_failure = group["Combined ICD10 Codes"].isin(heart_failure_codes)

    if not is_heart_failure.any():
        # Empty slice rather than pd.DataFrame(): keeps columns and dtypes so the
        # combined groupby result always has the expected schema.
        return group.iloc[0:0]

    # "First" means earliest by date, not first by row position, so the result
    # does not depend on how the rows happen to be ordered. Using the date value
    # directly also avoids a label lookup that breaks on duplicated index labels.
    first_heart_failure_date = diagnosis_dates[is_heart_failure].min()
    return group[diagnosis_dates > first_heart_failure_date]

# Apply the function to each group of "Participant ID".
# The groupby is iterated directly instead of using groupby.apply: apply operating
# on the grouping column is deprecated in recent pandas, whereas iteration always
# hands the function the complete sub-frame ("Participant ID" included).
post_frames = [
    filter_post_heart_failure_rows(participant_rows)
    for _, participant_rows in result_df.groupby("Participant ID")
]
post_frames = [frame for frame in post_frames if not frame.empty]
if post_frames:
    post_heart_failure_rows = pd.concat(post_frames, ignore_index=True)
else:
    # No participant has rows after a heart_failure diagnosis: keep the columns, no rows
    post_heart_failure_rows = result_df.iloc[0:0].copy()

# Sort the data by "Participant ID" and "Combined ICD10 Diagnosis Date"
post_heart_failure_rows = post_heart_failure_rows.sort_values(by=['Participant ID', 'Combined ICD10 Diagnosis Date'])

# Create a new DataFrame for post-heart_failure with comorbidities
Post_heart_failure_with_comorbidities = post_heart_failure_rows.copy()

# Mask for heart_failure records among the post rows (these can only be repeat
# records, because the first diagnosis itself is excluded by the strict "after" filter)
heart_failure_mask = Post_heart_failure_with_comorbidities['Combined ICD10 Codes'].isin(heart_failure_icd10_codes)

# 'heart_failure Diagnosis Date' = earliest heart_failure-coded date per participant,
# looked up in the FULL table. It must not be derived from the post rows: they no
# longer contain the first diagnosis, so participants with a single heart_failure
# record would get no date at all and be dropped by the follow-up filter below.
first_heart_failure_dates = (
    result_df[result_df['Combined ICD10 Codes'].isin(heart_failure_icd10_codes)]
    .groupby('Participant ID')['Combined ICD10 Diagnosis Date']
    .min()
)
Post_heart_failure_with_comorbidities['heart_failure Diagnosis Date'] = pd.to_datetime(
    Post_heart_failure_with_comorbidities['Participant ID'].map(first_heart_failure_dates)
)

# Calculate 'heart_failure Matched followup Time' as the time difference in years
Post_heart_failure_with_comorbidities['heart_failure Matched followup Time'] = (
    (
        Post_heart_failure_with_comorbidities['Combined ICD10 Diagnosis Date']
        - Post_heart_failure_with_comorbidities['heart_failure Diagnosis Date']
    ).dt.days / 365.25
).round(1)

# Filter out rows with 'heart_failure Matched followup Time' less than or equal to 0.
# NOTE(review): the comparison is made on the value rounded to 0.1 year, so rows
# dated within roughly the first 18 days after diagnosis round to 0.0 and are
# removed here - confirm this is intended.
Post_heart_failure_with_comorbidities_forest_plotting = Post_heart_failure_with_comorbidities[Post_heart_failure_with_comorbidities['heart_failure Matched followup Time'] > 0]

# Display unique values of "heart_failure Matched followup Time"
heart_failure_followup_times = Post_heart_failure_with_comorbidities_forest_plotting["heart_failure Matched followup Time"].unique()
print("heart_failure Matched followup Time values:")
display(heart_failure_followup_times)
print()
print()

# Display rows where "heart_failure Matched followup Time" is less than or equal to 0 (should be none)
rows_with_non_positive_followup = Post_heart_failure_with_comorbidities_forest_plotting[Post_heart_failure_with_comorbidities_forest_plotting["heart_failure Matched followup Time"] <= 0]
print("\nRows with heart_failure Matched followup Time less than or equal to 0:")
display(rows_with_non_positive_followup)
print()
print()

print("\nPost_heart_failure with comorbidities forest plotting DataFrame:")
Post_heart_failure_with_comorbidities_forest_plotting.head(3)

In [None]:
# Extract unique values under the "Combined ICD10 Codes" column
unique_icd10_codes_Post_heart_failure_with_comorbidities = Post_heart_failure_with_comorbidities_forest_plotting['Combined ICD10 Codes'].unique()

# Print the unique ICD10 codes
print("Unique ICD10 codes in Post_heart_failure_with_comorbidities_forest_plotting:")
print(unique_icd10_codes_Post_heart_failure_with_comorbidities)

In [None]:
# Count of distinct participants in the post-heart_failure frame (a number, not a list of IDs)
unique_participant_ids = Post_heart_failure_with_comorbidities_forest_plotting["Participant ID"].nunique()
display("Number of unique Participant IDs:", unique_participant_ids)

In [None]:
# Specify the file path where you want to save the CSV file
#file_path = []
#file_path = 'Post heart_failure COMMON with comorbidities.csv'

# Use the to_csv method to save the DataFrame as a CSV file
#Post_heart_failure_with_comorbidities_forest_plotting.to_csv(file_path, index=False)  # Set index=False to exclude the index column

### <center> Create the "pre_heart_failure_matched_control_comorbidities_rows" dataset

In [None]:
import pandas as pd

# Reload the heart-failure matched cohort with comorbidities from CSV and preview it
heart_failure_Matched_cohort_with_Comorbidities = pd.read_csv('Heart Failure Matched cohort with Comorbidities.csv')
heart_failure_Matched_cohort_with_Comorbidities.head(3)

In [None]:
import pandas as pd

# Work on a copy so the loaded cohort frame itself is left untouched
result_df = heart_failure_Matched_cohort_with_Comorbidities.copy()

# Convert "Combined ICD10 Diagnosis Date" to datetime
result_df["Combined ICD10 Diagnosis Date"] = pd.to_datetime(result_df["Combined ICD10 Diagnosis Date"])

# pre_heart_failure_rows is assigned further down in this cell, once the
# per-participant filter has been applied; no empty placeholder is needed here.

# List of heart_failure ICD10 codes to identify heart_failure diagnoses
heart_failure_icd10_codes = ['I50.0', 'I50.1']

# Function to filter rows diagnosed before and on the same date as heart_failure
def filter_pre_heart_failure_rows(group):
    """Return the rows of ``group`` dated on or before its earliest heart_failure diagnosis.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one participant. Must contain the columns "Combined ICD10 Codes"
        and "Combined ICD10 Diagnosis Date" (the latter comparable, e.g. datetime).

    Returns
    -------
    pandas.DataFrame
        The rows whose diagnosis date is not later than the earliest date carried
        by a row coded with one of ``heart_failure_icd10_codes``. An empty frame
        with the same columns as ``group`` is returned when the group has no
        heart_failure row (or that row has no usable date).
    """
    date_column = "Combined ICD10 Diagnosis Date"
    is_heart_failure = group["Combined ICD10 Codes"].isin(heart_failure_icd10_codes)

    # Use the earliest heart_failure DATE rather than the first heart_failure row in
    # file order: the input is not guaranteed to be date-sorted, and a label lookup
    # would also break on duplicate index labels.
    first_heart_failure_date = group.loc[is_heart_failure, date_column].min()
    if pd.isna(first_heart_failure_date):
        # Keep the columns so that concatenating per-participant results always
        # yields a frame with the expected schema.
        return group.iloc[0:0]

    return group[group[date_column] <= first_heart_failure_date]

# Apply the function to each group of "Participant ID".
# The groupby is iterated directly instead of using groupby.apply: apply operating
# on the grouping column is deprecated in recent pandas, whereas iteration always
# hands the function the complete sub-frame ("Participant ID" included).
pre_frames = [
    filter_pre_heart_failure_rows(participant_rows)
    for _, participant_rows in result_df.groupby("Participant ID")
]
pre_frames = [frame for frame in pre_frames if not frame.empty]
if pre_frames:
    pre_heart_failure_rows = pd.concat(pre_frames, ignore_index=True)
else:
    # No participant has a heart_failure diagnosis: keep the columns, no rows
    pre_heart_failure_rows = result_df.iloc[0:0].copy()

# Sort the data by "Participant ID" and "Combined ICD10 Diagnosis Date"
pre_heart_failure_rows = pre_heart_failure_rows.sort_values(by=['Participant ID', 'Combined ICD10 Diagnosis Date'])

# Create a new DataFrame for pre-heart_failure with comorbidities
Pre_heart_failure_with_comorbidities = pre_heart_failure_rows.copy()

# Mask for identifying heart_failure diagnosis
heart_failure_mask = Pre_heart_failure_with_comorbidities['Combined ICD10 Codes'].isin(heart_failure_icd10_codes)

# 'heart_failure Diagnosis Date' = earliest heart_failure-coded date of each
# participant, broadcast to all of that participant's rows. A per-participant
# minimum avoids the object-dtype None column and the ffill/bfill pass, which
# could hand a row a later heart_failure date when several are present.
first_heart_failure_dates = (
    Pre_heart_failure_with_comorbidities.loc[heart_failure_mask]
    .groupby('Participant ID')['Combined ICD10 Diagnosis Date']
    .min()
)
Pre_heart_failure_with_comorbidities['heart_failure Diagnosis Date'] = pd.to_datetime(
    Pre_heart_failure_with_comorbidities['Participant ID'].map(first_heart_failure_dates)
)

# Calculate 'heart_failure Matched followup Time' as the time difference in years
Pre_heart_failure_with_comorbidities['heart_failure Matched followup Time'] = (
    (
        Pre_heart_failure_with_comorbidities['Combined ICD10 Diagnosis Date']
        - Pre_heart_failure_with_comorbidities['heart_failure Diagnosis Date']
    ).dt.days / 365.25
).round(1)

# Filter out rows with 'heart_failure Matched followup Time' greater than 0
Pre_heart_failure_with_comorbidities_forest_plotting = Pre_heart_failure_with_comorbidities[Pre_heart_failure_with_comorbidities['heart_failure Matched followup Time'] <= 0]

# Display unique values of "heart_failure Matched followup Time"
heart_failure_followup_times = Pre_heart_failure_with_comorbidities_forest_plotting["heart_failure Matched followup Time"].unique()
print("heart_failure Matched followup Time values:")
print(heart_failure_followup_times)
print()
print()

# Display rows where "heart_failure Matched followup Time" is greater than 0 (should be none)
rows_with_positive_followup = Pre_heart_failure_with_comorbidities_forest_plotting[Pre_heart_failure_with_comorbidities_forest_plotting["heart_failure Matched followup Time"] > 0]
print("\nRows with heart_failure Matched followup Time greater than 0:")
display(rows_with_positive_followup)
print()
print()

print("\nPre_heart_failure with comorbidities forest plotting DataFrame:")
display(Pre_heart_failure_with_comorbidities_forest_plotting.head(3))

In [None]:
# Extract unique values under the "Combined ICD10 Codes" column
unique_icd10_codes_Pre_heart_failure_with_comorbidities = Pre_heart_failure_with_comorbidities_forest_plotting['Combined ICD10 Codes'].unique()

# Print the unique ICD10 codes
print("Unique ICD10 codes in Pre_heart_failure_with_comorbidities_forest_plotting:")
print(unique_icd10_codes_Pre_heart_failure_with_comorbidities)

In [None]:
# Count of distinct participants in the pre-heart_failure frame (a number, not a list of IDs)
unique_participant_ids = Pre_heart_failure_with_comorbidities_forest_plotting["Participant ID"].nunique()
display("Number of unique Participant IDs:", unique_participant_ids)

In [None]:
# Specify the file path where you want to save the CSV file
#file_path = []
#file_path = 'Pre heart_failure COMMON with comorbidities.csv'

# Use the to_csv method to save the DataFrame as a CSV file
#Pre_heart_failure_with_comorbidities_forest_plotting.to_csv(file_path, index=False)  # Set index=False to exclude the index column

In [None]:
# Extract unique values under the "Combined ICD10 Codes" column of each period
unique_icd10_codes_Pre_heart_failure = Pre_heart_failure_with_comorbidities_forest_plotting['Combined ICD10 Codes'].unique()
unique_icd10_codes_Post_heart_failure = Post_heart_failure_with_comorbidities_forest_plotting['Combined ICD10 Codes'].unique()

# Convert the arrays to sets
set_pre_heart_failure = set(unique_icd10_codes_Pre_heart_failure)
set_post_heart_failure = set(unique_icd10_codes_Post_heart_failure)

# Find the common ICD10 codes (present both before/at and after the diagnosis)
common_icd10_codes = set_pre_heart_failure & set_post_heart_failure

# Print the common ICD10 codes
print("Common ICD10 codes in both Pre_heart_failure and Post_heart_failure datasets:")
print(common_icd10_codes)

# Count the number of common codes
count_common_icd10_codes = len(common_icd10_codes)
print("Number of common ICD10 codes:", count_common_icd10_codes)

In [None]:
#import pandas as pd
# Specify the file path where you want to save the CSV file
#file_path = []
#file_path = 'common Pre-heart_failure and post-heart_failure icd10 codes.csv'

# Use the to_csv method to save the DataFrame as a CSV file
#common_icd10_codes.to_csv(file_path, index=False)  # Set index=False to exclude the index column

In [None]:
#file_path = []
#common_icd10_codes_df = []
# Assuming common_icd10_codes is a set
#common_icd10_codes_df = pd.DataFrame(list(common_icd10_codes), columns=['ICD10 Codes'])

# Specify the file path where you want to save the CSV file
#file_path = 'common Pre-heart_failure and post-heart_failure icd10 codes.csv'

# Save the DataFrame as a CSV file
#common_icd10_codes_df.to_csv(file_path, index=False)

#print("File saved successfully!")

### <center> Steps to find Pre_heart_failure and Post_heart_failure with Uncommon ICD10 Codes rows

In [None]:
import pandas as pd

# Codes that occur in both the pre and the post period
common_icd10_codes = set_pre_heart_failure.intersection(set_post_heart_failure)

# Keep only the pre rows whose code is NOT shared with the post period
pre_has_common_code = Pre_heart_failure_with_comorbidities_forest_plotting['Combined ICD10 Codes'].isin(common_icd10_codes)
filtered_Pre_heart_failure = Pre_heart_failure_with_comorbidities_forest_plotting[~pre_has_common_code]

# Keep only the post rows whose code is NOT shared with the pre period
post_has_common_code = Post_heart_failure_with_comorbidities_forest_plotting['Combined ICD10 Codes'].isin(common_icd10_codes)
filtered_Post_heart_failure = Post_heart_failure_with_comorbidities_forest_plotting[~post_has_common_code]

# Print the number of rows in the filtered dataframes to confirm the filtering
print("Number of rows in filtered Pre_heart_failure_with_comorbidities_forest_ploting:", filtered_Pre_heart_failure.shape[0])
print("Number of rows in filtered Post_heart_failure_with_comorbidities_forest_ploting:", filtered_Post_heart_failure.shape[0])
print()

# Display the first few rows of the filtered dataframes to verify the filtering
print("Filtered Pre_heart_failure_with_comorbidities_forest_ploting:")
display(filtered_Pre_heart_failure.head(3))
print()
print()

print("Filtered Post_heart_failure_with_comorbidities_forest_ploting:")
display(filtered_Post_heart_failure.head(3))

In [None]:
# Get unique values of 'Combined ICD10 Codes' in each filtered dataset
unique_pre_heart_failure_codes = set(filtered_Pre_heart_failure['Combined ICD10 Codes'].unique())
unique_post_heart_failure_codes = set(filtered_Post_heart_failure['Combined ICD10 Codes'].unique())

# Sanity check: after removing the shared codes the two sets should be disjoint
common_codes = unique_pre_heart_failure_codes & unique_post_heart_failure_codes

# Print the result
if not common_codes:
    print("No common 'Combined ICD10 Codes' found between filtered_Pre_heart_failure and filtered_Post_heart_failure.")
else:
    print("Common 'Combined ICD10 Codes' found between filtered_Pre_heart_failure and filtered_Post_heart_failure:")
    print(common_codes)


In [None]:
# Specify the file path where you want to save the CSV file
#file_path = []
#file_path = 'Pre heart_failure with comorbidities.csv'

# Use the to_csv method to save the DataFrame as a CSV file
#filtered_Pre_heart_failure.to_csv(file_path, index=False)  # Set index=False to exclude the index column

In [None]:
# Specify the file path where you want to save the CSV file
#file_path = []
#file_path = 'Post heart_failure with comorbidities.csv'

# Use the to_csv method to save the DataFrame as a CSV file
#filtered_Post_heart_failure.to_csv(file_path, index=False)  # Set index=False to exclude the index column

### <center> Common Pre_heart_failure and Post_heart_failure ICD10 codes Dataframe

In [None]:
# Extract unique values under the "Combined ICD10 Codes" column of each period
unique_icd10_codes_Pre_heart_failure = Pre_heart_failure_with_comorbidities_forest_plotting['Combined ICD10 Codes'].unique()
unique_icd10_codes_Post_heart_failure = Post_heart_failure_with_comorbidities_forest_plotting['Combined ICD10 Codes'].unique()

# Convert the arrays to sets
set_pre_heart_failure = set(unique_icd10_codes_Pre_heart_failure)
set_post_heart_failure = set(unique_icd10_codes_Post_heart_failure)

# Find the common ICD10 codes
common_code_set = set_pre_heart_failure.intersection(set_post_heart_failure)

# Print the common ICD10 codes
print("Common ICD10 codes in both Pre_heart_failure and Post_heart_failure datasets:")
#print(common_icd10_codes)
# Create the DataFrame. pandas refuses to build a frame directly from a set
# ("Set type is unordered"), so the codes are put in a deterministic order first;
# key=str keeps the sort working if a missing value (NaN) is among the codes.
common_icd10_codes = pd.DataFrame(sorted(common_code_set, key=str), columns=['ICD10 Codes'])
display(common_icd10_codes.head(3))


# Number of distinct codes shared by both periods (this counts codes, not participants)
unique_participant_ids_count = common_icd10_codes['ICD10 Codes'].nunique()
print("Number of unique common ICD10 codes:", unique_participant_ids_count)

In [None]:
import pandas as pd

# Read the exported list of common codes back in as a plain Python list.
# NOTE(review): the file is read with header=None, so if it was written with a
# header row that header text becomes the first "code" - confirm how the CSV
# was produced.
Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets = pd.read_csv('common Pre-heart_failure and post-heart_failure icd10 codes.csv', header=None)
Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets = Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets[0].tolist()


#unique_participant_ids_count = []
#unique_participant_ids_count = Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets.nunique()
#print("Number of unique Participant IDs:", unique_participant_ids_count)


print("Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets:")
display(Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets[:5])  # Display the first 5 common ICD-10 codes
print()
print()


# Load your pre-heart_failure and post-heart_failure datasets
Pre_heart_failure_Common_dataframe = pd.read_csv('Pre heart_failure COMMON with comorbidities.csv')
Post_heart_failure_Common_dataframe = pd.read_csv('Post heart_failure COMMON with comorbidities.csv')

# Check the first few rows to ensure they are loaded correctly
display(Pre_heart_failure_Common_dataframe.head(2))
print()
print()
display(Post_heart_failure_Common_dataframe.head(2))

In [None]:
# Check if all common ICD-10 codes are present in the pre-heart_failure dataset.
# Membership is tested against sets so the check stays linear in the number of codes.
pre_heart_failure_icd10_codes = Pre_heart_failure_Common_dataframe['Combined ICD10 Codes'].unique().tolist()
pre_code_lookup = set(pre_heart_failure_icd10_codes)
missing_in_pre_heart_failure = [code for code in Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets if code not in pre_code_lookup]

# Check if all common ICD-10 codes are present in the post-heart_failure dataset
post_heart_failure_icd10_codes = Post_heart_failure_Common_dataframe['Combined ICD10 Codes'].unique().tolist()
post_code_lookup = set(post_heart_failure_icd10_codes)
missing_in_post_heart_failure = [code for code in Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets if code not in post_code_lookup]

# Print results
#print("ICD-10 codes missing in Pre-heart_failure Common dataset:")
#print(missing_in_pre_heart_failure)
#print()
#print()

#print("ICD-10 codes missing in Post-heart_failure Common dataset:")
#print(missing_in_post_heart_failure)

# Filter pre-heart_failure and post-heart_failure DataFrames for common ICD-10 codes
Pre_heart_failure_common = Pre_heart_failure_Common_dataframe[Pre_heart_failure_Common_dataframe['Combined ICD10 Codes'].isin(Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets)]
Post_heart_failure_common = Post_heart_failure_Common_dataframe[Post_heart_failure_Common_dataframe['Combined ICD10 Codes'].isin(Common_ICD10_codes_in_both_Pre_heart_failure_and_Post_heart_failure_datasets)]

# Display the filtered DataFrames
#display(Pre_heart_failure_common.head(2))
#display(Post_heart_failure_common.head(2))

# Combine the filtered pre-heart_failure and post-heart_failure DataFrames (pre rows first)
Combined_heart_failure_common = pd.concat([Pre_heart_failure_common, Post_heart_failure_common], ignore_index=True)

# Display the combined DataFrame
display(Combined_heart_failure_common.head(3))
print()
print()

In [None]:
# Number of distinct participants in the combined frame
unique_participant_ids_count = Combined_heart_failure_common['Participant ID'].nunique()
print("Number of unique Participant IDs:", unique_participant_ids_count)

# Number of distinct values in 'Combined ICD10 Diseases'.
# NOTE(review): the printed label says "ICD10 Codes" but the column counted is the
# disease-name column - confirm which of the two is meant.
unique_Combined_ICD10_Diseases_count = Combined_heart_failure_common['Combined ICD10 Diseases'].nunique()
print("Number of unique ICD10 Codes:", unique_Combined_ICD10_Diseases_count)

In [None]:
# Specify the file path where you want to save the CSV file
#file_path = []
#file_path = 'Combined heart_failure Common with comorbidities.csv'

# Use the to_csv method to save the DataFrame as a CSV file
#Combined_heart_failure_common.to_csv(file_path, index=False)  # Set index=False to exclude the index column

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Heart Failure Matched Control Forest Plots </center></h2>
</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the datasets
Pre_PH_dataframe = pd.read_csv('Pre-PH Common PH Icd10 Codes.csv')
Post_PH_dataframe = pd.read_csv('Post-PH Common PH Icd10 Codes.csv')
Common_PH_dataframe = pd.read_csv('Common-PH Common PH Icd10 Codes.csv')

# Define the conditions based on the final list: display label -> ICD10 code.
# NOTE(review): the number in brackets is typed into each label by hand
# (presumably a record count) - it is not recomputed from the data.
conditions = {
    "Essential hypertension(1611)": "I10",
    "Atrial fibrillation & flutter(868)": "I48",
    "Pure hypercholesterolemia(830)": "E78.0",
    "Atherosclerotic heart disease(682)": "I25.1",
    "Type 2 diabetes mellitus(579)": "E11.9",
    "Mitral (valve) insufficiency(438)": "I34.0",
    "Asthma(410)": "J45.9",
    "Chronic ischemic heart disease(229)": "I25.8",
}

# Create a list to store the categorized data (one dict per condition)
categorized_conditions_list = []

# Helper function to count the occurrences of a condition in a dataframe
def count_condition_in_df(df, icd_codes):
    """Count the rows of ``df`` whose 'ICD10 Codes' value matches ``icd_codes``.

    ``icd_codes`` is either a single code (compared for equality) or a list of
    codes (a row matches when its code is any member of the list).
    """
    code_column = df['ICD10 Codes']
    if isinstance(icd_codes, list):
        matches = code_column.isin(icd_codes)
    else:
        matches = code_column == icd_codes
    return int(matches.sum())

# Populate the list: one record per condition with its row count in each PH frame
ph_frames_by_column = {
    'Pre PH': Pre_PH_dataframe,
    'Post PH': Post_PH_dataframe,
    'Common PH': Common_PH_dataframe,
}
for condition_label, condition_codes in conditions.items():
    record = {'Condition': condition_label, 'ICD-10 Codes': condition_codes}
    for count_column, ph_frame in ph_frames_by_column.items():
        record[count_column] = count_condition_in_df(ph_frame, condition_codes)
    categorized_conditions_list.append(record)

# Convert the list to a DataFrame
categorized_conditions = pd.DataFrame(categorized_conditions_list)

# Display the resulting dataframe
print("Categorized Conditions DataFrame:")
display(categorized_conditions)


# Normalize the counts to get the frequencies
total_ph_patients = 2727  # Total number of PH patients

for count_column in ('Pre PH', 'Post PH', 'Common PH'):
    categorized_conditions[f'{count_column} Frequency'] = categorized_conditions[count_column] / total_ph_patients

# Prepare data for the forest plot (note: `conditions` now names the label column, no longer the dict)
conditions = categorized_conditions['Condition']
pre_ph_freq = categorized_conditions['Pre PH Frequency']
post_ph_freq = categorized_conditions['Post PH Frequency']
common_ph_freq = categorized_conditions['Common PH Frequency']


# Plotting function for the forest plot with a central line for PH diagnosis
def plot_forest_with_central_line(conditions, pre_ph_freq, post_ph_freq, common_ph_freq):
    """Draw a mirrored horizontal bar chart around a central PH-diagnosis line.

    For every condition one bar runs from ``0.5 - pre`` to ``0.5 + post`` on the
    x axis, so the part left of 0.5 shows the pre-PH frequency and the part right
    of 0.5 the post-PH frequency. The figure is displayed with ``plt.show()``;
    nothing is returned.

    Parameters
    ----------
    conditions : sequence of str
        Labels used for the y ticks, one per bar.
    pre_ph_freq, post_ph_freq, common_ph_freq : sequence of float
        Frequencies (fractions, not percentages), one per condition. Lists,
        arrays and pandas Series are accepted; values are read by position, so
        a Series with a non-default index is handled correctly.

    Raises
    ------
    ValueError
        If the four inputs do not all have the same length.
    """
    labels = list(conditions)
    # Positional arrays: a label-based `series[i]` lookup only works when the
    # Series happens to carry a default 0..n-1 index.
    pre_values = np.asarray(pre_ph_freq, dtype=float)
    post_values = np.asarray(post_ph_freq, dtype=float)
    common_values = np.asarray(common_ph_freq, dtype=float)
    if not (len(labels) == len(pre_values) == len(post_values) == len(common_values)):
        raise ValueError("conditions and the three frequency sequences must have the same length")

    fig, ax = plt.subplots(figsize=(22, 12))

    y_pos = np.arange(len(labels))

    for y, before_freq, after_freq, common_freq in zip(y_pos, pre_values, post_values, common_values):
        # Prioritize the color assignment: blue (common), green (pre), red (post)
        if common_freq > 0:
            color = 'blue'   # Common PH Frequency
        elif before_freq > 0:
            color = 'green'  # Pre PH Frequency
        elif after_freq > 0:
            color = 'red'    # Post PH Frequency
        else:
            color = 'gray'   # Default color for zero frequencies

        # Plot the bar and the marker on the central line
        ax.plot([0.5 - before_freq, 0.5 + after_freq], [y, y], '-', color=color, lw=18)
        ax.plot(0.5, y, 'o', color=color)

        # Add percentage text next to the ends of the lines
        ax.text(0.50 - before_freq - 0.05, y, f'{before_freq * 100:.1f}%',
                verticalalignment='center', fontsize=13, color='black')
        ax.text(0.47 + after_freq + 0.05, y, f'{after_freq * 100:.1f}%',
                verticalalignment='center', fontsize=13, color='black')

        # Guide lines from the bar ends towards the axis edges. lw=0 makes them
        # invisible. NOTE(review): they presumably still stretch the x data limits
        # to [0, 1], so they are kept as they were.
        ax.hlines(y=y, xmin=0, xmax=0.5 - before_freq, colors='gray', linestyles='dotted', lw=0)
        ax.hlines(y=y, xmin=0.5 + after_freq, xmax=1, colors='gray', linestyles='dotted', lw=0)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)
    ax.set_xticks([0, 0.5, 0.91])
    ax.set_xticklabels(['Comorbidities diagnosed before PH', 'Average PH Diagnosis\n Time for all Patients', 'Comorbidities diagnosed after PH'])
    ax.set_xlabel('Frequency', fontsize=16)  # Set font size of the x-axis label
    ax.set_title('Forest Plot of Comorbidities Relative to PH Diagnosis', fontsize=18)  # Set font size of title

    # Adjust the font size of xticks and yticks
    ax.tick_params(axis='x', labelsize=14)  # Set font size of x-axis ticks
    ax.tick_params(axis='y', labelsize=12)  # Set font size of y-axis ticks

    ax.grid(True)
    ax.axvline(0.5, color='black', linestyle='--', lw=6)

    # Create custom legend with adjusted font size and position
    custom_lines = [
        plt.Line2D([0], [0], color='green', lw=18),
        plt.Line2D([0], [0], color='red', lw=18),
        plt.Line2D([0], [0], color='blue', lw=18)
    ]

    # Adjust legend position, fontsize, and size
    ax.legend(custom_lines, ['Pre-PH Comorbidities', 'Post-PH Comorbidities', 'Common Comorbidities'], loc='upper right', fontsize=14,
              bbox_to_anchor=(0.99, 0.99), borderaxespad=0., markerscale=1.5)

    plt.show()

# Plot the forest plot
plot_forest_with_central_line(conditions, pre_ph_freq, post_ph_freq, common_ph_freq)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the pre / post / combined COPD frames from their CSV exports
Pre_COPD_dataframe = pd.read_csv('Pre COPD COMMON with comorbidities.csv')
Post_COPD_dataframe = pd.read_csv('Post COPD COMMON with comorbidities.csv')
Common_COPD_dataframe = pd.read_csv('Combined COPD Common with comorbidities.csv')



# Define the conditions based on the final list: display label -> ICD10 code.
# NOTE(review): the number in brackets is typed into each label by hand
# (presumably a record count) - it is not recomputed from the data.
copd_conditions = {
    "Essential hypertension(1138)": "I10",
    "Atrial fibrillation & flutter(231)": "I48",
    "Pure hypercholesterolemia(543)": "E78.0",
    "Atherosclerotic heart disease(297)": "I25.1",
    "Type 2 diabetes mellitus(417)": "E11.9",
    "Mitral (valve) insufficiency(62)": "I34.0",
    "Asthma(1377)": "J45.9",
    "Chronic ischemic heart disease(140)": "I25.8",

}

# Create a list to store the categorized data (one dict per condition)
categorized_copd_conditions_list = []

# Helper function to count the occurrences of a condition in a dataframe
def count_copd_condition_in_df(df, icd_codes):
    """Count the rows of ``df`` whose 'Combined ICD10 Codes' value matches ``icd_codes``.

    A list is treated as "any of these codes"; anything else is compared for
    equality against the column.
    """
    codes = df['Combined ICD10 Codes']
    hits = codes.isin(icd_codes) if isinstance(icd_codes, list) else codes == icd_codes
    return int(hits.sum())

# Populate the list for COPD conditions: one record per condition with its row
# count in each COPD frame (record keys follow the display order Pre / Common / Post)
copd_frames_by_column = {
    'Pre COPD': Pre_COPD_dataframe,
    'Common COPD': Common_COPD_dataframe,
    'Post COPD': Post_COPD_dataframe,
}
for condition_label, condition_codes in copd_conditions.items():
    record = {'Condition': condition_label, 'Combined ICD-10 Codes': condition_codes}
    for count_column, copd_frame in copd_frames_by_column.items():
        record[count_column] = count_copd_condition_in_df(copd_frame, condition_codes)
    categorized_copd_conditions_list.append(record)

# Convert the list to a DataFrame and fix the column order
copd_column_order = ["Condition", "Combined ICD-10 Codes", "Pre COPD", "Common COPD", "Post COPD"]
categorized_copd_conditions = pd.DataFrame(categorized_copd_conditions_list)
categorized_copd_conditions = categorized_copd_conditions[copd_column_order]

# Display the resulting dataframe
print("Categorized COPD Conditions DataFrame:")
display(categorized_copd_conditions)





# Denominator for the frequencies
total_copd_patients = 2441  # Total number of COPD patients

# Normalize the counts to get the frequencies
for count_column in ('Pre COPD', 'Post COPD', 'Common COPD'):
    categorized_copd_conditions[f'{count_column} Frequency'] = categorized_copd_conditions[count_column] / total_copd_patients

# Prepare data for the forest plot
conditions = categorized_copd_conditions['Condition']
pre_copd_freq = categorized_copd_conditions['Pre COPD Frequency']
post_copd_freq = categorized_copd_conditions['Post COPD Frequency']
common_copd_freq = categorized_copd_conditions['Common COPD Frequency']

# Plotting function for the forest plot with a central line for COPD diagnosis
def plot_forest_with_central_line(conditions, pre_copd_freq, post_copd_freq, common_copd_freq):
    """Draw a mirrored horizontal bar chart around a central COPD-diagnosis line.

    For every condition one bar runs from ``0.5 - pre`` to ``0.5 + post`` on the
    x axis, so the part left of 0.5 shows the pre-COPD frequency and the part
    right of 0.5 the post-COPD frequency. The figure is displayed with
    ``plt.show()``; nothing is returned.

    Parameters
    ----------
    conditions : sequence of str
        Labels used for the y ticks, one per bar.
    pre_copd_freq, post_copd_freq, common_copd_freq : sequence of float
        Frequencies (fractions, not percentages), one per condition. Lists,
        arrays and pandas Series are accepted; values are read by position, so
        a Series with a non-default index is handled correctly.

    Raises
    ------
    ValueError
        If the four inputs do not all have the same length.
    """
    labels = list(conditions)
    # Positional arrays: a label-based `series[i]` lookup only works when the
    # Series happens to carry a default 0..n-1 index.
    pre_values = np.asarray(pre_copd_freq, dtype=float)
    post_values = np.asarray(post_copd_freq, dtype=float)
    common_values = np.asarray(common_copd_freq, dtype=float)
    if not (len(labels) == len(pre_values) == len(post_values) == len(common_values)):
        raise ValueError("conditions and the three frequency sequences must have the same length")

    fig, ax = plt.subplots(figsize=(22, 12))

    y_pos = np.arange(len(labels))

    for y, before_freq, after_freq, common_freq in zip(y_pos, pre_values, post_values, common_values):
        # Prioritize the color assignment: blue (common), green (pre), red (post)
        if common_freq > 0:
            color = 'blue'   # Common copd Frequency
        elif before_freq > 0:
            color = 'green'  # Pre copd Frequency
        elif after_freq > 0:
            color = 'red'    # Post copd Frequency
        else:
            color = 'gray'   # Default color for zero frequencies

        # Plot the bar and the marker on the central line
        ax.plot([0.5 - before_freq, 0.5 + after_freq], [y, y], '-', color=color, lw=18)
        ax.plot(0.5, y, 'o', color=color)

        # Add percentage text next to the ends of the lines
        ax.text(0.50 - before_freq - 0.05, y, f'{before_freq * 100:.1f}%',
                verticalalignment='center', fontsize=13, color='black')
        ax.text(0.47 + after_freq + 0.05, y, f'{after_freq * 100:.1f}%',
                verticalalignment='center', fontsize=13, color='black')

        # Guide lines from the bar ends towards the axis edges. lw=0 makes them
        # invisible. NOTE(review): they presumably still stretch the x data limits
        # to [0, 1], so they are kept as they were.
        ax.hlines(y=y, xmin=0, xmax=0.5 - before_freq, colors='gray', linestyles='dotted', lw=0)
        ax.hlines(y=y, xmin=0.5 + after_freq, xmax=1, colors='gray', linestyles='dotted', lw=0)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)
    ax.set_xticks([0, 0.5, 0.91])
    ax.set_xticklabels(['Comorbidities diagnosed before COPD', 'Average COPD Diagnosis\n Time for all Patients', 'Comorbidities diagnosed after COPD'])
    ax.set_xlabel('Frequency', fontsize=16)  # Set font size of the x-axis label
    ax.set_title('Forest Plot of Comorbidities Relative to COPD Diagnosis', fontsize=18)  # Set font size of title

    # Adjust the font size of xticks and yticks
    ax.tick_params(axis='x', labelsize=14)  # Set font size of x-axis ticks
    ax.tick_params(axis='y', labelsize=12)  # Set font size of y-axis ticks

    ax.grid(True)
    ax.axvline(0.5, color='black', linestyle='--', lw=6)

    # Create custom legend with adjusted font size and position
    custom_lines = [
        plt.Line2D([0], [0], color='green', lw=18),
        plt.Line2D([0], [0], color='red', lw=18),
        plt.Line2D([0], [0], color='blue', lw=18)
    ]

    # Adjust legend position, fontsize, and size
    ax.legend(custom_lines, ['Pre-COPD Comorbidities', 'Post-COPD Comorbidities', 'Common Comorbidities'], loc='upper right', fontsize=14,
              bbox_to_anchor=(0.99, 0.99), borderaxespad=0., markerscale=1.5)

    plt.show()

# Plot the forest plot
plot_forest_with_central_line(conditions, pre_copd_freq, post_copd_freq, common_copd_freq)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the pre-, post- and combined heart_failure comorbidity tables from their CSV files
Pre_heart_failure_dataframe = pd.read_csv('Pre heart_failure COMMON with comorbidities.csv')
Post_heart_failure_dataframe = pd.read_csv('Post heart_failure COMMON with comorbidities.csv')
Common_heart_failure_dataframe = pd.read_csv('Combined heart_failure Common with comorbidities.csv')

# Display label -> ICD-10 code for each comorbidity of interest.
# The insertion order below becomes the row order of the summary table.
heart_failure_conditions = dict([
    ("Essential hypertension(1138)", "I10"),
    ("Atrial fibrillation & flutter(231)", "I48"),
    ("Pure hypercholesterolemia(543)", "E78.0"),
    ("Atherosclerotic heart disease(297)", "I25.1"),
    ("Type 2 diabetes mellitus(417)", "E11.9"),
    ("Mitral (valve) insufficiency(62)", "I34.0"),
    ("Asthma(1377)", "J45.9"),
    ("Chronic ischemic heart disease(140)", "I25.8"),
])

# Accumulates one record (dict) per condition
categorized_heart_failure_conditions_list = list()

# Helper function to count the occurrences of a condition in a dataframe
def count_heart_failure_condition_in_df(df, icd_codes):
    """Count the rows of ``df`` whose 'Combined ICD10 Codes' value matches exactly.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'Combined ICD10 Codes' column.
    icd_codes : str or list/tuple/set of str
        A single ICD-10 code, or a group of codes of which any may match.
        Matching is exact equality, not prefix or substring matching.

    Returns
    -------
    int
        Number of matching rows (0 for an empty table).
    """
    codes_column = df['Combined ICD10 Codes']
    # Any plain container of codes is a "match any of these" request. Previously
    # only ``list`` was recognised, so tuples/sets fell through to ``==`` and failed.
    if isinstance(icd_codes, (list, tuple, set, frozenset)):
        matches = codes_column.isin(list(icd_codes))
    else:
        matches = codes_column == icd_codes
    # Vectorised boolean sum instead of iterating the Series with builtin sum().
    return int(matches.sum())

# One record per condition: how often its ICD-10 code occurs in the pre-,
# common- and post-heart_failure tables
categorized_heart_failure_conditions_list.extend(
    {
        'Condition': condition,
        'Combined ICD-10 Codes': icd_codes,
        'Pre heart_failure': count_heart_failure_condition_in_df(Pre_heart_failure_dataframe, icd_codes),
        'Common heart_failure': count_heart_failure_condition_in_df(Common_heart_failure_dataframe, icd_codes),
        'Post heart_failure': count_heart_failure_condition_in_df(Post_heart_failure_dataframe, icd_codes),
    }
    for condition, icd_codes in heart_failure_conditions.items()
)

# Turn the records into a table with an explicit column order
categorized_heart_failure_conditions = pd.DataFrame(categorized_heart_failure_conditions_list).loc[
    :, ["Condition", "Combined ICD-10 Codes", "Pre heart_failure", "Common heart_failure", "Post heart_failure"]
]

# Show the count table
print("Categorized heart_failure Conditions DataFrame:")
display(categorized_heart_failure_conditions)

# Denominator used to turn the counts into frequencies
total_heart_failure_patients = 2441  # Total number of heart_failure patients

# Add one '<count column> Frequency' column per count column (count / total patients)
for count_column in ['Pre heart_failure', 'Post heart_failure', 'Common heart_failure']:
    categorized_heart_failure_conditions[count_column + ' Frequency'] = (
        categorized_heart_failure_conditions[count_column] / total_heart_failure_patients
    )

# Series handed to the forest plot
conditions = categorized_heart_failure_conditions['Condition']
pre_heart_failure_freq = categorized_heart_failure_conditions['Pre heart_failure Frequency']
post_heart_failure_freq = categorized_heart_failure_conditions['Post heart_failure Frequency']
common_heart_failure_freq = categorized_heart_failure_conditions['Common heart_failure Frequency']

# Plotting function for the forest plot with a central line for heart_failure diagnosis
def plot_forest_with_central_line(conditions, pre_heart_failure_freq, post_heart_failure_freq, common_heart_failure_freq):
    """Draw a two-sided bar chart of comorbidity frequencies around the heart_failure diagnosis.

    Every condition gets one horizontal bar centred on x = 0.5. The bar extends
    to the left by the pre-diagnosis frequency and to the right by the
    post-diagnosis frequency; both ends are annotated with the frequency
    expressed as a percentage.

    Parameters
    ----------
    conditions : sequence of str
        Y-axis labels, one per row.
    pre_heart_failure_freq, post_heart_failure_freq, common_heart_failure_freq : sequence of float
        Per-condition frequencies. They are read by position, so lists, arrays
        and pandas Series with any index are accepted.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Materialise the inputs so that element i is always the i-th row, even when
    # a pandas Series with a filtered or non-default index is passed in.
    labels = list(conditions)
    pre_values = list(pre_heart_failure_freq)
    post_values = list(post_heart_failure_freq)
    common_values = list(common_heart_failure_freq)

    fig, ax = plt.subplots(figsize=(22, 12))
    y_pos = np.arange(len(labels))

    for i, y in enumerate(y_pos):
        before_freq = pre_values[i]
        after_freq = post_values[i]
        common_freq = common_values[i]

        # Colour priority: blue (common) > green (pre) > red (post) > gray (all zero)
        if common_freq > 0:
            color = 'blue'   # Common heart_failure frequency present
        elif before_freq > 0:
            color = 'green'  # Only pre-heart_failure frequency present
        elif after_freq > 0:
            color = 'red'    # Only post-heart_failure frequency present
        else:
            color = 'gray'   # Default color for zero frequencies

        # Bar from (centre - pre) to (centre + post), plus a marker on the centre line
        ax.plot([0.5 - before_freq, 0.5 + after_freq], [y, y], '-', color=color, lw=18)
        ax.plot(0.5, y, 'o', color=color)

        # Percentage labels just outside both ends of the bar
        ax.text(0.50 - before_freq - 0.05, y, f'{before_freq * 100:.1f}%',
                verticalalignment='center', fontsize=13, color='black')
        ax.text(0.47 + after_freq + 0.05, y, f'{after_freq * 100:.1f}%',
                verticalalignment='center', fontsize=13, color='black')

        # Guide lines from the bar ends to the plot edges. With lw=0 they are not
        # visible; they are kept because they are still part of the plotted data
        # spanning x = 0 .. 1.
        ax.hlines(y=y, xmin=0, xmax=0.5 - before_freq, colors='gray', linestyles='dotted', lw=0)
        ax.hlines(y=y, xmin=0.5 + after_freq, xmax=1, colors='gray', linestyles='dotted', lw=0)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)
    ax.set_xticks([0, 0.5, 0.91])
    ax.set_xticklabels(['Comorbidities diagnosed before heart_failure', 'heart_failure Diagnosis\n Time for all Patients', 'Comorbidities diagnosed after heart_failure'])
    ax.set_xlabel('Frequency', fontsize=16)
    ax.set_title('Forest Plot of Comorbidities Relative to heart_failure Diagnosis', fontsize=18)

    # Tick label sizes
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=12)

    # Grid plus the dashed centre line marking the diagnosis
    ax.grid(True)
    ax.axvline(0.5, color='black', linestyle='--', lw=6)

    # Legend built from proxy lines in the three bar colours
    custom_lines = [plt.Line2D([0], [0], color=legend_color, lw=18) for legend_color in ('green', 'red', 'blue')]
    ax.legend(custom_lines, ['Pre-heart_failure Comorbidities', 'Post-heart_failure Comorbidities', 'Common heart_failure Comorbidities'], loc='upper right', fontsize=14,
              bbox_to_anchor=(0.99, 0.99), borderaxespad=0., markerscale=1.5)

    plt.show()

# Draw the heart_failure forest plot from the condition labels and the three frequency series prepared above
plot_forest_with_central_line(conditions, pre_heart_failure_freq, post_heart_failure_freq, common_heart_failure_freq)


## <center> Combined Forest Plot (Based on Number of Individuals) </center>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the pre/post/common comorbidity tables for the three cohorts
# (adjust file paths as necessary)
Pre_PH_dataframe = pd.read_csv('Pre-PH Common PH Icd10 Codes.csv')
Post_PH_dataframe = pd.read_csv('Post-PH Common PH Icd10 Codes.csv')
Common_PH_dataframe = pd.read_csv('Common-PH Common PH Icd10 Codes.csv')

# The COPD and heart_failure files name the code column 'Combined ICD10 Codes';
# rename it to 'ICD10 Codes' so all nine tables share one column name.
_icd_column_rename = {'Combined ICD10 Codes': 'ICD10 Codes'}

Pre_COPD_dataframe = pd.read_csv('Pre COPD COMMON with comorbidities.csv').rename(columns=_icd_column_rename)
Post_COPD_dataframe = pd.read_csv('Post COPD COMMON with comorbidities.csv').rename(columns=_icd_column_rename)
Common_COPD_dataframe = pd.read_csv('Combined COPD Common with comorbidities.csv').rename(columns=_icd_column_rename)

Pre_heart_failure_dataframe = pd.read_csv('Pre heart_failure COMMON with comorbidities.csv').rename(columns=_icd_column_rename)
Post_heart_failure_dataframe = pd.read_csv('Post heart_failure COMMON with comorbidities.csv').rename(columns=_icd_column_rename)
Common_heart_failure_dataframe = pd.read_csv('Combined heart_failure Common with comorbidities.csv').rename(columns=_icd_column_rename)

# Y-axis label -> ICD-10 code for the combined plot.
# The plot pairs label i with row i of every cohort's frequency table, so the
# order here MUST equal the row order of those tables. The heart_failure table
# is built in the order I10, I48, E78.0, I25.1, E11.9, I34.0, J45.9, I25.8;
# listing Asthma / Chronic ischemic / Mitral in a different order puts the
# heart_failure bars under the wrong labels.
# NOTE(review): the PH and COPD tables are built outside this cell - confirm
# that they use the same row order.
conditions = {
    "Essential\n hypertension": "I10",
    "Atrial fibrillation\n& flutter": "I48",
    "Pure\n hypercholesterolemia": "E78.0",
    "Atherosclerotic\n heart disease": "I25.1",
    "Type 2 diabetes\n Mellitus": "E11.9",
    "Mitral (valve)\n insufficiency": "I34.0",
    "Asthma": "J45.9",
    "Chronic ischemic \n heart disease": "I25.8",
}

# Fail loudly rather than draw a mislabelled figure if the label order and the
# heart_failure table ever drift apart again.
_heart_failure_code_order = list(categorized_heart_failure_conditions['Combined ICD-10 Codes'])
if _heart_failure_code_order != list(conditions.values()):
    raise ValueError(
        "Combined-plot labels are not in the same order as categorized_heart_failure_conditions: "
        f"{list(conditions.values())} vs {_heart_failure_code_order}"
    )



# Plotting function for the combined forest plot
def _draw_cohort_bar(ax, y, before_freq, after_freq, before_count, after_count, color):
    """Draw one cohort's bar, centre marker and 'count (percent%)' end labels at height ``y``."""
    ax.plot([0.5 - before_freq, 0.5 + after_freq], [y, y], '-', color=color, lw=18)
    ax.plot(0.5, y, 'o', color=color)
    ax.text(0.46 - before_freq - 0.05, y, f'{before_count} ({before_freq * 100:.1f}%)',
            verticalalignment='center', fontsize=13, color='black')
    ax.text(0.47 + after_freq + 0.05, y, f'{after_count} ({after_freq * 100:.1f}%)',
            verticalalignment='center', fontsize=13, color='black')


def plot_combined_forest_with_central_line(conditions,pre_ph_freq, post_ph_freq, common_ph_freq, 
                                           pre_copd_freq, post_copd_freq, common_copd_freq,
                                           pre_heart_failure_freq, post_heart_failure_freq, common_heart_failure_freq):
    """Plot PH, COPD and heart-failure comorbidity frequencies side by side.

    For every condition three horizontal bars are drawn, stacked with a small
    vertical offset (PH at y, COPD at y - 0.3, heart failure at y - 0.6). Each
    bar is centred on x = 0.5, extends left by the pre-diagnosis frequency and
    right by the post-diagnosis frequency, and is annotated with the
    participant count and the percentage at both ends.

    Parameters
    ----------
    conditions : iterable of str (a dict is iterated over its keys)
        Y-axis labels, one per condition.
    pre_*_freq, post_*_freq, common_*_freq : sequence of float
        Per-condition frequencies for each cohort, read by position. The
        ``common_copd_freq`` and ``common_heart_failure_freq`` arguments are
        accepted for interface compatibility but do not influence the drawing.

    Notes
    -----
    The participant counts are NOT parameters: they are read by position from
    the module-level tables ``categorized_conditions`` ('Pre PH' / 'Post PH'),
    ``categorized_copd_conditions`` ('Pre COPD' / 'Post COPD') and
    ``categorized_heart_failure_conditions`` ('Pre heart_failure' /
    'Post heart_failure'), which must already exist and share the row order of
    ``conditions``.
    """
    # Positional copies, so element i is always row i regardless of any pandas index.
    labels = list(conditions)
    ph_pre, ph_post, ph_common = list(pre_ph_freq), list(post_ph_freq), list(common_ph_freq)
    copd_pre, copd_post = list(pre_copd_freq), list(post_copd_freq)
    hf_pre, hf_post = list(pre_heart_failure_freq), list(post_heart_failure_freq)

    fig, ax = plt.subplots(figsize=(22, 15))
    # 1.5 units per condition leaves room for the three stacked bars
    y_pos = [i * 1.5 for i in range(len(labels))]

    for i, y in enumerate(y_pos):
        # PH bars are gray, or light gray when the condition never occurs in the PH cohort
        ph_has_data = ph_common[i] > 0 or ph_pre[i] > 0 or ph_post[i] > 0
        ph_color = 'gray' if ph_has_data else 'lightgray'

        _draw_cohort_bar(ax, y, ph_pre[i], ph_post[i],
                         categorized_conditions['Pre PH'].iloc[i],
                         categorized_conditions['Post PH'].iloc[i],
                         ph_color)

        # COPD bars are always green, drawn slightly below the PH bar
        _draw_cohort_bar(ax, y - 0.3, copd_pre[i], copd_post[i],
                         categorized_copd_conditions['Pre COPD'].iloc[i],
                         categorized_copd_conditions['Post COPD'].iloc[i],
                         'green')

        # Heart-failure bars are always brown, drawn below the COPD bar
        _draw_cohort_bar(ax, y - 0.6, hf_pre[i], hf_post[i],
                         categorized_heart_failure_conditions['Pre heart_failure'].iloc[i],
                         categorized_heart_failure_conditions['Post heart_failure'].iloc[i],
                         'brown')

    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)
    ax.set_xticks([0, 0.5, 0.91])
    ax.set_xticklabels(['Comorbidities diagnosed\n before PH/COPD/HF', 'PH, COPD, HF\n Diagnosis Date', 'Comorbidities diagnosed\n after PH/COPD/HF'])
    ax.set_title('Timing and Frequency of Comorbidities in PH, COPD, and Heart Failure Populations', fontsize=16, fontweight='bold')

    # Tick label sizes
    ax.tick_params(axis='x', labelsize=15)
    ax.tick_params(axis='y', labelsize=15)

    # No grid; dashed centre line marks the diagnosis date
    ax.grid(False)
    ax.axvline(0.5, color='black', linestyle='--', lw=6)

    # Legend built from proxy lines in the three cohort colours
    custom_lines = [plt.Line2D([0], [0], color=cohort_color, lw=18) for cohort_color in ('gray', 'green', 'brown')]
    ax.legend(custom_lines, ['PH Comorbidities', 'COPD Comorbidities', 'Heart Failure Comorbidities'], loc='upper right', fontsize=14,
              bbox_to_anchor=(0.99, 0.99), borderaxespad=0., markerscale=1.5)

    plt.show()



# Draw the combined plot. The nine frequency series are not computed in this cell;
# they must already exist from the PH, COPD and heart_failure cells run earlier.
plot_combined_forest_with_central_line(conditions,pre_ph_freq, post_ph_freq, common_ph_freq, 
                                       pre_copd_freq, post_copd_freq, common_copd_freq,
                                       pre_heart_failure_freq, post_heart_failure_freq, common_heart_failure_freq)

<div style="background-color: #D2B48C; padding: 10px;">
    <h2><center>Process Mining (Tracer Plots) for Heart Failure </center></h2>
</div>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches


# Load the datasets (adjust paths as necessary)
Pre_heart_failure_dataframe = pd.read_csv('Pre heart_failure COMMON with comorbidities.csv')
Post_heart_failure_dataframe = pd.read_csv('Post heart_failure COMMON with comorbidities.csv')
Common_heart_failure_dataframe = pd.read_csv('Combined heart_failure Common with comorbidities.csv')

# Stack the three tables and drop exact duplicate rows
combined_dataframe = pd.concat(
    [Pre_heart_failure_dataframe, Common_heart_failure_dataframe, Post_heart_failure_dataframe],
    ignore_index=True,
).drop_duplicates()

# Collapse the specific heart-failure codes into the single label "heart_failure"
heart_failure_conditions = ['I50.0', 'I50.1']
combined_dataframe['Combined ICD10 Codes'] = combined_dataframe['Combined ICD10 Codes'].replace(heart_failure_conditions, 'heart_failure')

# Convert Diagnosis Date to datetime format
combined_dataframe['Combined ICD10 Diagnosis Date'] = pd.to_datetime(combined_dataframe['Combined ICD10 Diagnosis Date'])

# Legend label -> code for the conditions that take part in the pathways
conditions = {
    "Essential (primary) hypertension (I10)": "I10",
    "Atrial fibrillation and flutter (I48)": "I48",
    "Pure hypercholesterolemia (E78.0)": "E78.0",
    "Atherosclerotic heart disease (I25.1)": "I25.1",
    "Type 2 diabetes mellitus without complications (E11.9)": "E11.9",
    "Mitral (valve) insufficiency (I34.0)": "I34.0",
    "Asthma, unspecified (J45.9)": "J45.9",
    "heart_failure": "heart_failure",
}
condition_codes = list(conditions.values())
filtered_dataframe = combined_dataframe[combined_dataframe['Combined ICD10 Codes'].isin(condition_codes)]

# Sort by participant and date so each participant's diagnoses are in chronological order
filtered_dataframe = filtered_dataframe.sort_values(by=['Participant ID', 'Combined ICD10 Diagnosis Date'])

# One trace per participant: a list of [code, diagnosis date] pairs.
# Participants with fewer than 3 relevant diagnoses are dropped.
participant_traces = (
    filtered_dataframe.groupby('Participant ID')
    .apply(lambda x: x[['Combined ICD10 Codes', 'Combined ICD10 Diagnosis Date']].values.tolist() if len(x) >= 3 else None)
    .dropna()
    .reset_index(drop=True)
)
participant_traces = pd.DataFrame(participant_traces, columns=['Traces'])

# The pathway of a trace is its sequence of codes, without the dates.
# Computed once here and reused below.
activity_sequences = participant_traces['Traces'].apply(lambda trace: tuple(item[0] for item in trace))

# Count how many participants follow each pathway
trace_frequencies = activity_sequences.value_counts().reset_index()
trace_frequencies.columns = ['Activities', 'Frequency']

# Minimum number of participants a pathway needs in order to be kept.
# At 1 nothing is removed; raise it to drop rare pathways.
MIN_PATHWAY_FREQUENCY = 1
trace_frequencies = trace_frequencies[trace_frequencies['Frequency'] >= MIN_PATHWAY_FREQUENCY].copy()

# Group the dated traces by pathway in a single pass, instead of re-scanning
# every participant for every pathway.
traces_by_sequence = {}
for sequence, trace in zip(activity_sequences, participant_traces['Traces']):
    traces_by_sequence.setdefault(sequence, []).append(trace)

# Mean time (in years) between consecutive diagnoses, per pathway
mean_times = []                    # total mean duration of each pathway
mean_time_between_nodes = {}       # (code, next code) -> mean gap; later pathways overwrite earlier ones
mean_time_between_nodes_list = []  # per pathway: list of mean gaps, one per step

for activities in trace_frequencies['Activities']:
    node_differences = {i: [] for i in range(len(activities) - 1)}

    for trace in traces_by_sequence[activities]:
        for i in range(len(trace) - 1):
            start_date = trace[i][1]
            end_date = trace[i + 1][1]
            node_differences[i].append((end_date - start_date).days / 365.25)

    mean_times_for_nodes = []
    for i, differences in node_differences.items():
        mean_time = sum(differences) / len(differences) if differences else 0
        mean_time_between_nodes[(activities[i], activities[i + 1])] = mean_time
        mean_times_for_nodes.append(mean_time)

    mean_times.append(sum(mean_times_for_nodes))
    mean_time_between_nodes_list.append(mean_times_for_nodes)

# Add Total Mean Time to the trace_frequencies DataFrame
trace_frequencies['Total Mean Time'] = mean_times

# One 'Mean Time Node k-(k+1)' column per step; shorter pathways get missing values
max_nodes = max((len(times) for times in mean_time_between_nodes_list), default=0)
for i in range(max_nodes):
    trace_frequencies[f'Mean Time Node {i+1}-{i+2}'] = [
        times[i] if i < len(times) else None for times in mean_time_between_nodes_list
    ]

# Recompute Total Mean Time as the row-wise sum of the step columns
node_columns = [col for col in trace_frequencies.columns if col.startswith('Mean Time Node')]
trace_frequencies['Total Mean Time'] = trace_frequencies[node_columns].sum(axis=1)

# Group pathways by their first condition; within a group, longest total time first
trace_frequencies['Group'] = trace_frequencies['Activities'].apply(lambda x: x[0])
trace_frequencies = trace_frequencies.sort_values(by=['Group', 'Total Mean Time'], ascending=[True, False]).reset_index(drop=True)

# The pathways selected for the tracer plot
specific_traces = [
    ('E11.9', 'I10', 'I48', 'heart_failure'),
    ('E11.9', 'E78.0', 'I10', 'I25.1', 'heart_failure'),
    ('E11.9', 'I10', 'E78.0', 'heart_failure'),
    ('E11.9', 'I10', 'heart_failure'),
    ('E11.9', 'I10', 'I25.1', 'heart_failure'),
    ('E11.9', 'E78.0', 'I10', 'heart_failure'),

    ('E78.0', 'I10', 'I48', 'heart_failure'),
    ('E78.0', 'I10', 'I25.1', 'heart_failure'),
    ('E78.0', 'I10', 'heart_failure'),

    ('I10', 'E78.0', 'I25.1', 'heart_failure'),
    ('I10', 'E11.9', 'E78.0', 'heart_failure'),
    ('I10', 'I48', 'E78.0', 'heart_failure'),
    ('I10', 'E11.9', 'heart_failure'),
    ('I10', 'I48', 'heart_failure'),
    ('I10', 'I25.1', 'heart_failure'),

    ('I25.1', 'E78.0', 'I10', 'heart_failure'),
    ('I25.1', 'I10', 'heart_failure'),

    ('I48', 'I10', 'heart_failure'),
    ('I48', 'I25.1', 'heart_failure'),

    ('J45.9', 'I10', 'heart_failure'),
]

# Keep only the selected pathways. Copy so later column assignments act on an
# independent frame rather than on a slice of the unfiltered one.
trace_frequencies = trace_frequencies[trace_frequencies['Activities'].isin(specific_traces)].copy()

# Fill colour of each condition's box in the tracer plot
color_dict = {
    'heart_failure': '#b3b3b3',
    'I10': '#b3a3cc',
    'I48': '#add8e6',
    'E78.0': '#ddc4a1',
    'I25.1': '#f4b0c8',
    'E11.9': '#c4e3b3',
    'I34.0': '#b3e2d4',
    'J45.9': '#FFFF99',
}

# Define the conversion function for formatting total mean time
def convert_years_to_years_months(years):
    """Format a duration given in fractional years as ``"<Y> yrs - <M> mos"``.

    Parameters
    ----------
    years : int or float
        Duration in years. Any other type, or NaN, yields an empty string.

    Returns
    -------
    str
        The formatted duration, e.g. ``2.5`` -> ``"2 yrs - 6 mos"``; ``""``
        for missing or non-numeric input.
    """
    if isinstance(years, (int, float)) and not pd.isna(years):
        years_int = int(years)
        months = round((years - years_int) * 12)
        # A fraction of 11.5 months or more rounds to 12. Carry it into the year
        # count so 2.99 reads "3 yrs - 0 mos" rather than "2 yrs - 12 mos".
        if abs(months) == 12:
            years_int += 1 if months > 0 else -1
            months = 0
        return f"{years_int} yrs - {months} mos"
    return ""

# Add a human-readable version of 'Total Mean Time'. ``assign`` returns a new
# frame, so this never writes into a slice of a previously filtered DataFrame.
trace_frequencies = trace_frequencies.assign(
    **{'Total Mean Time Formatted': trace_frequencies['Total Mean Time'].apply(convert_years_to_years_months)}
)

# Figure and horizontal layout constants for the tracer plot
fig, ax = plt.subplots(figsize=(18, 11))
gap = 0.25  # Gap between nodes
frequency_position = max(len(t) for t in trace_frequencies['Activities']) + 2
mean_time_position = frequency_position - 0.3


def _heart_failure_position_ok(sequence):
    """Return True when heart_failure is absent, or occurs exactly once and
    after the first occurrence of every other code in the sequence."""
    if 'heart_failure' not in sequence:
        return True
    if sequence.count('heart_failure') > 1:
        return False
    heart_failure_index = sequence.index('heart_failure')
    return all(heart_failure_index > sequence.index(code) for code in sequence if code != 'heart_failure')


# Drop pathways where heart_failure precedes another condition or appears more than once
trace_frequencies = trace_frequencies[
    trace_frequencies['Activities'].apply(_heart_failure_position_ok)
].reset_index(drop=True)

# Insert a blank spacer row wherever the group (first condition) changes.
# The spacer cells are empty strings from the start. Blanking them afterwards
# with ``replace(0, "")`` would also erase genuine zero-year intervals
# (e.g. two conditions recorded on the same day) from the real rows.
rows = []
previous_group = None
for _, row in trace_frequencies.iterrows():
    if previous_group is not None and row['Group'] != previous_group:
        rows.append(pd.Series({
            'Activities': None,
            'Frequency': "",
            'Total Mean Time': "",
            'Group': "",
            'Total Mean Time Formatted': ""
        }))
    rows.append(row)
    previous_group = row['Group']

# Missing values (spacer cells, steps beyond a pathway's length) become ""
trace_frequencies = pd.DataFrame(rows).fillna("")

display(trace_frequencies)

###################################################################################

import pandas as pd
import numpy as np

# Collect the per-step mean-time columns that actually exist in the table.
# A hard-coded list ('Mean Time Node 1-2' ... '6-7') raises KeyError whenever the
# longest pathway has fewer than seven nodes, and ignores steps beyond the sixth.
mean_time_columns = [
    col for col in trace_frequencies.columns if str(col).startswith('Mean Time Node')
]

# Convert those columns to numeric; anything non-numeric (such as the "" fillers) becomes NaN
if mean_time_columns:
    trace_frequencies[mean_time_columns] = trace_frequencies[mean_time_columns].apply(pd.to_numeric, errors='coerce')

# Define a function to calculate the standard deviation for each row
def calculate_row_std(row):
    """Return the standard deviation of one row's non-missing step mean times.

    The columns considered are those listed in the module-level
    ``mean_time_columns``; missing values are dropped before ``std()`` is taken.
    """
    return row[mean_time_columns].dropna().std()

# Row-wise spread of the step mean times, stored as 'Total Mean Time Std'
trace_frequencies['Total Mean Time Std'] = trace_frequencies.apply(calculate_row_std, axis=1)

# Blank out missing values for display
trace_frequencies = trace_frequencies.fillna('')

# Show the table that drives the plot
display(trace_frequencies)


###################################################################################


# Draw one plot row per table row; spacer rows (empty 'Activities') are left blank
pathway_rows = zip(trace_frequencies["Activities"], trace_frequencies["Frequency"], trace_frequencies["Total Mean Time"])
for row, (trace, frequency, mean_time) in enumerate(pathway_rows, start=1):
    if not trace:
        continue

    node_count = len(trace)
    for col, cond in enumerate(trace, start=1):
        x_position = col + (col - 1) * gap

        # Coloured box with the condition code inside
        node_box = mpatches.Rectangle((x_position - 0.48, row - 0.44), 0.8, 0.92,
                                      facecolor=color_dict.get(cond, 'white'), edgecolor='black')
        ax.add_patch(node_box)
        ax.text(x_position - 0.06, row, cond, ha='center', va='center', fontsize=13)

        # The last node has no outgoing arrow
        if col >= node_count:
            continue

        # Arrow to the next node
        next_x_position = x_position + 1 + gap
        ax.annotate(
            '', xy=(next_x_position - 0.46, row), xytext=(x_position + 0.31, row),
            arrowprops=dict(arrowstyle="->", color='black', lw=0.9)
        )

        # Mean time for this step, written under the arrow when a numeric value exists
        mean_time_column = f'Mean Time Node {col}-{col+1}'
        if mean_time_column not in trace_frequencies.columns or not (0 <= row - 1 < len(trace_frequencies)):
            continue
        mean_time_display = trace_frequencies[mean_time_column].iloc[row - 1]
        if isinstance(mean_time_display, (int, float)) and pd.notna(mean_time_display):
            ax.text(
                (x_position + next_x_position) / 2 - 0.06, row - 0.25,
                f'{mean_time_display:.2f} yrs', ha='center', va='center', fontsize=12, color='black'
            )

    # Right-hand side of the row: frequency text and the grey total-time box
    ax.text(frequency_position, row, f'{frequency}', ha='center', va='center', color='grey', fontsize=11)
    total_time_box = mpatches.Rectangle((mean_time_position - 0.17, row - 0.45), 0.7, 0.9,
                                        facecolor='grey', edgecolor='white')
    ax.add_patch(total_time_box)
    formatted_mean_time = convert_years_to_years_months(mean_time)
    ax.text(mean_time_position + 0.2, row, formatted_mean_time, ha='center', va='center', color='white', fontsize=13.5)

# Column headings above the first row
ax.text(frequency_position - 6.5, -0.3, "No. of\n Individuals", ha='center', va='center', fontsize=13, fontweight='bold')
ax.text(mean_time_position + 0.2, -0.2, "Total Mean\n Time (yrs)", ha='center', va='center', fontsize=13, fontweight='bold')

# X axis: fixed limits, with ticks placed by hand under node columns 1-5
ax.set_xlim(0.5, mean_time_position + 0.5)
ax.set_xticks([0.92, 2.17, 3.41, 4.68,6.00])
ax.set_xticklabels(['1', '2', '3', '4','5'], fontsize=14)

# Y axis: inverted so row 1 is at the top; tick labels show each pathway's frequency
row_total = len(trace_frequencies)
ax.set_ylim(row_total + 0.45, 0.55)
ax.set_yticks(range(1, row_total + 1))
ax.set_yticklabels(trace_frequencies['Frequency'], fontsize=14)

# Axis titles
ax.set_xlabel("Disease Sequence", fontsize=14.5, fontweight='bold')
ax.set_ylabel("Disease Pathways Leading to heart_failure Conditions", fontsize=15, fontweight='bold', labelpad=20)

# Legend: one colour patch per condition, placed below the axes
handles = [mpatches.Patch(color=color_dict[cond], label=key) for key, cond in conditions.items()]
ax.legend(handles=handles, bbox_to_anchor=(0.5, -0.07), loc='upper center', ncol=3, fontsize=14, title_fontsize='12')

# Hide right and top spines
for spine in ('right', 'top'):
    ax.spines[spine].set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches


# Reset the names this cell rebinds; every one of them is reassigned further down.
Pre_PH_dataframe = []
Post_PH_dataframe = []
Common_PH_dataframe = []
trace_frequencies = []

# Load the three diagnosis extracts (paths are relative to the notebook's working directory)
Pre_PH_dataframe = pd.read_csv('Pre-PH Common PH Icd10 Codes.csv')
Post_PH_dataframe = pd.read_csv('Post-PH Common PH Icd10 Codes.csv')
Common_PH_dataframe = pd.read_csv('Common-PH Common PH Icd10 Codes.csv')

# Stack them (Pre, Common, Post) and discard fully duplicated rows
combined_dataframe = pd.concat(
    [Pre_PH_dataframe, Common_PH_dataframe, Post_PH_dataframe],
    ignore_index=True,
).drop_duplicates()

# Collapse the three I27.x codes into the single label 'PH/COPD/HF'
ph_conditions = ["I27.0", "I27.2", "I27.9"]
combined_dataframe['ICD10 Codes'] = combined_dataframe['ICD10 Codes'].replace(ph_conditions, 'PH/COPD/HF')

# Parse Diagnosis Date so rows can be ordered chronologically and subtracted later
combined_dataframe['Diagnosis Date'] = pd.to_datetime(combined_dataframe['Diagnosis Date'])

# Conditions tracked in the pathways: display name -> code as it appears in 'ICD10 Codes'
conditions = {
    "Essential (primary) hypertension": "I10",
    "Atrial fibrillation and flutter": "I48",
    "Pure hypercholesterolemia": "E78.0",
    "Atherosclerotic heart disease": "I25.1",
    "Type 2 diabetes mellitus": "E11.9",
    "Mitral (valve) insufficiency": "I34.0",
    "Asthma": "J45.9",
    "PH, COPD, HF Condition": "PH/COPD/HF",
}
condition_codes = list(conditions.values())

# Keep only the tracked codes, ordered per participant by diagnosis date so that
# each participant's rows read as a chronological sequence
is_tracked_condition = combined_dataframe['ICD10 Codes'].isin(condition_codes)
filtered_dataframe = combined_dataframe[is_tracked_condition].sort_values(
    by=['Participant ID', 'Diagnosis Date']
)




def _trace_or_none(participant_rows):
    """Return [[code, date], ...] for one participant, or None when fewer than 3 rows."""
    if len(participant_rows) >= 3:
        return participant_rows[['ICD10 Codes', 'Diagnosis Date']].values.tolist()
    return None


def _pathway_of(trace):
    """Return the ordered tuple of codes (dates dropped) for one trace."""
    return tuple([item[0] for item in trace])


# One trace per participant; participants with fewer than 3 tracked diagnoses are dropped
participant_traces = (
    filtered_dataframe.groupby('Participant ID')
    .apply(_trace_or_none)
    .dropna()
    .reset_index(drop=True)
)
participant_traces = pd.DataFrame(participant_traces, columns=['Traces'])

# Count how many participants share each exact code sequence (pathway)
trace_frequencies = participant_traces['Traces'].apply(_pathway_of).value_counts().reset_index()
trace_frequencies.columns = ['Activities', 'Frequency']

# Keep only pathways followed by at least 10 participants
is_frequent_pathway = trace_frequencies['Frequency'] >= 10
trace_frequencies = trace_frequencies[is_frequent_pathway]



# Hand-entered display labels keyed by pathway tuple. Further down in this cell they
# overwrite the computed 'Total Mean Time Formatted' value for every pathway listed
# here, and the plotting loop prints that column under the "(PH Cohort)" heading.
# NOTE(review): "9 yrs - 12 mos" looks like a months-rounding artefact and presumably
# should read "10 yrs - 0 mos" -- confirm against the underlying numbers.
# NOTE(review): a few strings carry stray whitespace (a trailing space, a double space,
# and a TAB inside the J45.9 entry). They are rendered verbatim, so tidy them only on purpose.
manual_ph_updates = {
    ('I10', 'I48', 'PH/COPD/HF'): "8 yrs - 4 mos ± 1.42 yrs ",
    ('I10', 'I25.1', 'PH/COPD/HF'): "5 yrs - 11 mos  ± 2.76 yrs",
    ('I10', 'E11.9', 'PH/COPD/HF'): "8 yrs - 5 mos ± 2.33 yrs",
    ('I10', 'E78.0', 'I25.1','PH/COPD/HF'): "9 yrs - 12 mos ± 1.96 yrs",
    ('I10', 'E11.9', 'E78.0','PH/COPD/HF'): "9 yrs - 11 mos ± 1.18 yrs",
    ('I10', 'I48', 'E78.0','PH/COPD/HF'): "8 yrs - 7 mos ± 1.16 yrs",
    
    ('E78.0', 'I10', 'PH/COPD/HF'): "5 yrs - 7 mos ± 1.34 yrs",
    ('E78.0', 'I10', 'I25.1','PH/COPD/HF'): "6 yrs - 6 mos ± 2.34 yrs",
    ('E78.0', 'I10', 'I48','PH/COPD/HF'): "6 yrs - 8 mos ± 1.39 yrs",
    
    ('E11.9', 'I10', 'E78.0','PH/COPD/HF'): "7 yrs - 11 mos ± 2.07 yrs",
    ('E11.9', 'I10', 'I48','PH/COPD/HF'): "8 yrs - 8 mos ± 2.64 yrs",
    ('E11.9', 'E78.0' ,'I10', 'I25.1','PH/COPD/HF'): "8 yrs - 2 mos ± 1.46 yrs",
    ('E11.9', 'I10', 'I25.1','PH/COPD/HF'): "7 yrs - 0 mos ± 1.05 yrs",
    ('E11.9', 'E78.0', 'I10','PH/COPD/HF'): "5 yrs - 8 mos ± 0.90 yrs",
    ('E11.9', 'I10','PH/COPD/HF'): "7 yrs - 1 mos ± 2.53 yrs",
    
    ('I48', 'I10', 'PH/COPD/HF'): "8 yrs - 5 mos ± 0.29 yrs",
    ('I48', 'I25.1', 'PH/COPD/HF'): "6 yrs - 3 mos ± 0.54 yrs",
    
    ('I25.1', 'I10', 'PH/COPD/HF'): "7 yrs - 3 mos ± 1.25 yrs",
    ('I25.1', 'E78.0' ,'I10', 'PH/COPD/HF'): "8 yrs - 8 mos ± 1.93 yrs",
    
    ('J45.9', 'I10', 'PH/COPD/HF'): "10 yrs - 8 mos	 ± 1.53 yrs",
}
 

# Hand-entered labels that REPLACE the 'Frequency' column further down in this cell.
# Despite the name, the values are "mean time ± std. dev." strings rather than counts;
# the plotting loop prints that column under the "(COPD Matched)" heading.
# NOTE(review): consider renaming to something like `manual_copd_times` -- the current
# name suggests participant counts.
updated_frequencies = {
    ('I10', 'I48', 'PH/COPD/HF'): "2 yrs - 11 mos ± 1.03 yrs",
    ('I10', 'I25.1', 'PH/COPD/HF'): "4 yrs - 8 mos ± 0.83 yrs",
    ('I10', 'E11.9', 'PH/COPD/HF'): "8 yrs - 2 mos ± 2.08 yrs",
    ('I10', 'E78.0', 'I25.1','PH/COPD/HF'): "13 yrs - 8 mos ± 3.11 yrs",
    ('I10', 'E11.9', 'E78.0','PH/COPD/HF'): "11 yrs - 11 mos ± 2.13 yrs",
    ('I10', 'I48', 'E78.0','PH/COPD/HF'): "1 yr - 6 mos ± 0.01 yrs",
    
    ('E78.0', 'I10', 'PH/COPD/HF'): "3 yrs - 2 mos ± 0.86 yrs",
    ('E78.0', 'I10', 'I25.1','PH/COPD/HF'): "5 yrs - 11 mos ± 3.11 yrs",
    ('E78.0', 'I10', 'I48','PH/COPD/HF'): "8 yrs - 3 mos ± 2.37 yrs",
    
    ('E11.9', 'I10', 'E78.0','PH/COPD/HF'): "11 yrs - 10 mos ± 0.78 yrs",
    ('E11.9', 'I10', 'I48','PH/COPD/HF'): "8 yrs - 3 mos ± 2.03 yrs",
    ('E11.9', 'E78.0' ,'I10', 'I25.1','PH/COPD/HF'): "7 yrs - 11 mos ± 2.38 yrs",
    ('E11.9', 'I10', 'I25.1','PH/COPD/HF'): "6 yrs - 9 mos ± 0.12 yrs",
    ('E11.9', 'E78.0', 'I10','PH/COPD/HF'): "3 yrs - 6 mos ± 1.08 yrs",
    ('E11.9', 'I10','PH/COPD/HF'): "2 yrs - 8 mos ± 0.82 yrs",
    
    ('I48', 'I10', 'PH/COPD/HF'): "6 yrs - 2 mos ± 1.97 yrs",
    ('I48', 'I25.1', 'PH/COPD/HF'): "2 yrs - 5 mos ± 1.32 yrs",
    
    ('I25.1', 'I10', 'PH/COPD/HF'): "2 yrs - 11 mos ± 0.52 yrs",
    ('I25.1', 'E78.0' ,'I10', 'PH/COPD/HF'): "4 yrs - 10 mos ± 0.91 yrs",
    
    ('J45.9', 'I10', 'PH/COPD/HF'): "0 yrs - 0 mos ± 0 yrs",
}

# Hand-entered labels for the "(Heart Failure Matched)" column. The plotting loop looks
# each pathway up here and prints an empty string when the pathway is missing.
# NOTE(review): the name ends in a double "s" (`timess`); it is referenced under that
# exact spelling by the plotting loop, so rename both places together if at all.
# NOTE(review): "11 yr - 11 mos" uses "yr" where the other multi-year entries use "yrs".

manual_hf_timess = {
    ('I10', 'I48', 'PH/COPD/HF'): "5 yrs - 6 mos ± 0.01 yrs ",
    ('I10', 'I25.1', 'PH/COPD/HF'): "5 yrs - 1 mos ± 2.05 yrs",
    ('I10', 'E11.9', 'PH/COPD/HF'): "10 yrs - 4 mos ± 2.14 yrs",
    ('I10', 'E78.0', 'I25.1','PH/COPD/HF'): "11 yrs - 2 mos ± 0.81 yrs",
    ('I10', 'E11.9', 'E78.0','PH/COPD/HF'): "11 yrs - 9 mos ± 1.98 yrs",
    ('I10', 'I48', 'E78.0','PH/COPD/HF'): "11 yr - 11 mos ± 1.31 yrs",
    
    ('E78.0', 'I10', 'PH/COPD/HF'): "5 yrs - 7 mos ± 2.08 yrs",
    ('E78.0', 'I10', 'I25.1','PH/COPD/HF'): "6 yrs - 2 mos ± 1.45 yrs",
    ('E78.0', 'I10', 'I48','PH/COPD/HF'): "7 yrs - 3 mos ± 1.63 yrs",
    
    ('E11.9', 'I10', 'E78.0','PH/COPD/HF'): "7 yrs - 10 mos ± 2.29 yrs",
    ('E11.9', 'I10', 'I48','PH/COPD/HF'): "5 yrs - 9 mos ± 0.16 yrs",
    ('E11.9', 'E78.0' ,'I10', 'I25.1','PH/COPD/HF'): "8 yrs - 4 mos ± 1.81 yrs",
    ('E11.9', 'I10', 'I25.1','PH/COPD/HF'): "8 yrs - 8 mos ± 1.96 yrs",
    ('E11.9', 'E78.0', 'I10','PH/COPD/HF'): "10 yrs - 1 mos ± 1.97 yrs",
    ('E11.9', 'I10','PH/COPD/HF'): "6 yrs - 0 mos ± 1.79 yrs",
    
    ('I48', 'I10', 'PH/COPD/HF'): "7 yrs - 7 mos ± 0.61 yrs",
    ('I48', 'I25.1', 'PH/COPD/HF'): "1 yrs - 4 mos ± 1.35 yrs",
    
    ('I25.1', 'I10', 'PH/COPD/HF'): "9 yrs - 5 mos ± 1.35 yrs",
    ('I25.1', 'E78.0' ,'I10', 'PH/COPD/HF'): "10 yrs - 5 mos ± 1.02 yrs",
    
    ('J45.9', 'I10', 'PH/COPD/HF'): "11 yrs - 2 mos ± 2.57 yrs",
}






def _matched_time_label(activities):
    """Return the hand-entered label for a pathway, or the pathway itself if none exists."""
    # The fallback is the pathway tuple, not the participant count it replaces.
    # NOTE(review): confirm that is intended for pathways missing from `updated_frequencies`.
    return updated_frequencies.get(tuple(activities), activities)


# Overwrite the participant counts in 'Frequency' with the hand-entered labels
trace_frequencies['Frequency'] = trace_frequencies['Activities'].apply(_matched_time_label)




#################################################


# Display the updated DataFrame
#display(trace_frequencies)
########################################################

# Mean time (in years) between consecutive diagnoses, per pathway.
#
# Outputs, all in the row order of trace_frequencies['Activities']:
#   mean_times                    total mean time of each pathway (sum of its transition means)
#   mean_time_between_nodes_list  list of transition means for each pathway
#   mean_time_between_nodes       (from_code, to_code) -> mean; a pair that occurs in several
#                                 pathways keeps the value of the last pathway processed
mean_times = []
mean_time_between_nodes = {}
mean_time_between_nodes_list = []

# Derive every participant's pathway once, rather than rebuilding all of the
# tuples again for each pathway inside the loop.
participant_pathways = participant_traces['Traces'].apply(lambda trace: tuple([item[0] for item in trace]))

for activities in trace_frequencies['Activities']:
    is_match = participant_pathways.apply(lambda pathway: pathway == activities)
    matching_participants = participant_traces[is_match]['Traces']

    # Transition index -> elapsed years, one value per matching participant
    node_differences = {i: [] for i in range(len(activities) - 1)}
    for trace in matching_participants:
        for i in range(len(trace) - 1):
            start_date = trace[i][1]
            end_date = trace[i + 1][1]
            node_diff_years = (end_date - start_date).days / 365.25
            node_differences[i].append(node_diff_years)

    # Average each transition; a transition with no observations counts as 0
    mean_times_for_nodes = []
    for i in node_differences:
        mean_time = sum(node_differences[i]) / len(node_differences[i]) if node_differences[i] else 0
        mean_time_between_nodes[(activities[i], activities[i + 1])] = mean_time
        mean_times_for_nodes.append(mean_time)

    # Total mean time of the pathway = sum of its transition means
    total_mean_time = sum(mean_times_for_nodes)
    mean_times.append(total_mean_time)
    mean_time_between_nodes_list.append(mean_times_for_nodes)

# Attach the per-pathway totals computed above
trace_frequencies['Total Mean Time'] = mean_times

# One 'Mean Time Node k-(k+1)' column per transition of the longest pathway;
# shorter pathways get None for the transitions they do not have
max_nodes = max(len(times) for times in mean_time_between_nodes_list)
for i in range(max_nodes):
    node_column = f'Mean Time Node {i+1}-{i+2}'
    trace_frequencies[node_column] = [
        None if i >= len(times) else times[i] for times in mean_time_between_nodes_list
    ]

# Missing values become 0 so the node columns can simply be summed
trace_frequencies = trace_frequencies.fillna(0)

# Recompute 'Total Mean Time' as the row-wise sum of the node columns
node_columns = [col for col in trace_frequencies.columns if col.startswith('Mean Time Node')]
trace_frequencies['Total Mean Time'] = trace_frequencies[node_columns].sum(axis=1)

# Drop the J45.9 -> I10 -> PH/COPD/HF pathway, tag each remaining pathway with its
# first condition ('Group'), and order by group, then by descending total mean time.
excluded_pathway = ('J45.9', 'I10', 'PH/COPD/HF')
trace_frequencies = (
    trace_frequencies[trace_frequencies['Activities'] != excluded_pathway]
    .assign(Group=lambda frame: frame['Activities'].apply(lambda x: x[0]))
    .sort_values(by=['Group', 'Total Mean Time'], ascending=[True, False])
    .reset_index(drop=True)
)

# Remove the 15th and 17th rows of the sorted table (zero-based labels 14 and 16).
# NOTE(review): these positions are hard-coded, so which pathways they hit depends on
# the data and the sort above, and the call raises KeyError with fewer than 17 rows.
trace_frequencies = trace_frequencies.drop(index=[14, 16]).reset_index(drop=True)







# Column-wise snapshot of the table for plotting: plot key -> list taken from that column
data_dict = {
    plot_key: trace_frequencies[column_name].tolist()
    for plot_key, column_name in (
        ("Trace", 'Activities'),
        ("Frequency", 'Frequency'),
        ("Total Mean Time", 'Total Mean Time'),
    )
}


# Fill colour of each condition's node box, and of its legend patch
color_dict = {
    'PH/COPD/HF': '#b3b3b3',
    'I10': '#b3a3cc',
    'I48': '#add8e6',
    'E78.0': '#ddc4a1',
    'I25.1': '#f4b0c8',
    'E11.9': '#c4e3b3',
    'I34.0': '#b3e2d4',
    'J45.9': '#FFFF99',
}


# Define the conversion function
def convert_years_to_years_months(years):
    """Format a duration given in fractional years as ``"<Y> yrs - <M> mos"``.

    Parameters
    ----------
    years : int or float
        Duration in years, e.g. ``8.5``.

    Returns
    -------
    str
        The formatted duration, e.g. ``"8 yrs - 6 mos"``. An empty string is
        returned when ``years`` is not an ``int``/``float`` or is NaN.
    """
    if not isinstance(years, (int, float)) or pd.isna(years):
        return ""  # Return an empty string if the input is not a number

    years_int = int(years)
    months = round((years - years_int) * 12)

    # A fraction of about 0.96 or more rounds to 12 months; carry it into the
    # years so 9.99 reads "10 yrs - 0 mos" instead of "9 yrs - 12 mos".
    if abs(months) == 12:
        years_int += 1 if months > 0 else -1
        months = 0

    return f"{years_int} yrs - {months} mos"

# Render each pathway's computed total (in years) as "<Y> yrs - <M> mos"
trace_frequencies['Total Mean Time Formatted'] = trace_frequencies['Total Mean Time'].apply(convert_years_to_years_months)


# Prepare data for plotting
data_dict = {
    "Trace": trace_frequencies['Activities'].tolist(),
    "Frequency": trace_frequencies['Frequency'].tolist(),
    "Total Mean Time": trace_frequencies['Total Mean Time'].tolist(),
}


# Prefer the hand-entered label for a pathway; otherwise keep the label computed
# for that same row. Pairing the two columns row by row avoids re-scanning the
# whole frame to find the fallback for every row, including rows that have a
# manual label and never need it.
trace_frequencies['Total Mean Time Formatted'] = [
    manual_ph_updates.get(tuple(activities), computed_label)
    for activities, computed_label in zip(
        trace_frequencies['Activities'],
        trace_frequencies['Total Mean Time Formatted'],
    )
]
#display(trace_frequencies)
#display(trace_frequencies)

# Create the figure and the horizontal layout constants used by the plotting loop.
fig, ax = plt.subplots(figsize=(15, 8))

# Node k is drawn at x = k + (k - 1) * gap, so a negative gap places consecutive
# node boxes less than one x-unit apart.
gap = -0.39

# Anchors for the three grey summary columns; the plotting code offsets from these.
frequency_position = max(len(t) for t in data_dict["Trace"]) + 2
mean_time_position = frequency_position + 0.05
heart_failure_position = mean_time_position + 1.5

# The padded DataFrame copy of `data_dict` that used to be built here has been removed.
# Nothing read it before `data_dict` is rebuilt from `trace_frequencies` further down,
# and the separator rows that matter are inserted into `trace_frequencies` itself.



# Insert a blank separator row into trace_frequencies wherever the first condition of
# the pathway changes, so that groups are visually separated in the figure.
# The loop compares the iterrows() label with len(...) - 1 and uses it as an .iloc
# position, so it relies on the frame having the default 0..n-1 index at this point.
rows = []
for index, row in trace_frequencies.iterrows():
    rows.append(row)
    # Add an empty row after a change in the first element of 'Activities'
    if index < len(trace_frequencies) - 1:  # Ensure we don't add an unnecessary row at the end
        # First code of this pathway and of the next one (None when the cell is not a tuple)
        current_group = row['Activities'][0] if isinstance(row['Activities'], tuple) else None
        next_group = trace_frequencies.iloc[index + 1]['Activities'][0] if isinstance(trace_frequencies.iloc[index + 1]['Activities'], tuple) else None
        if current_group != next_group:
            # Create an empty row with default values (adjust as necessary)
            # NOTE(review): the node columns are listed by hand (up to 'Node 4-5'); any
            # other column present in trace_frequencies is filled by fillna("") below.
            empty_row = pd.Series({
                'Activities': None,
                'Frequency': 0,
                'Total Mean Time': 0,
                'Mean Time Node 1-2': 0,
                'Mean Time Node 2-3': 0,
                'Mean Time Node 3-4': 0,
                'Mean Time Node 4-5': 0,
                'Group': "",
                'Total Mean Time Formatted': ""
            })
            rows.append(empty_row)



         
         
            
# Convert the modified list of rows back to a DataFrame
# Missing values and every 0 become "" -- this blanks the separator rows, but it also
# blanks any genuine 0 in the data rows (e.g. the zero padding in unused node columns).
trace_frequencies = pd.DataFrame(rows).fillna("").replace(0, "")




def _ph_follows_every_other_condition(activities):
    """Return True unless 'PH/COPD/HF' first appears before some other condition."""
    if 'PH/COPD/HF' not in activities:
        # Also covers the blank separator rows, whose 'Activities' value is ""
        return True
    ph_at = activities.index('PH/COPD/HF')
    return all(ph_at > activities.index(cond) for cond in activities if cond != 'PH/COPD/HF')


# Keep only rows in which PH/COPD/HF comes after every other condition of the pathway
keep_row = trace_frequencies['Activities'].apply(_ph_follows_every_other_condition)
trace_frequencies = trace_frequencies[keep_row].reset_index(drop=True)


# Rebuild the plotting dictionary from the padded, filtered table
data_dict = {
    plot_key: trace_frequencies[column_name].tolist()
    for plot_key, column_name in (
        ("Trace", 'Activities'),
        ("Frequency", 'Frequency'),
        ("Total Mean Time", 'Total Mean Time'),
    )
}



# Draw one figure row per table row: the coloured node boxes of the pathway, followed by
# three grey cells (Frequency label, PH-cohort time label, heart-failure-matched label).
grey_cell_style = dict(facecolor='grey', edgecolor='white')
white_label_style = dict(ha='center', va='center', color='white')

plot_rows = zip(
    trace_frequencies["Activities"],
    trace_frequencies["Frequency"],
    trace_frequencies["Total Mean Time"],
    trace_frequencies["Total Mean Time Formatted"],
)
for row, (trace, frequency, mean_time, formatted_mean_time) in enumerate(plot_rows, start=1):
    if not trace:
        # Blank separator row: leave the line empty in the figure
        continue

    # Pathway nodes, left to right, each labelled with its code
    for col, cond in enumerate(trace, start=1):
        x_position = col + (col - 1) * gap  # Add gap between nodes
        node_box = plt.Rectangle(
            (x_position - 0.49, row - 0.45), 0.6, 0.9,
            facecolor=color_dict.get(cond, 'white'), edgecolor='black',
        )
        ax.add_patch(node_box)
        ax.text(x_position - 0.18, row, cond, ha='center', va='center')

    # Grey cell showing the 'Frequency' column value
    ax.add_patch(plt.Rectangle((frequency_position - 2.20, row - 0.45), 1.1, 0.99, **grey_cell_style))
    ax.text(frequency_position - 1.65, row, f'{frequency}', **white_label_style)

    # Grey cell showing the (manually overridden) 'Total Mean Time Formatted' value
    ax.add_patch(plt.Rectangle((mean_time_position - 3.42, row - 0.45), 1.1, 0.99, **grey_cell_style))
    ax.text(mean_time_position - 2.87, row, formatted_mean_time, **white_label_style)

    # Grey cell for the Heart Failure Matched label; blank when the pathway has no entry
    ax.add_patch(plt.Rectangle((heart_failure_position - 2.57, row - 0.45), 1.1, 0.99, **grey_cell_style))
    current_activities = trace
    formatted_hf_time = manual_hf_timess.get(current_activities, "")
    ax.text(heart_failure_position - 2.04, row, formatted_hf_time, **white_label_style)
    #formatted_hf_time = trace_frequencies['Total Mean Time Formatted'].iloc[row - 1]  # Adjust index for correct row
    #ax.text(heart_failure_position - 1.61, row, formatted_hf_time, ha='center', va='center', color='white')



# Headings for the three grey summary columns
plt.text(frequency_position - 1.60, -0.2, "Total Mean Time ± Std. dev.\n(COPD Matched)", ha='center', va='center', fontsize=10)
plt.text(mean_time_position - 2.91, -0.2, "Total Mean Time ± Std. dev.\n(PH Cohort)", ha='center', va='center', fontsize=10)
plt.text(heart_failure_position - 2.05, -0.2, "Total Mean Time ± Std. dev.\n(Heart Failure Matched)", ha='center', va='center',fontsize=10)


# Set x-axis limits
ax.set_xlim(0.5, mean_time_position + 0.5)

# Hand-placed x ticks, labelled with the position in the disease sequence
ax.set_xticks([0.8, 1.41, 2.02, 2.67, 3.25])
ax.set_xticklabels(['1', '2', '3', '4', '5'])

# Y axis: the larger limit is passed first, which flips the axis so that row 1 is
# at the top. No y ticks are drawn.
ax.set_ylim(len(data_dict["Trace"]) + 0.45, 0.55)
ax.set_yticks([])

# Set axis labels with increased font size
ax.set_xlabel("Disease Sequence", fontsize=13)
ax.set_ylabel("Disease Pathways", fontsize=13, labelpad=10)

# Legend: one colour patch per entry of `conditions`, labelled with the full condition name
handles = [mpatches.Patch(color=color_dict[cond], label=key) for key, cond in conditions.items()]
ax.legend(handles=handles, bbox_to_anchor=(0.5, -0.1), loc='upper center', ncol=4, fontsize=10, title_fontsize='11')

# Hide right and top spines for better presentation.
# The loop variable is named `spine` rather than `dir`: at cell level the name stays
# bound after the loop and would shadow the builtin dir() for the rest of the session.
for spine in ['right', 'top']:
    ax.spines[spine].set_visible(False)

plt.tight_layout()
plt.show()