In [9]:
import pandas as pd
from itertools import product
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

# Load the input CSV into the module-level DataFrame used by the rest of the cell
data_path = 'C:\\Users\\ndsch\\Data\\ITP-Lifespan-Data\\ITP_processed_data\\ITP_2004-2016_concat_simple.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

def get_combinations(df):
    """Return the distinct (treatment, Rx(ppm), cohort, sex) tuples of treated rows.

    Only combinations that occur together in at least one row of ``df`` are
    returned, in order of first appearance. Rows whose ``treatment`` equals
    "Control" are excluded. Building the list from the rows themselves (rather
    than from the Cartesian product of each column's unique values) avoids
    emitting thousands of tuples that match no data and would only be skipped
    later.

    The result is meant to be passed as the ``combinations`` argument of
    ``generate_itp_logrank``.

    Args:
        df: DataFrame with "treatment", "Rx(ppm)", "cohort" and "sex" columns.

    Returns:
        A list of ``(treatment, rx_ppm, cohort, sex)`` tuples; empty when
        ``df`` has no non-Control rows.
    """
    key_columns = ["treatment", "Rx(ppm)", "cohort", "sex"]
    treated_rows = df.loc[df["treatment"] != "Control", key_columns]
    # name=None yields plain tuples, matching the shape callers unpack.
    return list(treated_rows.drop_duplicates().itertuples(index=False, name=None))

def generate_itp_logrank(df, combinations):
    """Run a logrank test of each treated group against its matching control.

    For every ``(treatment, rx_ppm, cohort, sex)`` tuple, the treated rows and
    the "Control" rows sharing the same Rx(ppm), cohort and sex are selected
    from ``df``. Combinations where either selection is empty are skipped.
    Otherwise Kaplan-Meier curves are fitted to both groups, a logrank test is
    run on "age(days)" with "dead" as the event indicator, and the percentage
    change in median survival time relative to the control is computed.

    Two summary lines (processed and skipped counts) are printed.

    Args:
        df: DataFrame with "treatment", "Rx(ppm)", "cohort", "sex",
            "age(days)" and "dead" columns.
        combinations: Iterable of ``(treatment, rx_ppm, cohort, sex)`` tuples.

    Returns:
        A DataFrame with one row per processed combination and the columns
        "Treatment+Rx+Cohort+Sex", "Control+Rx+Cohort+Sex", "Test statistic",
        "P-value" and "%_lifespan_increase". The columns are present even when
        no combination was processed, so the saved CSV always has a header.
    """
    result_columns = [
        "Treatment+Rx+Cohort+Sex",
        "Control+Rx+Cohort+Sex",
        "Test statistic",
        "P-value",
        "%_lifespan_increase",
    ]
    results = []
    skipped_count = 0

    for treatment, rx_ppm, cohort, sex in combinations:
        # The dose/cohort/sex mask is shared by both groups; build it once.
        # NOTE(review): controls are matched on the treated group's Rx(ppm).
        # If control rows carry a different dose value (e.g. 0 or NaN),
        # control_data is always empty and every combination is skipped --
        # confirm against the data.
        stratum = (df["Rx(ppm)"] == rx_ppm) & (df["cohort"] == cohort) & (df["sex"] == sex)
        treatment_data = df[(df["treatment"] == treatment) & stratum]
        control_data = df[(df["treatment"] == "Control") & stratum]

        # Nothing to compare when either arm has no animals.
        if treatment_data.empty or control_data.empty:
            skipped_count += 1
            continue

        # The Kaplan-Meier fits are only needed for the median survival times.
        kmf_treatment = KaplanMeierFitter()
        kmf_treatment.fit(treatment_data["age(days)"], event_observed=treatment_data["dead"])

        kmf_control = KaplanMeierFitter()
        kmf_control.fit(control_data["age(days)"], event_observed=control_data["dead"])

        logrank_result = logrank_test(
            treatment_data["age(days)"],
            control_data["age(days)"],
            event_observed_A=treatment_data["dead"],
            event_observed_B=control_data["dead"],
        )

        # Percentage change of the treated median relative to the control median.
        # NOTE(review): lifelines documents the median as infinite when the
        # curve never reaches 0.5, which would make this value inf/NaN -- verify
        # whether such groups need special handling.
        control_median = kmf_control.median_survival_time_
        percentage_lifespan_increase = (
            (kmf_treatment.median_survival_time_ - control_median) / control_median
        ) * 100

        results.append({
            "Treatment+Rx+Cohort+Sex": f"{treatment}+{rx_ppm}+{cohort}+{sex}",
            "Control+Rx+Cohort+Sex": f"Control+{rx_ppm}+{cohort}+{sex}",
            "Test statistic": logrank_result.test_statistic,
            "P-value": logrank_result.p_value,
            "%_lifespan_increase": percentage_lifespan_increase,
        })

    # Explicit columns keep the schema stable when nothing was processed.
    results_df = pd.DataFrame(results, columns=result_columns)
    print(f"Processed combinations: {len(results)}")
    print(f"Skipped combinations due to empty data: {skipped_count}")
    return results_df

# Destination for the logrank summary table
output_csv_path = 'C:\\Users\\ndsch\\Data\\ITP-Lifespan-Data\\ITP_processed_data\\ITP_logrank.csv'

# Enumerate the treated (treatment, Rx(ppm), cohort, sex) groups, then test each against its control
combinations = get_combinations(df)
itp_logrank = generate_itp_logrank(df=df, combinations=combinations)

# Write the table without the index column and report where it went
itp_logrank.to_csv(path_or_buf=output_csv_path, index=False)
print(f"Results saved to {output_csv_path}")


Total combinations: 27720

First 5 rows of the input DataFrame:
  population cohort site sex       id     group  Rx(ppm)  age_initiation(mo)  \
0    UM-HET3  C2004  TJL   m  JL00005  4-OH-PBN    315.0                   4   
1    UM-HET3  C2004  TJL   m  JL00006  4-OH-PBN    315.0                   4   
2    UM-HET3  C2004  TJL   m  JL00007  4-OH-PBN    315.0                   4   
3    UM-HET3  C2004  TJL   f  JL00095  4-OH-PBN    315.0                   4   
4    UM-HET3  C2004  TJL   f  JL00096  4-OH-PBN    315.0                   4   

  status  dead  age(days) treatment                           full_name  
0   dead  True        896  4-OH-PBN  4-OH-a-phenyl-N-tert-butyl nitrone  
1   dead  True       1077  4-OH-PBN  4-OH-a-phenyl-N-tert-butyl nitrone  
2   dead  True        790  4-OH-PBN  4-OH-a-phenyl-N-tert-butyl nitrone  
3   dead  True        823  4-OH-PBN  4-OH-a-phenyl-N-tert-butyl nitrone  
4   dead  True        601  4-OH-PBN  4-OH-a-phenyl-N-tert-butyl nitrone  

Unique val

KeyboardInterrupt: 