In [35]:
import pandas as pd
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
from itertools import chain, combinations

# Load the processed ITP lifespan table that the rest of the script analyses
data_path = 'C:\\Users\\ndsch\\Data\\ITP-Lifespan-Data\\ITP_processed_data\\ITP_2004-2016_concat_simple.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Function to process logrank tests for a given treatment and sex
# The first line defines the function process_logrank with four input parameters: df, the input DataFrame containing the data; treatment, a string representing the treatment group to analyze; sex, an optional parameter that, if provided, will filter the data by the specified sex ('m' or 'f'); and sites, an optional collection of site names that, if provided, restricts both the treatment and the control data to those sites.
def process_logrank(df, treatment, sex=None, sites=None):
    """Compare every dose/cohort arm of one treatment against its controls.

    For each (Rx(ppm), cohort) pair observed for ``treatment``, the matching
    treated rows are compared with the 'Control' rows of the same cohort using
    a logrank test. Kaplan-Meier fits of both arms supply the median survival
    times, from which the percentage change in median lifespan is derived.

    Args:
        df: DataFrame providing the columns 'treatment', 'Rx(ppm)', 'cohort',
            'age(days)', 'dead', 'age_initiation(mo)' and 'group', plus
            'sex' and 'site' when the corresponding filters are used.
        treatment: Value of the 'treatment' column to analyse.
        sex: When truthy, both arms are restricted to rows whose 'sex'
            equals this value; otherwise all rows are pooled and the result
            is labelled "m+f".
        sites: When truthy, both arms are restricted to rows whose 'site' is
            in this collection and the result is labelled with the sites
            joined by '+'; otherwise the label is "combined".

    Returns:
        A list with one dict per (Rx(ppm), cohort) pair for which both the
        treated and the control arm are non-empty.
    """
    rows = []

    arm = df[df['treatment'] == treatment]
    # Doses and cohorts are enumerated before the site filter is applied, so
    # the iteration order follows the unfiltered treatment rows.
    doses = arm['Rx(ppm)'].unique()
    cohorts = arm['cohort'].unique()

    if sites:
        arm = arm[arm['site'].isin(sites)]

    sex_label = sex if sex else "m+f"

    for dose in doses:
        for cohort in cohorts:
            treated_mask = (arm['Rx(ppm)'] == dose) & (arm['cohort'] == cohort)
            control_mask = (df['treatment'] == 'Control') & (df['cohort'] == cohort)
            if sex:
                treated_mask = treated_mask & (arm['sex'] == sex)
                control_mask = control_mask & (df['sex'] == sex)

            treated = arm[treated_mask]
            controls = df[control_mask]
            if sites:
                controls = controls[controls['site'].isin(sites)]

            # A comparison needs animals on both sides.
            if treated.empty or controls.empty:
                continue

            # Kaplan-Meier fit of the treated arm
            km_treated = KaplanMeierFitter()
            km_treated.fit(treated['age(days)'], event_observed=treated['dead'])

            # Kaplan-Meier fit of the control arm
            km_controls = KaplanMeierFitter()
            km_controls.fit(controls['age(days)'], event_observed=controls['dead'])

            # Logrank test of treated versus control survival
            test = logrank_test(
                treated['age(days)'],
                controls['age(days)'],
                event_observed_A=treated['dead'],
                event_observed_B=controls['dead'],
            )

            treated_median = km_treated.median_survival_time_
            control_median = km_controls.median_survival_time_
            # Change in median survival relative to the control median, in percent
            pct_increase = ((treated_median - control_median) / control_median) * 100

            rows.append({
                "treatment": treatment,
                "Rx(ppm)": dose,
                "age_initiation(mo)": treated['age_initiation(mo)'].unique()[0],
                "group": treated['group'].unique()[0],
                "cohort": cohort,
                "sex": sex_label,
                "site": '+'.join(sites) if sites else "combined",
                "test_statistic": test.test_statistic,
                "p-value": test.p_value,
                "%_lifespan_increase": pct_increase,
                "treatment_median_survival": treated_median,
                "control_median_survival": control_median,
                "treatment_max_survival": treated['age(days)'].max(),
                "control_max_survival": controls['age(days)'].max(),
                "treatment_sample_size": len(treated),
                "control_sample_size": len(controls),
            })

    return rows

# Every non-empty subset of the sites present in the data, smallest subsets first
unique_sites = list(df['site'].unique())
site_combinations = [
    subset
    for subset_size in range(1, len(unique_sites) + 1)
    for subset in combinations(unique_sites, subset_size)
]

# Initialize an empty list for the results
results = []

# Get the list of unique treatments excluding 'Control'. This does not depend
# on the site combination, so it is computed once rather than on every pass
# of the loop below.
unique_treatments = [treatment for treatment in df['treatment'].unique() if treatment != 'Control']

# Loop through each combination of sites
for site_combination in site_combinations:
    for treatment in unique_treatments:
        # Process logrank tests for each treatment: male, female, and both
        # sexes pooled (sex left at its default of None)
        treatment_logrank_male = process_logrank(df, treatment, 'm', sites=site_combination)
        treatment_logrank_female = process_logrank(df, treatment, 'f', sites=site_combination)
        treatment_logrank_combined = process_logrank(df, treatment, sites=site_combination)

        # Combine the results for male, female, and combined sex groups
        results.extend(treatment_logrank_male + treatment_logrank_female + treatment_logrank_combined)

# Convert the results list into a pandas DataFrame
results_df = pd.DataFrame(results)

# Alphabetize the output DataFrame by the 'treatment' column.
# When no comparison produced a result the frame has no columns at all, and
# sorting by 'treatment' would raise KeyError, so only sort a non-empty frame.
if not results_df.empty:
    results_df = results_df.sort_values(by='treatment')

# Specify the output CSV file path
output_csv_path = 'C:\\Users\\ndsch\\Data\\ITP-Lifespan-Data\\ITP_processed_data\\ITP_logrank_simple.csv'

# Save the results DataFrame to a CSV file
results_df.to_csv(output_csv_path, index=False)

# Print a message to indicate the results are saved to the CSV file
print(f"Results saved to {output_csv_path}")

KeyError: 'Treatment'