In [14]:
import pandas as pd
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

# Location of the input CSV, followed by loading it into a DataFrame
data_path = 'C:\\Users\\ndsch\\Data\\ITP-Lifespan-Data\\ITP_processed_data\\ITP_2004-2016_concat_simple.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Log-rank comparison of one treatment against cohort-matched controls
def process_logrank(df, treatment, sex=None):
    """Run a log-rank test for every dose/cohort of ``treatment`` versus controls.

    For each combination of ``Rx(ppm)`` and ``cohort`` found among the rows
    whose ``treatment`` column equals ``treatment``, the treated rows are
    compared with the ``'Control'`` rows of the same cohort. Combinations for
    which either group is empty are skipped.

    Args:
        df: DataFrame providing the columns ``treatment``, ``Rx(ppm)``,
            ``cohort``, ``sex``, ``age(days)`` and ``dead``.
        treatment: Value of the ``treatment`` column to analyse.
        sex: Optional value of the ``sex`` column (e.g. ``'m'`` or ``'f'``)
            used to restrict both groups. When falsy, no sex filtering is
            applied and the result rows are labelled ``"combined"``.

    Returns:
        A list of dictionaries, one per analysed dose/cohort pair, with the
        keys ``"Treatment"``, ``"Rx(ppm)"``, ``"Cohort"``, ``"Sex"``,
        ``"Test statistic"``, ``"P-value"`` and ``"%_lifespan_increase"``
        (relative difference of the Kaplan-Meier median survival times,
        in percent of the control median).
    """
    results = []

    treated = df[df['treatment'] == treatment]
    controls = df[df['treatment'] == 'Control']

    # Doses and cohorts are taken from the treatment rows before any sex
    # filtering, so the iteration order is the same for every value of `sex`.
    unique_rx_ppm = treated['Rx(ppm)'].unique()
    unique_cohorts = treated['cohort'].unique()

    # The sex restriction does not depend on dose or cohort: apply it once.
    if sex:
        treated = treated[treated['sex'] == sex]
        controls = controls[controls['sex'] == sex]

    for rx_ppm in unique_rx_ppm:
        for cohort in unique_cohorts:
            treatment_group = treated[
                (treated['Rx(ppm)'] == rx_ppm) & (treated['cohort'] == cohort)
            ]
            # Controls are matched on cohort only, never on dose. Requiring
            # the control rows to carry the treatment's Rx(ppm) value (as the
            # combined-sex path used to do) made that path disagree with the
            # per-sex path and left it without a control group whenever
            # controls are recorded with a different dose value.
            control_group = controls[controls['cohort'] == cohort]

            # Nothing to compare for this dose/cohort pair
            if treatment_group.empty or control_group.empty:
                continue

            # Kaplan-Meier fits provide the median survival time of each group
            kmf_treatment = KaplanMeierFitter()
            kmf_treatment.fit(
                treatment_group['age(days)'],
                event_observed=treatment_group['dead'],
            )

            kmf_control = KaplanMeierFitter()
            kmf_control.fit(
                control_group['age(days)'],
                event_observed=control_group['dead'],
            )

            logrank_result = logrank_test(
                treatment_group['age(days)'],
                control_group['age(days)'],
                event_observed_A=treatment_group['dead'],
                event_observed_B=control_group['dead'],
            )

            # NOTE(review): if a group's survival curve never reaches 0.5 the
            # median is presumably reported as infinite, which would make this
            # value inf or NaN -- confirm against the lifelines documentation.
            control_median = kmf_control.median_survival_time_
            percentage_lifespan_increase = (
                (kmf_treatment.median_survival_time_ - control_median)
                / control_median
            ) * 100

            results.append({
                "Treatment": treatment,
                "Rx(ppm)": rx_ppm,
                "Cohort": cohort,
                "Sex": sex if sex else "combined",
                "Test statistic": logrank_result.test_statistic,
                "P-value": logrank_result.p_value,
                "%_lifespan_increase": percentage_lifespan_increase,
            })

    return results

# Run the Inu analysis three times: males only, females only, and without sex filtering
Inu_logrank_male = process_logrank(df, treatment='Inu', sex='m')
Inu_logrank_female = process_logrank(df, treatment='Inu', sex='f')
Inu_logrank_combined = process_logrank(df, treatment='Inu')

# Stack the three result lists in male, female, combined order
results = [*Inu_logrank_male, *Inu_logrank_female, *Inu_logrank_combined]

# Tabulate: one row per result dictionary
results_df = pd.DataFrame(data=results)

# Destination of the tabulated results
output_csv_path = 'C:\\Users\\ndsch\\Data\\ITP-Lifespan-Data\\ITP_processed_data\\Inu_logrank.csv'

# Write the table without the DataFrame index column
results_df.to_csv(path_or_buf=output_csv_path, index=False)

# Report where the file was written
print(f"Results saved to {output_csv_path}")



Results saved to C:\Users\ndsch\Data\ITP-Lifespan-Data\ITP_processed_data\Inu_logrank.csv
