In [70]:
# This notebook assumes you have downloaded the ITP lifespan data in .xlsx files from https://phenome.jax.org/projects/ITP1
# It is a bit of a pain to download as there is one file for each year, accessible only through several clicks each
# Most of these files have the same format (same column headers), but there are a few differences (C2014 in particular has extra columns)
# C2014 and after also change the name of the 'age(days)' column to 'age'
# Finally there are empty cells, especially in the age_initiation(mo) column in cases where the row is a control animal
# We need to fill those empty cells or else they will cause an error when processing those files in future steps

# packages needed for manipulating dataframes and uploading and concatenating xlsx files 
import pandas as pd
import os
import numpy as np

# suppress the warning about one of the Excel files having an unknown extension or (more likely in this case) an unknown feature in the file.
# this typically won't affect anything, and this is partly why it is good to use csvs! But ITP provided .xlsx files...
import warnings
from openpyxl import Workbook
warnings.simplefilter("ignore", category=UserWarning)

# First we need to input the raw data provided by the ITP and make sure the files are formatted in exactly the same way so that we can later 
# concatenate them into a single file. First, we will print the column headers contained in each file to make sure they match. If they don't
# we'll need to make them match so that concatenation works correctly, and so that there are no extra columns with empty cells created, which 
# will cause problems later on (empty cells are bad!). 

# Folder holding the raw per-cohort ITP workbooks. Edit this path to match your machine.
data_folder = 'C:\\Users\\ndsch\\Data\\ITP-Lifespan-Data\\ITP_raw_data\\'

# Collect the workbooks to inspect.
#  - sorted(): os.listdir() returns entries in arbitrary order, so sort the names to keep
#    the table rows (and any later concatenation over file_names) reproducible.
#  - skip names starting with '~$': Excel writes these lock files next to a workbook that
#    is currently open; they end in .xlsx but are not real workbooks, so pd.read_excel
#    would fail on them.
file_names = sorted(
    file for file in os.listdir(data_folder)
    if file.endswith('.xlsx') and not file.startswith('~$')
)

# unique_columns: column header -> number of files that contain it
# column_files:   column header -> names of the files that contain it
unique_columns = {}
column_files = {}

for file_name in file_names:
    file_path = os.path.join(data_folder, file_name)
    temp_df = pd.read_excel(file_path, engine='openpyxl')

    # Tally every header of this file and remember which file it came from
    for column in temp_df.columns:
        unique_columns[column] = unique_columns.get(column, 0) + 1
        column_files.setdefault(column, []).append(file_name)

# Presence table: one row per file, one column per distinct header (cells start as NaN)
table_df = pd.DataFrame(index=file_names, columns=list(unique_columns))

# Mark with an 'X' every (file, header) pair that was seen above
for column, files in column_files.items():
    for file in files:
        table_df.at[file, column] = 'X'

# Blank out the remaining NaN cells so only the 'X' marks are visible
print(table_df.fillna(''))

                    population cohort site sex id group Rx(ppm)  \
Lifespan_C2004.xlsx          X      X    X   X  X     X       X   
Lifespan_C2005.xlsx          X      X    X   X  X     X       X   
Lifespan_C2006.xlsx          X      X    X   X  X     X       X   
Lifespan_C2007.xlsx          X      X    X   X  X     X       X   
Lifespan_C2009.xlsx          X      X    X   X  X     X       X   
Lifespan_C2010.xlsx          X      X    X   X  X     X       X   
Lifespan_C2011.xlsx          X      X    X   X  X     X       X   
Lifespan_C2012.xlsx          X      X    X   X  X     X       X   
Lifespan_C2013.xlsx          X      X    X   X  X     X       X   
Lifespan_C2014.xlsx          X      X    X   X  X     X       X   
Lifespan_C2015.xlsx          X      X    X   X  X     X       X   
Lifespan_C2016.xlsx          X      X    X   X  X     X       X   

                    age_initiation(mo) status dead age(days) Status Dead Age  \
Lifespan_C2004.xlsx                  X      X   

In [71]:
# From the above output, you can see that Lifespan_C2014 is formatted quite differently from the other files, and 2015 and 2016 use 'age'
# instead of 'age(days)'. Let's standardize the column headers and get rid of the extra DOB and DOE data from the 2014 file.
# Per-file frames, collected after their headers have been brought in line
dfs = []

# Header fixes for the files that deviate from the common layout
c2014_renames = {'Status': 'status', 'Dead': 'dead', 'Age': 'age(days)'}
late_cohort_files = ('Lifespan_C2015.xlsx', 'Lifespan_C2016.xlsx')

for file_name in file_names:
    file_path = os.path.join(data_folder, file_name)
    temp_df = pd.read_excel(file_path, engine='openpyxl')

    if file_name in late_cohort_files:
        # These files only differ in the name of the age column
        temp_df = temp_df.rename(columns={'age': 'age(days)'})
    elif file_name == 'Lifespan_C2014.xlsx':
        # This file has the extra DOB/DOE columns and capitalised headers
        temp_df = temp_df.drop(columns=['DOB', 'DOE']).rename(columns=c2014_renames)

    dfs.append(temp_df)

# Stack every file into one frame with a fresh 0..n-1 row index
df = pd.concat(dfs, ignore_index=True)

# Show the resulting headers so they can be checked by eye
print("Column headers of the concatenated DataFrame:")
print(df.columns)

Column headers of the concatenated DataFrame:
Index(['population', 'cohort', 'site', 'sex', 'id', 'group', 'Rx(ppm)',
       'age_initiation(mo)', 'status', 'dead', 'age(days)'],
      dtype='object')


In [75]:
# Code to check the age(days) column, as I keep getting an error in the Streamlit app script saying that this column contains non-numeric values.
# But it all looks numeric according to the check below.
# Try to read 'age(days)' as numbers; anything that cannot be parsed (or is missing) becomes NaN
age_as_number = pd.to_numeric(df['age(days)'], errors='coerce')
non_numeric_age_rows = df[age_as_number.isna()]

# Either confirm the column is clean or list the offending rows
if non_numeric_age_rows.empty:
    print("No non-numeric values found in the 'age(days)' column.")
else:
    print("Rows with non-numeric values in the 'age(days)' column:")
    print(non_numeric_age_rows)

No non-numeric values found in the 'age(days)' column.


In [72]:
# Everything looks good. Now let's check what we are working with in terms of different treatments. 
# Let's output a list of all the unique treatment names in the 'group' column, and corresponding values of interest
# I know already that the same treatment was e.g. performed on different cohorts, or at different doses, or at different ages of initiation

# Raise the row-display limit to 1000
pd.set_option('display.max_rows', 1000)

# One row per distinct (group, cohort, dose, age of initiation) combination, ordered by group
columns_of_interest = ['group', 'cohort', 'Rx(ppm)', 'age_initiation(mo)']
unique_combinations = df[columns_of_interest].drop_duplicates()
unique_combinations = unique_combinations.sort_values('group')
print(unique_combinations.to_string(index=False))

             group cohort  Rx(ppm) age_initiation(mo)
             17aE2  C2009      4.8               10.0
         17aE2_16m  C2016     14.4               16.0
         17aE2_20m  C2016     14.4               20.0
          17aE2_hi  C2011       14               10.0
          4-OH-PBN  C2004      315                4.0
               ACA  C2009   1000.0                4.0
               ACA  C2012     1000               16.0
            ACA_hi  C2013     2500                4.0
            ACA_lo  C2013      400                4.0
           ACA_mid  C2013     1000                4.0
               Asp  C2004       21                4.0
           Asp_200  C2014      200               11.0
            Asp_60  C2014       60               11.0
           CAPE_hi  C2005      300                4.0
           CAPE_lo  C2005       30                4.0
                CC  C2016     30.0                8.0
              Cana  C2016    180.0                7.0
           Control  C2011   

In [73]:
# The group names sometimes have '_hi' or '_lo' or other suffixes. Let's add a column called treatment that just contains the drug name
# but retains the group column in case we want to use it later. This will make grouping by e.g. "rapa" easier down the road.

def extract_treatment(group):
    """Return the bare treatment name for a group label.

    The treatment name is the text before the first underscore, e.g.
    'Rapa_hi_cycle' -> 'Rapa'. Labels without an underscore (e.g. 'ACA',
    '4-OH-PBN') are returned unchanged.

    Parameters
    ----------
    group : str or any
        A single value taken from the 'group' column.

    Returns
    -------
    str or the input value
        The text before the first '_' for strings. Non-string values (such as
        the NaN that pandas uses for an empty cell, or None) are returned
        unchanged instead of raising a TypeError.
    """
    # An empty cell arrives as a float NaN; the `in` / `split` operations only work on strings
    if not isinstance(group, str):
        return group
    # split() already returns the whole label when there is no underscore
    return group.split('_', 1)[0]

# Add the bare drug name as 'treatment'; the original 'group' label is kept as-is
df['treatment'] = df['group'].apply(extract_treatment)

# NOTE: THIS DELETES DATA FOR SIMPLIFICATION TO QUICKLY MAKE AN APP. WE WILL WANT THIS DATA BACK LATER!
# Rows of the groups listed below are removed so that only continuously applied
# treatments remain; a later app is meant to bring these special cases back.
special_case_groups = ['MetRapa', 'Rapa_hi_cycle', 'Rapa_hi_start_stop']
is_special_case = df['group'].isin(special_case_groups)
df = df.drop(df.loc[is_special_case].index)

# age_initiation(mo): empty cells become 0, then the values are stored as whole numbers
df['age_initiation(mo)'] = df['age_initiation(mo)'].fillna(0).astype(int)

# Rx(ppm): store as float rounded to one decimal so every dose is formatted the same way
rx_as_float = df['Rx(ppm)'].astype(float)
df['Rx(ppm)'] = rx_as_float.round(1)

# age(days): whole days
df['age(days)'] = df['age(days)'].astype(int)

# dead: boolean flag
# NOTE(review): astype(bool) turns every non-zero / non-empty value, including NaN,
# into True. Presumably 'dead' only holds 0/1; confirm against the raw files.
df['dead'] = df['dead'].astype(bool)

# Quick manual check: list every distinct combination again, now with 'treatment' included
pd.set_option('display.max_rows', 1000)
check_columns = ['treatment', 'group', 'cohort', 'Rx(ppm)', 'age_initiation(mo)']
unique_combinations = df[check_columns].drop_duplicates()
unique_combinations = unique_combinations.sort_values('group')
print(unique_combinations.to_string(index=False))

treatment              group cohort  Rx(ppm)  age_initiation(mo)
    17aE2              17aE2  C2009      4.8                  10
    17aE2          17aE2_16m  C2016     14.4                  16
    17aE2          17aE2_20m  C2016     14.4                  20
    17aE2           17aE2_hi  C2011     14.0                  10
 4-OH-PBN           4-OH-PBN  C2004    315.0                   4
      ACA                ACA  C2012   1000.0                  16
      ACA                ACA  C2009   1000.0                   4
      ACA             ACA_hi  C2013   2500.0                   4
      ACA             ACA_lo  C2013    400.0                   4
      ACA            ACA_mid  C2013   1000.0                   4
      Asp                Asp  C2004     21.0                   4
      Asp            Asp_200  C2014    200.0                  11
      Asp             Asp_60  C2014     60.0                  11
     CAPE            CAPE_hi  C2005    300.0                   4
     CAPE            CAPE

In [74]:
# Here we finally have a concatenated and minimally processed data file.
# Let's output a new file called ITP_2004-2016_concat.csv.
# We'll use that as the starting file in other notebooks to make things like Kaplan-Meier curves.
# I want to save this file because there are a few special cases, like group = Rapa_hi_start_stop that are going to be harder to deal with and I'd like to just get something
# up and running before dealing with that. So we'll have two notebooks most likely - one that removes those special cases, and another that includes them.

# Destination of the processed file. Edit this path to match your machine.
output_folder = 'C:\\Users\\ndsch\\Data\\ITP-Lifespan-Data\\ITP_processed_data\\'
output_file_name = 'ITP_2004-2016_concat.csv'
output_file_path = os.path.join(output_folder, output_file_name)

# Create the destination folder on first use: to_csv does not create missing
# directories and would otherwise stop with an OSError.
os.makedirs(output_folder, exist_ok=True)

# Save the DataFrame as a CSV file, without the pandas row index.
# NOTE(review): if the cell that drops 'MetRapa', 'Rapa_hi_cycle' and
# 'Rapa_hi_start_stop' has been run, those rows are NOT part of this file.
df.to_csv(output_file_path, index=False)