## Modeling Exitron Usage Effect on ALS Expression/Severity

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
import re

### Data Pre-processing 

In [26]:
# covariates data
covs = pd.read_csv("/gpfs/commons/projects/ALS_Consortium_analysis/compbio/data/metadata_updated_sep4_2024_with_merged_mc.csv")
covs = covs.sort_values(by=['RIN', 'Platform'], ascending=[False, False])
covs['Site.Specimen.Collected'] = covs['Site.Specimen.Collected'].str.replace(' ', '_')
covs['Prep'] = covs['Prep'].str.replace(' ', '_')
covs.columns = [col.replace('.', '_').replace(' ', '_') for col in covs.columns]

# exitron data
exitron_data_npy = np.load("normalized_data.npy")
exitron_data = pd.DataFrame(exitron_data_npy) 
# rename columns and rows corresponding with exitron_data/.parquet
reference_labels = pd.read_parquet("filtered_exitron_data.parquet")
exitron_data.columns = reference_labels.columns
exitron_data.index = reference_labels.index
exitron_data = exitron_data.T

# covariates for each tissue
design_list = {
    "Cerebellum": "~Disease + Prep + MEAN_READ_LENGTH + Site.Specimen.Collected",
    "Spinal_Cord_Lumbar": "~Disease + Site.Specimen.Collected + Prep + PCT_CHIMERAS",
    "Spinal_Cord_Cervical": "~Disease + Prep + MEAN_READ_LENGTH + Site.Specimen.Collected",
    "Spinal_Cord_Thoracic": "~Disease + Prep + MEDIAN_5PRIME_BIAS + PCT_CHIMERAS",
    "Cortex_Frontal": "~Disease + Prep + MEAN_READ_LENGTH + PCT_CHIMERAS",
    "Hippocampus": "~Disease + MEDIAN_5PRIME_BIAS + MEDIAN_3PRIME_BIAS + PCT_CODING_BASES",
    "Cortex_Motor": "~Disease + Prep + MEAN_READ_LENGTH + PCT_CHIMERAS",
    "Cortex_Temporal": "~Disease + Prep + PCT_CHIMERAS + MEDIAN_5PRIME_BIAS"
}

In [20]:
exitron_data.head()

title,chr10:119042185:119042215:-,chr10:119042185:119042245:-,chr10:119042185:119042275:-,chr10:119042190:119042373:-,chr10:119042215:119042638:-,chr10:119042220:119042310:-,chr10:119042250:119042310:-,chr10:119042275:119042638:-,chr10:119042280:119042310:-,chr10:133285213:133285455:-,...,chrX:76428739:76428883:+,chrX:76428739:76428919:+,chrX:76428847:76428919:+,chrX:76428883:76428919:+,chrX:76428883:76428955:+,chrX:76428919:76429027:+,chrX:76428955:76429063:+,chrX:77683451:77683538:-,chrX:93672617:93672659:-,chrX:93672659:93672701:-
sourceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CGND-HRA-00013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.257485,0.961039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.122112,0.0,0.0
CGND-HRA-00015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.97561,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CGND-HRA-00017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.071429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119298,0.1805,0.0
CGND-HRA-00019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.929293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186508,0.0,0.0
CGND-HRA-00020-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.904348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# filter for only ALS patients
als_covs = covs[covs['Subject.Group'] == 'ALS'].copy()

# create new binary and continuous outcome variables
als_covs['c9orf72_status'] = (als_covs['c9orf72'] == 'Yes').astype(int)
als_covs['onset_is_limb'] = (als_covs['Site.of.Motor.Onset'] == 'Limb').astype(int)
als_covs['ALS_duration'] = als_covs['Age.at.Death'] - als_covs['Age.at.Symptom.Onset']

als_covs.head()

Unnamed: 0.1,Unnamed: 0,RNA,Quote,ExternalSubjectId,NeuroBankID,Project,Site.Specimen.Collected,Sex,Ethnicity,Subject.Group,...,PCT_USABLE_BASES,MEDIAN_CV_COVERAGE,MEDIAN_5PRIME_BIAS,MEDIAN_3PRIME_BIAS,MEDIAN_5PRIME_TO_3PRIME_BIAS,PF_NOT_ALIGNED_BASES,c9orf72,c9orf72_status,onset_is_limb,ALS_duration
1693,CGND-HRA-02842,CGND-HRA-02842,CGND_14630,91-072-76,,ALS Consortium,Academic_Medical_Center,Female,Unknown,ALS,...,49.9382,0.296956,0.822366,0.942447,0.901721,140140575,No,0,0,2.0
1492,CGND-HRA-02522,CGND-HRA-02522,CGND_14628,NEUTH315CBQ,NEUTH315CBQ,Target ALS,Columbia_University_Medical_Center,Female,Not Hispanic/Latino,ALS,...,52.0311,0.293553,0.799647,0.948916,0.866283,198613177,No,0,1,
1318,CGND-HRA-02285,CGND-HRA-02285,CGND_14344,92-080-08,,ALS Consortium,Academic_Medical_Center,Female,Unknown,ALS,...,62.1858,0.308692,0.82689,0.944411,0.890305,117784929,No,0,0,2.0
1320,CGND-HRA-02287,CGND-HRA-02287,CGND_14344,92-081-09,,ALS Consortium,Academic_Medical_Center,Male,Unknown,ALS,...,59.4618,0.301863,0.823581,0.943307,0.884671,127136549,No,0,1,5.0
1512,CGND-HRA-02565,CGND-HRA-02565,CGND_14333,12-183-80,,ALS Consortium,Academic_Medical_Center,Male,Unknown,ALS,...,52.3515,0.293837,0.84369,0.944196,0.896569,131674287,No,0,0,1.0


### Models

In [16]:
# LOGISTIC REGRESSION

def runLogisticRegression(dependent_var, design_list, als_covs, exitron_data):
    all_results_list = []
    for tissue, design in design_list.items():
        print(f"Processing {tissue} for {dependent_var}")
        
        # filter for specified tissue
        tmp = als_covs[als_covs['Sample.Source'] == tissue].copy()
        tmp = tmp.drop_duplicates(subset=['ExternalSubjectId'])

        base_model_vars = [v.strip() for v in design.split('~')[1].split('+') if v.strip() and v.strip() != 'Disease']
        variable_cols = [col for col in base_model_vars if col in tmp.columns and tmp[col].nunique() > 1]
        
        if variable_cols:
            predictor_formula = " + ".join(variable_cols)
            final_model_formula = f"{dependent_var} ~ exitron_norm + {predictor_formula}"
        else:
            final_model_formula = f"{dependent_var} ~ exitron_norm"

        # Loop through each exitron 
        for exitron in exitron_data.index:
            exitron_values_for_mapping = exitron_data.loc[exitron]
            tmp['exitron_norm'] = tmp['RNA'].map(exitron_values_for_mapping)        

            if (tmp['exitron_norm'].fillna(0) != 0).sum() > 10:
                clean_tmp = tmp.rename(columns=lambda c: re.sub(r'[.]', '_', c))
                
                try:
                    fit = smf.logit(formula=final_model_formula, data=clean_tmp).fit(disp=0)
                    
                    res = fit.summary2().tables[1]
                    if 'exitron_norm' in res.index:
                        exitron_res = res.loc['exitron_norm']
                        result_row = pd.DataFrame([exitron_res])
                        result_row['tissue'] = tissue
                        result_row['exitron'] = exitron
                        all_results_list.append(result_row)
                except Exception as e:
                    # print(f"    Could not fit model for {exitron} in {tissue}. Error: {e}")
                    continue

    # results
    if all_results_list:
        results_df = pd.concat(all_results_list, ignore_index=True)
        return results_df
    

In [None]:
# LINEAR REGRESSION

# Update design_list to use the new clean column names
design_list = {
    "Cerebellum": "~ Disease + Prep + MEAN_READ_LENGTH + Site_Specimen_Collected",
    "Spinal_Cord_Lumbar": "~ Disease + Site_Specimen_Collected + Prep + PCT_CHIMERAS",
    "Spinal_Cord_Cervical": "~ Disease + Prep + MEAN_READ_LENGTH + Site_Specimen_Collected",
    "Spinal_Cord_Thoracic": "~ Disease + Prep + MEDIAN_5PRIME_BIAS + PCT_CHIMERAS",
    "Cortex_Frontal": "~ Disease + Prep + MEAN_READ_LENGTH + PCT_CHIMERAS",
    "Hippocampus": "~ Disease + MEDIAN_5PRIME_BIAS + MEDIAN_3PRIME_BIAS + PCT_CODING_BASES",
    "Cortex_Motor": "~ Disease + Prep + MEAN_READ_LENGTH + PCT_CHIMERAS",
    "Cortex_Temporal": "~ Disease + Prep + PCT_CHIMERAS + MEDIAN_5PRIME_BIAS"
}

# Remove 'Disease' from formulas for within-ALS analysis
design_list_als_only = {}
for tissue, formula in design_list.items():
    covariates = [p.strip() for p in formula.replace("~", "").split("+")]
    filtered_covariates = [p for p in covariates if p != 'Disease' and p]
    new_formula_rhs = "~ " + " + ".join(filtered_covariates)
    design_list_als_only[tissue] = new_formula_rhs


all_results = []
outcomes_to_predict = ["Age_at_Symptom_Onset", "Age_at_Death", "ALS_duration"]
all_exitrons = exitron_data.columns

print(f"Found {len(all_exitrons)} exitrons to analyze. Beginning main loop...")

for i, exitron_id in enumerate(all_exitrons):
    print(f"\n===== Processing Exitron {i+1}/{len(all_exitrons)}: {exitron_id} =====")

    # loop through each tissue defined in the design_list
    for tissue, base_formula_rhs in design_list_als_only.items():
        
        tmp = covs[(covs['Sample_Source'] == tissue) & (covs['Subject_Group'] == "ALS")].copy()
        tmp.drop_duplicates(subset=['ExternalSubjectId'], inplace=True)

        tmp.dropna(subset=["Age_at_Symptom_Onset", "Age_at_Death"], inplace=True)
        tmp['ALS_duration'] = tmp['Age_at_Death'] - tmp['Age_at_Symptom_Onset']

        # Inner loop for each outcome variable
        for outcome in outcomes_to_predict:
            
            required_covariates = re.findall(r'\b\w+\b', base_formula_rhs)
            all_model_cols = [outcome] + required_covariates
            cols_to_use = [col for col in all_model_cols if col in tmp.columns]
            model_df = tmp[cols_to_use].copy()
            
            # -- DYNAMIC: Add splice data for the CURRENT exitron from the loop --
            model_df['splice'] = exitron_data[exitron_id][model_df.index]
            model_df.dropna(inplace=True)

            if model_df.shape[0] < 15:
                continue
            
            formula_rhs = base_formula_rhs.replace("~", "").strip()
            formula = f"{outcome} ~ splice + {formula_rhs}"
            
            try:
                fit = smf.ols(formula=formula, data=model_df).fit()
                res_summary = fit.summary2().tables[1]
                splice_results = res_summary.loc['splice']

                # -- NEW: Add exitron_id to the results --
                result_row = {
                    "exitron_id": exitron_id,
                    "tissue": tissue,
                    "outcome_variable": outcome,
                    "coefficient": splice_results['Coef.'],
                    "p_value": splice_results['P>|t|'],
                    "std_err": splice_results['Std.Err.'],
                    "n_samples": int(fit.nobs)
                }
                all_results.append(result_row)

            except Exception:
                # Failing silently here to avoid cluttering the output.
                # Could add a print statement if needed for debugging.
                continue

# --- 3. Consolidate and Display All Results ---
print("\n\n--- All Analyses Complete ---")
if all_results:
    results_df = pd.DataFrame(all_results)
    print("Consolidated Results for all exitrons:")
    display(results_df)

    # Optional: Save the full results to a CSV file
    # results_df.to_csv("all_exitrons_analysis_results.csv", index=False)
    
    # Optional: Display the most significant findings across all analyses
    print("\nTop 20 most significant findings (lowest p-value):")
    display(results_df.sort_values(by="p_value").head(20))
else:
    print("No results were generated. Check your data and filters.")

In [32]:
results_df.head(20)

Unnamed: 0,exitron_id,tissue,outcome_variable,coefficient,p_value,std_err,n_samples
0,chr10:119042185:119042215:-,Cerebellum,Age_at_Symptom_Onset,11.39477,0.440892,14.7428,148
1,chr10:119042185:119042215:-,Cerebellum,Age_at_Death,12.73897,0.345705,13.46377,148
2,chr10:119042185:119042215:-,Cerebellum,ALS_duration,1.344208,0.770973,4.608649,148
3,chr10:119042185:119042215:-,Spinal_Cord_Lumbar,Age_at_Symptom_Onset,-13.07363,0.243997,11.19368,249
4,chr10:119042185:119042215:-,Spinal_Cord_Lumbar,Age_at_Death,-13.65573,0.200308,10.63334,249
5,chr10:119042185:119042215:-,Spinal_Cord_Lumbar,ALS_duration,-0.5820972,0.831212,2.727969,249
6,chr10:119042185:119042215:-,Spinal_Cord_Cervical,Age_at_Symptom_Onset,3.704846,0.734099,10.89501,266
7,chr10:119042185:119042215:-,Spinal_Cord_Cervical,Age_at_Death,6.535599,0.526966,10.31642,266
8,chr10:119042185:119042215:-,Spinal_Cord_Cervical,ALS_duration,2.830753,0.295544,2.700613,266
9,chr10:119042185:119042215:-,Spinal_Cord_Thoracic,Age_at_Symptom_Onset,-12.20939,0.168892,8.717988,46


### Multiple-Tests Correction

In [9]:
def multipleTestsCorrection(results_df):
    # multiple-tests correction
    corrected_results = []
    unique_tissues = results_df['tissue'].unique()

    # loop through each tissue
    for tissue in unique_tissues:
        res_tissue = results_df[results_df['tissue'] == tissue].copy()
        p_values = res_tissue['P>|z|'].dropna()
        
        # apply multiple-tests correction
        reject, p_adj_fdr, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')
        res_tissue.loc[p_values.index, 'p_fdr'] = p_adj_fdr
        corrected_results.append(res_tissue)

    final_results = pd.concat(corrected_results)

    # filter for significance: p_fdr < 0.05
    significant_hits = final_results[final_results['p_fdr'] < 0.05]
    print("Significant hits:")
    return significant_hits


### Analysis

In [None]:
# c9orf72 status (logistic regression)
c9orf72_results = runLogisticRegression('c9orf72_status')
print(c9orf72_results)
# c9orf72_results.to_csv('c9orf72_results.tsv', sep='\t', index=False)

In [None]:
# site of motor onset (logistic regression)
onsetLimb_results = runLogisticRegression('onset_is_limb')
print(onsetLimb_results)
# onsetLimb_results.to_csv('onsetLimb_results.tsv', sep='\t', index=False)