## Modeling of Exitron Usage in ALS Patients vs Healthy Individuals

In [6]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
import re

### Data

In [None]:
# Covariate table: read the metadata CSV and preview the first rows.
covs = pd.read_csv(
    filepath_or_buffer="/gpfs/commons/projects/ALS_Consortium_analysis/compbio/data/metadata_updated_sep4_2024_with_merged_mc.csv"
)
covs.head()


In [None]:
# Normalized exitron matrix as stored on disk. The .npy array carries no
# labels, so the frame previewed here has default integer row/column labels.
exitron_data_npy = np.load(file="normalized_data.npy")
exitron_data = pd.DataFrame(data=exitron_data_npy)
exitron_data.head()

In [None]:
# Parquet table whose index and columns are later used as the row/column
# labels of the normalized exitron matrix.
reference_labels = pd.read_parquet(path="filtered_exitron_data.parquet")
reference_labels.head()


### Covariate and Exitron Data Pre-processing 

In [3]:
# --- covariates ---------------------------------------------------------------
# Read the metadata, order rows by RIN then Platform (both descending), replace
# spaces with underscores in the site / prep labels, and add a binary 'Disease'
# column (1 where Subject.Group is 'ALS', 0 otherwise).
covs = (
    pd.read_csv("/gpfs/commons/projects/ALS_Consortium_analysis/compbio/data/metadata_updated_sep4_2024_with_merged_mc.csv")
    .sort_values(by=['RIN', 'Platform'], ascending=False)
    .assign(**{
        'Site.Specimen.Collected': lambda d: d['Site.Specimen.Collected'].str.replace(' ', '_'),
        'Prep': lambda d: d['Prep'].str.replace(' ', '_'),
        'Disease': lambda d: d['Subject.Group'].eq('ALS').astype(int),
    })
)

# --- exitron data -------------------------------------------------------------
# The .npy matrix has no labels; take row and column labels from the parquet
# file that it corresponds to.
exitron_data_npy = np.load("normalized_data.npy")
reference_labels = pd.read_parquet("filtered_exitron_data.parquet")
exitron_data = pd.DataFrame(
    exitron_data_npy,
    index=reference_labels.index,
    columns=reference_labels.columns,
)

# --- per-tissue model designs -------------------------------------------------
# One right-hand-side formula per tissue; each lists 'Disease' plus the
# covariates chosen for that tissue.
design_list = {
    # cerebellum / spinal cord
    "Cerebellum": "~Disease + Prep + MEAN_READ_LENGTH + Site.Specimen.Collected",
    "Spinal_Cord_Lumbar": "~Disease + Site.Specimen.Collected + Prep + PCT_CHIMERAS",
    "Spinal_Cord_Cervical": "~Disease + Prep + MEAN_READ_LENGTH + Site.Specimen.Collected",
    "Spinal_Cord_Thoracic": "~Disease + Prep + MEDIAN_5PRIME_BIAS + PCT_CHIMERAS",
    # cortex / hippocampus
    "Cortex_Frontal": "~Disease + Prep + MEAN_READ_LENGTH + PCT_CHIMERAS",
    "Hippocampus": "~Disease + MEDIAN_5PRIME_BIAS + MEDIAN_3PRIME_BIAS + PCT_CODING_BASES",
    "Cortex_Motor": "~Disease + Prep + MEAN_READ_LENGTH + PCT_CHIMERAS",
    "Cortex_Temporal": "~Disease + Prep + PCT_CHIMERAS + MEDIAN_5PRIME_BIAS",
}

### Logistic Regression Model

In [None]:
# Per-tissue, per-exitron logistic regression of ALS status (1 = ALS, 0 = other)
# on normalized exitron usage, adjusted for the tissue's covariates from
# `design_list`. Produces `results_df`: one row per fitted model holding the
# statistics of the `exitron_norm` coefficient plus the tissue and exitron labels.

# An exitron is modelled in a tissue only if MORE than this many samples have an
# observed, nonzero value.
MIN_NONZERO_SAMPLES = 10

all_results_list = []  # one dict per successfully fitted model
failed_fits = []       # (tissue, exitron, "ErrorType: message") for fits that raised

for tissue, design in design_list.items():
    print(f"Processing tissue: {tissue}")

    # filter data for the specified tissue; drop_duplicates keeps the first row
    # per subject in the current row order of `covs`
    tmp = covs[covs['Sample.Source'] == tissue]
    tmp = tmp.drop_duplicates(subset=['ExternalSubjectId'])

    # 'Disease' is the outcome here, so it is excluded from the predictors.
    # It is matched by exact term name so that a covariate merely containing
    # the text "Disease" is left intact.
    model_vars = [term.strip() for term in design.split('~')[1].split('+')]
    covariates = [term for term in model_vars if term and term != 'Disease']

    tmp = tmp[['RNA', 'Subject.Group'] + covariates].copy()
    tmp['Subject.Group'] = (tmp['Subject.Group'] == 'ALS').astype(int)

    # Column names and formula terms use '_' in place of '.' so that every name
    # is a plain identifier inside the formula. Both are loop-invariant, so they
    # are built once per tissue rather than once per exitron.
    clean_tmp = tmp.rename(columns=lambda c: c.replace('.', '_'))
    clean_formula = " + ".join(
        ['Subject_Group ~ exitron_norm'] + [c.replace('.', '_') for c in covariates]
    )

    n_failed_before = len(failed_fits)

    # loop through each exitron
    for exitron in exitron_data.index:
        # match exitron data by 'RNA'; samples with no matching column in
        # exitron_data become NaN
        clean_tmp['exitron_norm'] = clean_tmp['RNA'].map(exitron_data.loc[exitron])

        # NaN != 0 evaluates to True, so missing values must be excluded
        # explicitly or they would be counted as "nonzero".
        exitron_norm = clean_tmp['exitron_norm']
        n_nonzero = (exitron_norm.notna() & (exitron_norm != 0)).sum()
        if n_nonzero <= MIN_NONZERO_SAMPLES:
            continue

        try:
            # fit logistic regression model (disp=0 hides convergence output)
            fit = smf.logit(formula=clean_formula, data=clean_tmp).fit(disp=0)

            # extract results for the 'exitron_norm' coefficient
            result_row = fit.summary2().tables[1].loc['exitron_norm'].to_dict()
        except Exception as e:
            # Best-effort: keep going, but remember what failed and why instead
            # of discarding the error silently.
            failed_fits.append((tissue, exitron, f"{type(e).__name__}: {e}"))
            continue

        # add tissue and exitron info and append to our results list
        result_row['tissue'] = tissue
        result_row['exitron'] = exitron
        all_results_list.append(result_row)

    n_failed = len(failed_fits) - n_failed_before
    if n_failed:
        print(f"  {n_failed} model(s) could not be fitted for {tissue}")

# Combine all results into a single DataFrame
if all_results_list:
    results_df = pd.DataFrame(all_results_list)
else:
    print("No results")
    # Always define results_df so later cells fail on "empty" rather than on a
    # NameError or on a stale frame left in the kernel by a previous run.
    results_df = pd.DataFrame(
        columns=['Coef.', 'Std.Err.', 'z', 'P>|z|', '[0.025', '0.975]', 'tissue', 'exitron']
    )

### Results

In [35]:
# Plain-text dump of the per-model results table built by the regression cell.
print(results_df)

         Coef.  Std.Err.         z     P>|z|     [0.025     0.975]  \
0    -0.723267  1.185449 -0.610121  0.541782  -3.046703   1.600170   
1    -0.290065  0.610802 -0.474893  0.634864  -1.487216   0.907085   
2     4.406686  5.193176  0.848553  0.396130  -5.771751  14.585123   
3    -4.033948  3.052461 -1.321540  0.186322 -10.016662   1.948766   
4    -0.650334  1.042348 -0.623913  0.532685  -2.693298   1.392630   
...        ...       ...       ...       ...        ...        ...   
7748  2.194986  4.142583  0.529859  0.596210  -5.924327  10.314299   
7749  1.202727  2.207991  0.544715  0.585949  -3.124856   5.530310   
7750  6.581783  5.250245  1.253614  0.209982  -3.708509  16.872075   
7751 -2.165554  2.270092 -0.953950  0.340109  -6.614853   2.283744   
7752 -6.093024  3.848505 -1.583218  0.113372 -13.635956   1.449907   

               tissue                      exitron  
0          Cerebellum  chr10:119042185:119042245:-  
1          Cerebellum  chr10:133285213:133285455:-  


In [5]:
# Nominally significant models: raw (uncorrected) p-value of the exitron_norm
# coefficient below 0.05. The former mid-cell `significant_results.head()` was
# removed: only a cell's last expression is displayed, so it showed nothing.
is_nominal = results_df['P>|z|'] < 0.05
significant_results = results_df[is_nominal]
print(f"Significant results: {len(significant_results)}")

Significant results: 853


### Multiple-Test Correction on p-Values

In [18]:
# Benjamini-Hochberg FDR correction of the exitron_norm p-values, applied
# separately within each tissue. Rows with a missing p-value are kept but get
# no adjusted value.
unique_tissues = results_df['tissue'].unique()
corrected_results = []

for tissue in unique_tissues:
    in_tissue = results_df['tissue'] == tissue
    res_tissue = results_df.loc[in_tissue].copy()

    # only non-missing p-values enter the correction
    p_values = res_tissue['P>|z|'].dropna()

    # multipletests returns (reject, adjusted p-values, ...); only the adjusted
    # p-values are used here
    p_adj_fdr = multipletests(p_values, alpha=0.05, method='fdr_bh')[1]
    res_tissue.loc[p_values.index, 'p_fdr'] = p_adj_fdr

    corrected_results.append(res_tissue)

final_results = pd.concat(corrected_results)

# keep models whose FDR-adjusted p-value is below 0.05
passes_fdr = final_results['p_fdr'] < 0.05
significant_hits = final_results[passes_fdr]
print("Significant hits:")
print(significant_hits)

Significant hits:
          Coef.  Std.Err.         z     P>|z|     [0.025     0.975]  \
858    1.760623  0.490647  3.588368  0.000333   0.798972   2.722274   
869   12.065610  3.559049  3.390122  0.000699   5.090001  19.041218   
876    7.919233  2.007813  3.944208  0.000080   3.983991  11.854474   
977   18.419595  5.046529  3.649953  0.000262   8.528580  28.310609   
1129  10.297866  2.385760  4.316388  0.000016   5.621862  14.973871   
...         ...       ...       ...       ...        ...        ...   
5301  -1.241073  0.331151 -3.747755  0.000178  -1.890117  -0.592029   
5310  -4.926266  1.556356 -3.165256  0.001549  -7.976668  -1.875863   
5316  -6.948522  2.199942 -3.158503  0.001586 -11.260328  -2.636715   
6275  -1.981472  0.476112 -4.161776  0.000032  -2.914634  -1.048309   
6805   2.635913  0.662019  3.981628  0.000068   1.338380   3.933447   

                  tissue                      exitron     p_fdr  
858   Spinal_Cord_Lumbar        chr11:244075:244160:+  0.028683