In [1]:
import scdrs
import scanpy as sc
sc.set_figure_params(dpi=125)
import anndata as ad
from scipy import stats
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
from pathlib import Path
import gdown
import mygene

import data_processing_utils

warnings.filterwarnings("ignore")

In [2]:
PROJECT_DIR = Path("/work/magroup/shared/Heimdall")
#################################

DATA_DIR = PROJECT_DIR / "data" / "polygenic_disease_risk"
RAW_DATA_DIR = DATA_DIR / "raw"
RAW_DATA_GENE_DIR = DATA_DIR / "raw" / "gene_expression_data"
RAW_DATA_SCDRS_DIR = DATA_DIR / "raw" / "score_files"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

os.makedirs(RAW_DATA_DIR, exist_ok=True)
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

print(f"Raw data will be saved to: {RAW_DATA_DIR}")
print(f"Processed data will be saved to: {PROCESSED_DATA_DIR}")

Raw data will be saved to: /work/magroup/shared/Heimdall/data/polygenic_disease_risk/raw
Processed data will be saved to: /work/magroup/shared/Heimdall/data/polygenic_disease_risk/processed


# scRNAseq Data

The TMS FACS data were downloaded from the official release https://figshare.com/articles/dataset/Processed_files_to_use_with_scanpy_/8273102. 

Please refer to the scDRS paper (https://www.nature.com/articles/s41588-022-01167-z) for detailed information.

In [None]:
# ! wget -O {RAW_DATA_GENE_DIR}/tabula-muris-senis-facs-official-raw-obj.h5ad https://figshare.com/ndownloader/files/23939711

In [21]:
data = ad.read_h5ad(f"{RAW_DATA_GENE_DIR}/tabula-muris-senis-facs-official-raw-obj.h5ad") 

In [16]:
data

AnnData object with n_obs × n_vars = 110824 × 22966
    obs: 'FACS.selection', 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'sex', 'subtissue', 'tissue', 'n_genes', 'n_counts'
    var: 'n_cells'

In [5]:
data.X.shape

(110824, 22966)

In [6]:
data.var

Unnamed: 0_level_0,n_cells
index,Unnamed: 1_level_1
0610005C13Rik,3939
0610007C21Rik,61891
0610007L01Rik,36115
0610007N19Rik,19591
0610007P08Rik,12581
...,...
Zyg11a,276
Zyg11b,23165
Zyx,58021
Zzef1,26577


In [22]:
# Gene id mapping
symbol_to_ensembl_mapping = data_processing_utils.symbol_to_ensembl_from_ensembl(
    data_dir=PROJECT_DIR / "data", genes=data.var.index.tolist(), species="mouse")
data.uns["gene_mapping:symbol_to_ensembl"] = symbol_to_ensembl_mapping.mapping_full

data.var["gene_symbol"] = data.var.index
data.var["gene_ensembl"] = data.var["gene_symbol"].map(symbol_to_ensembl_mapping.mapping_combined.get)
data.var.index = data.var.index.map(symbol_to_ensembl_mapping.mapping_reduced)

data.var

Mapping data directory: /work/magroup/shared/Heimdall/data/gene_mapping/ensembl/mouse
Loading mapping from cache: /work/magroup/shared/Heimdall/data/gene_mapping/ensembl/mouse/symbol_to_ensembl.json
Successfully mapped 18,828 out of 22,966 genes (82.0%)


Unnamed: 0_level_0,n_cells,gene_symbol,gene_ensembl
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSMUSG00000109644,3939,0610005C13Rik,ENSMUSG00000109644
0610007C21Rik,61891,0610007C21Rik,
0610007L01Rik,36115,0610007L01Rik,
0610007N19Rik,19591,0610007N19Rik,
0610007P08Rik,12581,0610007P08Rik,
...,...,...,...
ENSMUSG00000034645,276,Zyg11a,ENSMUSG00000034645
ENSMUSG00000034636,23165,Zyg11b,ENSMUSG00000034636
ENSMUSG00000029860,58021,Zyx,ENSMUSG00000029860
ENSMUSG00000055670,26577,Zzef1,ENSMUSG00000055670


In [26]:
# Filter the obs DataFrame to exclude rows where gene_ensembl is NA
filtered_var = data.var[data.var['gene_ensembl'] != 'N/A']

# Subset the AnnData object to include only the variables with non-NA gene_ensembl
data = data[:, filtered_var.index].copy()

In [27]:
filtered_var

Unnamed: 0_level_0,n_cells,gene_symbol,gene_ensembl
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSMUSG00000109644,3939,0610005C13Rik,ENSMUSG00000109644
ENSMUSG00000043644,8404,0610009L18Rik,ENSMUSG00000043644
ENSMUSG00000020831,44798,0610010K14Rik,ENSMUSG00000020831
ENSMUSG00000058706,27058,0610030E20Rik,ENSMUSG00000058706
ENSMUSG00000099146,680,0610031O16Rik,ENSMUSG00000099146
...,...,...,...
ENSMUSG00000034645,276,Zyg11a,ENSMUSG00000034645
ENSMUSG00000034636,23165,Zyg11b,ENSMUSG00000034636
ENSMUSG00000029860,58021,Zyx,ENSMUSG00000029860
ENSMUSG00000055670,26577,Zzef1,ENSMUSG00000055670


In [28]:
data

AnnData object with n_obs × n_vars = 110824 × 18828
    obs: 'FACS.selection', 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'sex', 'subtissue', 'tissue', 'n_genes', 'n_counts'
    var: 'n_cells', 'gene_symbol', 'gene_ensembl'
    uns: 'gene_mapping:symbol_to_ensembl'

In [29]:
data.var

Unnamed: 0_level_0,n_cells,gene_symbol,gene_ensembl
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSMUSG00000109644,3939,0610005C13Rik,ENSMUSG00000109644
ENSMUSG00000043644,8404,0610009L18Rik,ENSMUSG00000043644
ENSMUSG00000020831,44798,0610010K14Rik,ENSMUSG00000020831
ENSMUSG00000058706,27058,0610030E20Rik,ENSMUSG00000058706
ENSMUSG00000099146,680,0610031O16Rik,ENSMUSG00000099146
...,...,...,...
ENSMUSG00000034645,276,Zyg11a,ENSMUSG00000034645
ENSMUSG00000034636,23165,Zyg11b,ENSMUSG00000034636
ENSMUSG00000029860,58021,Zyx,ENSMUSG00000029860
ENSMUSG00000055670,26577,Zzef1,ENSMUSG00000055670


# scDRS Files

Source: https://figshare.com/articles/dataset/scDRS_data_release_092121_score_file_tmsfacs/16664077

In [None]:
# ! wget -O {RAW_DATA_SCDRS_DIR}/scDRS_data_release_030122.zip https://figshare.com/ndownloader/articles/16664077/versions/3
# ! unzip {RAW_DATA_SCDRS_DIR}/scDRS_data_release_030122.zip '*.score.gz'
# ! for file in {RAW_DATA_DIR}/gene_expression_data/*.score.gz; do gunzip "$file"; done

In [7]:
df_Alzheimers = pd.read_csv(RAW_DATA_SCDRS_DIR / "PASS_Alzheimers_Jansen2019.score", delimiter='\t') 

In [8]:
df_Alzheimers.head()

Unnamed: 0,index,raw_score,norm_score,mc_pval,pval,nlog10_pval,zscore
0,A10_B000497_B009023_S10.mm10-plus-0-0,0.08755,1.502652,0.074925,0.072984,1.136771,1.453921
1,A10_B000756_B007446_S10.mm10-plus-0-0,0.12709,2.140916,0.027972,0.023351,1.631691,1.98899
2,A10_B000802_B009022_S10.mm10-plus-0-0,0.077405,0.166168,0.433566,0.413432,0.383596,0.218726
3,A10_B000927_B007456_S10.mm10-plus-0-0,0.037681,0.013627,0.46953,0.473454,0.324722,0.06659
4,A10_B001361_B007505_S10.mm10-plus-0-0,0.075004,-0.152361,0.537463,0.540573,0.267146,-0.101877


In [9]:
df_Alzheimers.describe()

Unnamed: 0,raw_score,norm_score,mc_pval,pval,nlog10_pval,zscore
count,110824.0,110824.0,110824.0,110824.0,110824.0,110824.0
mean,0.071294,1.397959e-10,0.502054,0.501389,0.445708,-0.005395
std,0.015375,1.051325,0.297618,0.297633,0.459733,1.051238
min,-0.051889,-3.849838,0.000999,3e-06,1e-06,-4.541164
25%,0.062038,-0.7393628,0.23976,0.237823,0.116211,-0.72321
50%,0.071476,-0.05998576,0.503497,0.503078,0.298365,-0.007716
75%,0.08077,0.6802779,0.765235,0.765225,0.623746,0.713323
max,0.185757,6.019493,1.0,0.999997,5.582236,4.555202


# Process 

In [30]:
score_list = []

# Iterate over all files in the directory
for file_name in os.listdir(RAW_DATA_SCDRS_DIR):
    if file_name.endswith('.score'):  
        file_path = os.path.join(RAW_DATA_SCDRS_DIR, file_name)
        score_df = pd.read_csv(file_path, sep='\t') 
        # Remove the '.score' part from the file name
        column_name = file_name.replace('.score', '')
        # Extract the 'norm_score' column and rename it with the cleaned file name
        norm_score_column = score_df['norm_score'].rename(column_name)
        score_list.append(norm_score_column)


In [41]:
# Combine all norm_score columns into a single DataFrame
all_scores = pd.concat(score_list, axis=1)
print(all_scores.shape)
print(all_scores.head())

(110824, 26)
   PASS_Rheumatoid_Arthritis  UKB_460K.disease_HYPERTENSION_DIAGNOSED  \
0                  -0.017168                                -1.214287   
1                  -0.283116                                -0.098507   
2                  -1.908300                                -2.021876   
3                  -2.886779                                 0.702753   
4                  -0.321551                                -0.084453   

   UKB_460K.disease_RESPIRATORY_ENT  PASS_Schizophrenia_Pardinas2018  \
0                         -0.148465                        -1.369314   
1                         -0.950827                         2.799682   
2                          0.233704                         1.438919   
3                         -0.982751                        -0.860558   
4                         -0.567731                        -0.061623   

   PASS_CD_deLange2017  PASS_AtrialFibrillation_Nielsen2018  \
0             0.051024                            -0

In [33]:
all_scores.index = data.obs.index
for column in all_scores.columns:
    data.obs[column] = all_scores[column]

In [34]:
data

AnnData object with n_obs × n_vars = 110824 × 18828
    obs: 'FACS.selection', 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'sex', 'subtissue', 'tissue', 'n_genes', 'n_counts', 'PASS_Rheumatoid_Arthritis', 'UKB_460K.disease_HYPERTENSION_DIAGNOSED', 'UKB_460K.disease_RESPIRATORY_ENT', 'PASS_Schizophrenia_Pardinas2018', 'PASS_CD_deLange2017', 'PASS_AtrialFibrillation_Nielsen2018', 'PASS_IBD_deLange2017', 'PASS_Alzheimers_Jansen2019', 'PASS_UC_deLange2017', 'PASS_Celiac', 'PASS_Coronary_Artery_Disease', 'UKB_460K.disease_ASTHMA_DIAGNOSED', 'UKB_460K.disease_HYPOTHYROIDISM_SELF_REP', 'UKB_460K.disease_ALLERGY_ECZEMA_DIAGNOSED', 'UKB_460K.disease_AID_ALL', 'PASS_Type_1_Diabetes', 'UKB_460K.disease_CARDIOVASCULAR', 'PASS_Primary_biliary_cirrhosis', 'PASS_Multiple_sclerosis', 'PASS_Lupus', 'PASS_Type_2_Diabetes', 'PASS_ADHD_Demontis2018', 'PASS_BIP_Mullins2021', 'PASS_Insomnia_Jansen2019', 'UKB_460K.cancer_BREAST', 'PASS_MDD_Howard2019'
  

In [None]:
# # Remove unused obs
# columns_to_remove = [
#     'FACS.selection', 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id',
#     'free_annotation', 'method', 'mouse.id', 'sex', 'subtissue', 'tissue',
#     'n_genes', 'n_counts'
# ]

# # Remove the specified columns from obs
# tms_facs_sc_data.obs = tms_facs_sc_data.obs.drop(columns=columns_to_remove)

In [37]:
# change 
file_path = RAW_DATA_DIR / 'filtered_trait_info.csv'  # Adjust the path if necessary
trait_info = pd.read_csv(file_path)



identifier_to_name = pd.Series(trait_info['Trait Name'].values, index=trait_info['Trait_Identifier']).to_dict()


data.obs.rename(columns=identifier_to_name, inplace=True)




In [38]:
data

AnnData object with n_obs × n_vars = 110824 × 18828
    obs: 'FACS.selection', 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'sex', 'subtissue', 'tissue', 'n_genes', 'n_counts', 'Rheumatoid Arthritis', 'Hyperthension', 'Respiratory and Ear-nose-throat Diseases', 'Schizophrenia', "Crohn's Disease", 'Atrial Fibrillation', 'Inflammatory Bowel Disease', "Alzheimer's Disease", 'Ulcerative Colitis', 'Celiac Disease', 'Coronary Artery Disease', 'Asthma', 'Hypothyroidism', 'Eczema', 'Auto Immune Traits', 'Type 1 Diabetes', 'Cardiovascular Diseases', 'Primary Biliary Cirrhosis', 'Multiple Sclerosis', 'Systemic Lupus Erythematosus', 'Type 2 Diabetes', 'Attention Deficit Hyperactivity Disorder', 'Bipolar Disorder', 'Insomnia', 'Breast Cancer', 'Major Depressive Disorder'
    var: 'n_cells', 'gene_symbol', 'gene_ensembl'
    uns: 'gene_mapping:symbol_to_ensembl'

In [42]:
n_obs = data.n_obs

# Split ratios
n_train = int(0.8 * n_obs)
n_valid_test = int(0.1 * n_obs)

# Create an array of indices and shuffle them
indices = np.arange(n_obs)
np.random.shuffle(indices)

# Assign indices to each split
train_indices = indices[:n_train]
valid_indices = indices[n_train:n_train + n_valid_test]
test_indices = indices[n_train + n_valid_test:n_train + 2 * n_valid_test]

# Initialize a split column with 'train'
split = pd.Series('train', index=data.obs.index)

# Assign 'valid' and 'test' based on the calculated indices
split.iloc[valid_indices] = 'valid'
split.iloc[test_indices] = 'test'

# Add the 'split' column to data.obs
data.obs['split'] = split


In [43]:
data.obs['split'].value_counts()

split
train    88660
test     11082
valid    11082
Name: count, dtype: int64

In [44]:
data

AnnData object with n_obs × n_vars = 110824 × 18828
    obs: 'FACS.selection', 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'sex', 'subtissue', 'tissue', 'n_genes', 'n_counts', 'Rheumatoid Arthritis', 'Hyperthension', 'Respiratory and Ear-nose-throat Diseases', 'Schizophrenia', "Crohn's Disease", 'Atrial Fibrillation', 'Inflammatory Bowel Disease', "Alzheimer's Disease", 'Ulcerative Colitis', 'Celiac Disease', 'Coronary Artery Disease', 'Asthma', 'Hypothyroidism', 'Eczema', 'Auto Immune Traits', 'Type 1 Diabetes', 'Cardiovascular Diseases', 'Primary Biliary Cirrhosis', 'Multiple Sclerosis', 'Systemic Lupus Erythematosus', 'Type 2 Diabetes', 'Attention Deficit Hyperactivity Disorder', 'Bipolar Disorder', 'Insomnia', 'Breast Cancer', 'Major Depressive Disorder', 'split'
    var: 'n_cells', 'gene_symbol', 'gene_ensembl'
    uns: 'gene_mapping:symbol_to_ensembl'

In [45]:
file_path = PROCESSED_DATA_DIR / "scdrs.h5ad"

# Save the AnnData object to the specified path
data.write(file_path)

print(f"Data successfully saved to {file_path}")

Data successfully saved to /work/magroup/shared/Heimdall/data/polygenic_disease_risk/processed/scdrs.h5ad


In [46]:
data = ad.read_h5ad(file_path ) 
data

AnnData object with n_obs × n_vars = 110824 × 18828
    obs: 'FACS.selection', 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'sex', 'subtissue', 'tissue', 'n_genes', 'n_counts', 'Rheumatoid Arthritis', 'Hyperthension', 'Respiratory and Ear-nose-throat Diseases', 'Schizophrenia', "Crohn's Disease", 'Atrial Fibrillation', 'Inflammatory Bowel Disease', "Alzheimer's Disease", 'Ulcerative Colitis', 'Celiac Disease', 'Coronary Artery Disease', 'Asthma', 'Hypothyroidism', 'Eczema', 'Auto Immune Traits', 'Type 1 Diabetes', 'Cardiovascular Diseases', 'Primary Biliary Cirrhosis', 'Multiple Sclerosis', 'Systemic Lupus Erythematosus', 'Type 2 Diabetes', 'Attention Deficit Hyperactivity Disorder', 'Bipolar Disorder', 'Insomnia', 'Breast Cancer', 'Major Depressive Disorder', 'split'
    var: 'n_cells', 'gene_symbol', 'gene_ensembl'
    uns: 'gene_mapping:symbol_to_ensembl'