In [80]:
import numpy as np
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sns
import anndata as ad
import PyPDF2
import matplotlib.pyplot as plt
import statistics

# Scanpy session setup: log level, version banner, default figure styling.
sc.settings.verbosity = 3  # scanpy verbosity scale: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()  # prints the package versions shown in this cell's output
sc.settings.set_figure_params(dpi=80, facecolor='white')

scanpy==1.8.2 anndata==0.7.8 umap==0.5.2 numpy==1.20.1 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.9.8 pynndescent==0.5.5


In [81]:
#TMS Adata Files
# Location of the processed .h5ad file.
# NOTE(review): absolute, user-specific path - the notebook only runs where this exists.
adata_processed = "/mnt/ibm_sm/home/lea.mcgeever/ms/notebooks/adata_files/adata_processed.h5ad"
# Full processed dataset; every age/tissue subset below is sliced from `adata`.
adata = sc.read_h5ad(adata_processed)

In [82]:
# Gene symbols carrying the "mt-" prefix, grouped by symbol family.
mt_genes = [
    # mt-Atp* / mt-Co* / mt-Cytb
    "mt-Atp6", "mt-Atp8", "mt-Co1", "mt-Co2", "mt-Co3", "mt-Cytb",
    # mt-Nd*
    "mt-Nd1", "mt-Nd2", "mt-Nd3", "mt-Nd4", "mt-Nd4l", "mt-Nd5", "mt-Nd6",
    # mt-Rnr*
    "mt-Rnr1", "mt-Rnr2",
    # mt-T* ("mt-Tl2" is excluded; it stays commented out)
    "mt-Ta", "mt-Te", "mt-Tf", "mt-Ti", "mt-Tl1",
#     "mt-Tl2",
    "mt-Tm", "mt-Tp", "mt-Tq", "mt-Tt", "mt-Tv",
]

In [83]:
# Tissue labels as they appear in obs['tissue']. The order matters: the per-age
# tissue lists further down follow the same order.
tissues = [
    'Aorta', 'BAT', 'Bladder', 'Brain_Myeloid', 'Brain_Non-Myeloid',
    'Diaphragm', 'GAT', 'Heart', 'Kidney', 'Large_Intestine',
    'Limb_Muscle', 'Liver', 'Lung', 'MAT', 'Mammary_Gland',
    'Marrow', 'Pancreas', 'SCAT', 'Skin', 'Spleen',
    'Thymus', 'Tongue', 'Trachea',
]

In [84]:
def adata_subset_by_age(adata_ob, age_m):
    '''Select the observations of ``adata_ob`` whose ``obs['age']`` equals ``age_m``.

    ``age_m`` is an age label string such as '3m', '18m', '21m' or '24m'.
    Returns whatever boolean-mask indexing on ``adata_ob`` yields.
    '''
    age_mask = adata_ob.obs['age'] == age_m
    return adata_ob[age_mask]

def adata_subset_by_tissue(adata_age_subset, tissue):
    '''Select the observations whose ``obs['tissue']`` equals ``tissue``.

    ``adata_age_subset`` is normally an object already restricted to one age;
    ``tissue`` is the tissue label string. Returns whatever boolean-mask
    indexing on ``adata_age_subset`` yields.
    '''
    tissue_mask = adata_age_subset.obs['tissue'] == tissue
    return adata_age_subset[tissue_mask]

**Subset by age and tissue**

In [85]:
# One subset of the full dataset per age label.
adata_3m, adata_18m, adata_21m, adata_24m = (
    adata_subset_by_age(adata, age_label)
    for age_label in ('3m', '18m', '21m', '24m')
)

In [86]:
'''Subset the 3m cells by tissue: one variable per tissue label.

Each variable holds the 3m observations whose obs['tissue'] matches the label.
Later cells refer to these variables by name.
'''
adata_3m_aorta = adata_subset_by_tissue(adata_3m,'Aorta')
adata_3m_bat = adata_subset_by_tissue(adata_3m,'BAT')
adata_3m_bladder = adata_subset_by_tissue(adata_3m,'Bladder')
adata_3m_brain_myeloid = adata_subset_by_tissue(adata_3m,'Brain_Myeloid')
adata_3m_brain_non_myeloid = adata_subset_by_tissue(adata_3m,'Brain_Non-Myeloid')
adata_3m_diaphragm = adata_subset_by_tissue(adata_3m,'Diaphragm')
adata_3m_GAT = adata_subset_by_tissue(adata_3m,'GAT')
adata_3m_heart = adata_subset_by_tissue(adata_3m,'Heart')
adata_3m_kidney = adata_subset_by_tissue(adata_3m,'Kidney')
adata_3m_large_intestine = adata_subset_by_tissue(adata_3m,'Large_Intestine')
adata_3m_limb_muscle = adata_subset_by_tissue(adata_3m,'Limb_Muscle')
adata_3m_liver = adata_subset_by_tissue(adata_3m,'Liver')
adata_3m_lung = adata_subset_by_tissue(adata_3m,'Lung')
adata_3m_mat = adata_subset_by_tissue(adata_3m,'MAT')
adata_3m_mammary_gland = adata_subset_by_tissue(adata_3m,'Mammary_Gland')
adata_3m_marrow = adata_subset_by_tissue(adata_3m,'Marrow')
adata_3m_pancreas = adata_subset_by_tissue(adata_3m,'Pancreas')
adata_3m_scat = adata_subset_by_tissue(adata_3m,'SCAT')
adata_3m_skin = adata_subset_by_tissue(adata_3m,'Skin')
adata_3m_spleen = adata_subset_by_tissue(adata_3m,'Spleen')
adata_3m_thymus = adata_subset_by_tissue(adata_3m,'Thymus')
adata_3m_tongue = adata_subset_by_tissue(adata_3m,'Tongue')
# NOTE: the variable is spelled `trachae`; the tissue label itself is 'Trachea'.
adata_3m_trachae = adata_subset_by_tissue(adata_3m,'Trachea')

In [87]:
'''Subset the 18m cells by tissue: one variable per tissue label.

Each variable holds the 18m observations whose obs['tissue'] matches the label.
Later cells refer to these variables by name.
'''
adata_18m_aorta = adata_subset_by_tissue(adata_18m,'Aorta')
adata_18m_bat = adata_subset_by_tissue(adata_18m,'BAT')
adata_18m_bladder = adata_subset_by_tissue(adata_18m,'Bladder')
adata_18m_brain_myeloid = adata_subset_by_tissue(adata_18m,'Brain_Myeloid')
adata_18m_brain_non_myeloid = adata_subset_by_tissue(adata_18m,'Brain_Non-Myeloid')
adata_18m_diaphragm = adata_subset_by_tissue(adata_18m,'Diaphragm')
adata_18m_GAT = adata_subset_by_tissue(adata_18m,'GAT')
adata_18m_heart = adata_subset_by_tissue(adata_18m,'Heart')
adata_18m_kidney = adata_subset_by_tissue(adata_18m,'Kidney')
adata_18m_large_intestine = adata_subset_by_tissue(adata_18m,'Large_Intestine')
adata_18m_limb_muscle = adata_subset_by_tissue(adata_18m,'Limb_Muscle')
adata_18m_liver = adata_subset_by_tissue(adata_18m,'Liver')
adata_18m_lung = adata_subset_by_tissue(adata_18m,'Lung')
adata_18m_mat = adata_subset_by_tissue(adata_18m,'MAT')
adata_18m_mammary_gland = adata_subset_by_tissue(adata_18m,'Mammary_Gland')
adata_18m_marrow = adata_subset_by_tissue(adata_18m,'Marrow')
adata_18m_pancreas = adata_subset_by_tissue(adata_18m,'Pancreas')
adata_18m_scat = adata_subset_by_tissue(adata_18m,'SCAT')
adata_18m_skin = adata_subset_by_tissue(adata_18m,'Skin')
adata_18m_spleen = adata_subset_by_tissue(adata_18m,'Spleen')
adata_18m_thymus = adata_subset_by_tissue(adata_18m,'Thymus')
adata_18m_tongue = adata_subset_by_tissue(adata_18m,'Tongue')
# NOTE: the variable is spelled `trachae`; the tissue label itself is 'Trachea'.
adata_18m_trachae = adata_subset_by_tissue(adata_18m,'Trachea')

In [88]:
'''Subset the 21m cells by tissue: one variable per tissue label.

All 23 tissue labels are subset here, not only the mammary gland. Each variable
holds the 21m observations whose obs['tissue'] matches the label; some of these
may be empty (the summary functions below handle subsets with no cells).
'''
adata_21m_aorta = adata_subset_by_tissue(adata_21m,'Aorta')
adata_21m_bat = adata_subset_by_tissue(adata_21m,'BAT')
adata_21m_bladder = adata_subset_by_tissue(adata_21m,'Bladder')
adata_21m_brain_myeloid = adata_subset_by_tissue(adata_21m,'Brain_Myeloid')
adata_21m_brain_non_myeloid = adata_subset_by_tissue(adata_21m,'Brain_Non-Myeloid')
adata_21m_diaphragm = adata_subset_by_tissue(adata_21m,'Diaphragm')
adata_21m_GAT = adata_subset_by_tissue(adata_21m,'GAT')
adata_21m_heart = adata_subset_by_tissue(adata_21m,'Heart')
adata_21m_kidney = adata_subset_by_tissue(adata_21m,'Kidney')
adata_21m_large_intestine = adata_subset_by_tissue(adata_21m,'Large_Intestine')
adata_21m_limb_muscle = adata_subset_by_tissue(adata_21m,'Limb_Muscle')
adata_21m_liver = adata_subset_by_tissue(adata_21m,'Liver')
adata_21m_lung = adata_subset_by_tissue(adata_21m,'Lung')
adata_21m_mat = adata_subset_by_tissue(adata_21m,'MAT')
adata_21m_mammary_gland = adata_subset_by_tissue(adata_21m,'Mammary_Gland')
adata_21m_marrow = adata_subset_by_tissue(adata_21m,'Marrow')
adata_21m_pancreas = adata_subset_by_tissue(adata_21m,'Pancreas')
adata_21m_scat = adata_subset_by_tissue(adata_21m,'SCAT')
adata_21m_skin = adata_subset_by_tissue(adata_21m,'Skin')
adata_21m_spleen = adata_subset_by_tissue(adata_21m,'Spleen')
adata_21m_thymus = adata_subset_by_tissue(adata_21m,'Thymus')
adata_21m_tongue = adata_subset_by_tissue(adata_21m,'Tongue')
# NOTE: the variable is spelled `trachae`; the tissue label itself is 'Trachea'.
adata_21m_trachae = adata_subset_by_tissue(adata_21m,'Trachea')

In [89]:
'''Subset the 24m cells by tissue: one variable per tissue label.

Each variable holds the 24m observations whose obs['tissue'] matches the label.
Later cells refer to these variables by name.
'''
adata_24m_aorta = adata_subset_by_tissue(adata_24m,'Aorta')
adata_24m_bat = adata_subset_by_tissue(adata_24m,'BAT')
adata_24m_bladder = adata_subset_by_tissue(adata_24m,'Bladder')
adata_24m_brain_myeloid = adata_subset_by_tissue(adata_24m,'Brain_Myeloid')
adata_24m_brain_non_myeloid = adata_subset_by_tissue(adata_24m,'Brain_Non-Myeloid')
adata_24m_diaphragm = adata_subset_by_tissue(adata_24m,'Diaphragm')
adata_24m_GAT = adata_subset_by_tissue(adata_24m,'GAT')
adata_24m_heart = adata_subset_by_tissue(adata_24m,'Heart')
adata_24m_kidney = adata_subset_by_tissue(adata_24m,'Kidney')
adata_24m_large_intestine = adata_subset_by_tissue(adata_24m,'Large_Intestine')
adata_24m_limb_muscle = adata_subset_by_tissue(adata_24m,'Limb_Muscle')
adata_24m_liver = adata_subset_by_tissue(adata_24m,'Liver')
adata_24m_lung = adata_subset_by_tissue(adata_24m,'Lung')
adata_24m_mat = adata_subset_by_tissue(adata_24m,'MAT')
adata_24m_mammary_gland = adata_subset_by_tissue(adata_24m,'Mammary_Gland')
adata_24m_marrow = adata_subset_by_tissue(adata_24m,'Marrow')
adata_24m_pancreas = adata_subset_by_tissue(adata_24m,'Pancreas')
adata_24m_scat = adata_subset_by_tissue(adata_24m,'SCAT')
adata_24m_skin = adata_subset_by_tissue(adata_24m,'Skin')
adata_24m_spleen = adata_subset_by_tissue(adata_24m,'Spleen')
adata_24m_thymus = adata_subset_by_tissue(adata_24m,'Thymus')
adata_24m_tongue = adata_subset_by_tissue(adata_24m,'Tongue')
# NOTE: the variable is spelled `trachae`; the tissue label itself is 'Trachea'.
adata_24m_trachae = adata_subset_by_tissue(adata_24m,'Trachea')

In [90]:
# Per-age lists of tissue subsets.
#
# Each list is generated from `tissues`, so entry i of every list is the subset for
# tissues[i]. tissue_mt_expression() pairs its input list with `tissues` by position,
# so generating the lists from `tissues` keeps that alignment automatic instead of
# depending on four hand-maintained 23-item lists staying in step with it.
# The elements select the same cells as the individually named adata_<age>_<tissue>
# variables defined in the cells above.
adata_3m_tissues = [adata_subset_by_tissue(adata_3m, tissue) for tissue in tissues]
adata_18m_tissues = [adata_subset_by_tissue(adata_18m, tissue) for tissue in tissues]
adata_21m_tissues = [adata_subset_by_tissue(adata_21m, tissue) for tissue in tissues]
adata_24m_tissues = [adata_subset_by_tissue(adata_24m, tissue) for tissue in tissues]


In [91]:
# Two-element list (3m kidney, 3m liver) - presumably for quick trial runs.
testy = [adata_3m_kidney, adata_3m_liver]

In [92]:
def tissue_mt_expression(adata_subset_list, tissue_names=None):
    '''Summarise mitochondrial counts (min, max, mean) for each tissue subset.

    Parameters
    ----------
    adata_subset_list : list
        One AnnData-like object per tissue; each must expose ``.obs`` with a
        ``total_counts_mt`` column.
    tissue_names : sequence of str, optional
        Label for each entry of ``adata_subset_list``, in the same order.
        Defaults to the module-level ``tissues`` list (the labels the function
        has always used), so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per input subset with columns 'Tissue', 'Min mt count',
        'Max mt count' and 'Mean mt count'. The mean is truncated to an int.
        A subset with no observations gets 0 in all three count columns.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of subsets.
    '''
    if tissue_names is None:
        tissue_names = tissues
    tissue_names = list(tissue_names)

    # Labels and subsets are paired purely by position, so a length mismatch
    # would otherwise surface as an opaque pandas construction error.
    if len(tissue_names) != len(adata_subset_list):
        raise ValueError(
            'tissue_mt_expression: got %d subsets but %d tissue names'
            % (len(adata_subset_list), len(tissue_names))
        )

    tissues_mean_mt_count = []
    tissues_min_mt_count = []
    tissues_max_mt_count = []

    for t in adata_subset_list:
        # Materialise once; min/max/mean all read the same Python list.
        mt_counts = list(t.obs.total_counts_mt)
        if mt_counts:
            tissues_min_mt_count.append(min(mt_counts))
            tissues_max_mt_count.append(max(mt_counts))
            tissues_mean_mt_count.append(int(statistics.mean(mt_counts)))
        else:
            # No cells for this tissue at this age: keep the row, report zeros.
            tissues_min_mt_count.append(0)
            tissues_max_mt_count.append(0)
            tissues_mean_mt_count.append(0)

    dict_tissues_mt_count = {'Tissue': tissue_names,
                             'Min mt count': tissues_min_mt_count,
                             'Max mt count': tissues_max_mt_count,
                             'Mean mt count': tissues_mean_mt_count}
    return pd.DataFrame(dict_tissues_mt_count)

In [93]:
#INPUT IS ALREADY SUBSET BY TISSUE & AGE
# Distinct cell_ontology_class labels across the whole dataset. The list comes from a
# set, so its order is not guaranteed to be the same from one session to the next.
cell_types = list(set(adata.obs['cell_ontology_class']))

def mt_mmm_celltypes(adatas, cell_type_names=None):
    '''Summarise mitochondrial counts per cell type for each tissue/age subset.

    Parameters
    ----------
    adatas : list
        AnnData-like objects, each already restricted to one tissue and one age.
        Each must support boolean-mask indexing and expose ``.obs`` with the
        columns 'cell_ontology_class', 'tissue', 'age', 'total_counts_mt' and
        'total_counts'.
    cell_type_names : iterable of str, optional
        Cell-type labels to report on. Defaults to the module-level
        ``cell_types`` list, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per (cell type, subset) combination that contains at least one
        observation, with columns 'Tissue', 'Cell Type', 'Min mt count',
        'Max mt count', 'Mean mt count' (truncated to int),
        'Normalized mt/total%' (sum of mt counts over sum of total counts, as a
        percentage rounded to 2 decimals) and 'Age'. Rows are ordered by cell
        type first, then by position in ``adatas``.
    '''
    if cell_type_names is None:
        cell_type_names = cell_types

    columns = ['Tissue', 'Cell Type', 'Min mt count', 'Max mt count',
               'Mean mt count', 'Normalized mt/total%', 'Age']
    records = []

    for cell_type in cell_type_names:
        for tissue_adata in adatas:
            # Now filtered by cell type on top of the existing age/tissue filter.
            group_obs = tissue_adata[tissue_adata.obs['cell_ontology_class'] == cell_type].obs
            mt_counts = list(group_obs.total_counts_mt)

            # Combinations with no cells are skipped. The previous "empty" branch
            # tested the global `adata` rather than this subset, so it never ran;
            # had it run, it would have filled only some of the columns.
            if not mt_counts:
                continue

            normalized = sum(mt_counts) / sum(list(group_obs.total_counts))

            records.append({
                # .iloc[0] = first row by position; plain [0] on a label-indexed
                # Series depends on pandas' deprecated positional fallback.
                'Tissue': str(group_obs['tissue'].iloc[0]),
                'Cell Type': str(group_obs['cell_ontology_class'].iloc[0]),
                'Min mt count': min(mt_counts),
                'Max mt count': max(mt_counts),
                'Mean mt count': int(statistics.mean(mt_counts)),
                'Normalized mt/total%': round(normalized * 100, 2),
                'Age': str(group_obs['age'].iloc[0]),
            })

    # Passing `columns` keeps the column layout even when no rows were produced.
    return pd.DataFrame(records, columns=columns)

In [94]:
'''Read the saved mt_mmm_celltypes output (all ages and tissues) with pandas'''

# NOTE(review): absolute, user-specific path. The loaded frame carries an extra
# 'Unnamed: 0' column (visible in the outputs below), so positional column indexes
# used later count that column as position 0.
df = pd.read_csv('/mnt/ibm_sm/home/lea.mcgeever/ms/notebooks/df_tissues_celltypes_ages.csv')

In [95]:
# Rows of the loaded summary table whose 'Age' label is '3m'.
df3m = df[df['Age'] == '3m']


In [96]:

# Per-tissue mt summary for the 3m subsets, then only the Skin row for comparison.
df_comparison_test = tissue_mt_expression(adata_3m_tissues)
df_comparison_test_skin = df_comparison_test[df_comparison_test["Tissue"] == "Skin"]
df_comparison_test_skin

Unnamed: 0,Tissue,Min mt count,Max mt count,Mean mt count
18,Skin,17.0,166106.0,15840


In [97]:
# Skin rows of the 3m per-cell-type summary table.
df3 = df3m[df3m["Tissue"] == "Skin"]
df3

Unnamed: 0.1,Unnamed: 0,Tissue,Cell Type,Min mt count,Max mt count,Mean mt count,Normalized mt/total%,Age
35,35,Skin,keratinocyte stem cell,4101.0,30562.0,15681,1.59,3m
73,73,Skin,T cell,5947.0,31668.0,18997,2.66,3m
121,121,Skin,macrophage,3423.0,23924.0,10751,1.52,3m
274,274,Skin,epidermal cell,393.0,60934.0,16473,1.98,3m
341,341,Skin,basal cell of epidermis,17.0,166106.0,15384,2.11,3m
445,445,Skin,bulge keratinocyte,208.0,89225.0,15950,2.61,3m


**Verifying mt_mmm_celltypes function**

Spot check: values recomputed directly from the AnnData subset are compared against the matching row of the `mt_mmm_celltypes` DataFrame output.

Tissue: Skin 	

cell_ontology_class: keratinocyte stem cell 	

In [119]:
'''Adata subset of 3m keratinocyte stem cell from skin tissue.

Used by the validation cells that follow as the reference to recompute each
summary value from.
'''
a3mskin_kera= adata_3m_skin[adata_3m_skin.obs['cell_ontology_class'] == "keratinocyte stem cell"]

In [178]:
'''Cell Type'''
# Column 2 of df3 is 'Cell Type'. .iloc[0] takes the first observation by position;
# plain [0] on a non-integer-indexed Series relies on pandas' deprecated positional fallback.
df3.iloc[[0],2].item() == a3mskin_kera.obs.cell_ontology_class.iloc[0]

True

In [179]:
# Min mt count: smallest per-cell total_counts_mt vs. column 3 of the stored row
min(list(a3mskin_kera.obs.total_counts_mt)) == df3.iloc[[0], 3].item()

True

In [180]:
# Max mt count: largest per-cell total_counts_mt vs. column 4 of the stored row
max(list(a3mskin_kera.obs.total_counts_mt)) == df3.iloc[[0], 4].item()

True

In [181]:
# Mean mt count: int-truncated mean of total_counts_mt vs. column 5 of the stored row
int(statistics.mean(a3mskin_kera.obs.total_counts_mt.tolist())) == df3.iloc[[0], 5].item()


True

In [177]:
# Normalized mt/total%: summed mt counts over summed total counts, as a percentage
# rounded to 2 decimals, vs. column 6 of the stored row
num = sum(a3mskin_kera.obs.total_counts_mt.tolist()) / sum(a3mskin_kera.obs.total_counts.tolist())
round(num * 100, 2) == df3.iloc[[0], 6].item()

True

In [174]:
#df3.iloc[[0],7].item() == a3mskin_kera.obs.age[0] #converts to string and then compare to adata string

True

In [165]:
# Age check, Series form: compare the one-row Series to the first observation's age,
# then pull out the single boolean. .iloc[0] replaces the deprecated positional [0].
(df3.iloc[[0],7] == a3mskin_kera.obs.age.iloc[0]).item()

True

In [182]:
'''Age'''
# Column 7 of df3 is 'Age'. .iloc[0] takes the first observation by position;
# plain [0] on a non-integer-indexed Series relies on pandas' deprecated positional fallback.
(df3.iloc[[0],7]).item() == a3mskin_kera.obs.age.iloc[0]

True

In [187]:
'''Comparing pandas dataframe values to adata values for validation'''

#Adata subset of 3m keratinocyte stem cell from skin tissue
a3mskin_kera= adata_3m_skin[adata_3m_skin.obs['cell_ontology_class'] == "keratinocyte stem cell"]

# First row of df3 is the stored summary being validated. Columns are read by label,
# so the stray 'Unnamed: 0' column cannot shift them.
stored_row = df3.iloc[0]

mt_counts = list(a3mskin_kera.obs.total_counts_mt)
#Normalized mt/total%
num = sum(mt_counts) / sum(list(a3mskin_kera.obs.total_counts))

# column label -> (value recomputed from the AnnData subset, value stored in df3)
validation_checks = {
    'Cell Type': (a3mskin_kera.obs.cell_ontology_class.iloc[0], stored_row['Cell Type']),
    'Min mt count': (min(mt_counts), stored_row['Min mt count']),
    'Max mt count': (max(mt_counts), stored_row['Max mt count']),
    'Mean mt count': (int(statistics.mean(mt_counts)), stored_row['Mean mt count']),
    'Normalized mt/total%': (round(num * 100, 2), stored_row['Normalized mt/total%']),
    'Age': (a3mskin_kera.obs.age.iloc[0], stored_row['Age']),
}

# Fail loudly and name the fields that disagree; previously a failed comparison
# simply printed nothing.
mismatches = {
    label: pair for label, pair in validation_checks.items() if pair[0] != pair[1]
}
assert not mismatches, f'DataFrame/AnnData mismatch (recomputed, stored): {mismatches}'

print('You are not a fraud')


You are not a fraud
