In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import scrublet as scr
import celltypist
from celltypist import models
import decoupler as dc
import matplotlib.pyplot as plt
import seaborn as sns
sc.set_figure_params(dpi=150)

# Import raw files

In [2]:
!ls /home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/

Adata_fromC_toR.h5ad
Part1_B_GSE131928_PMID31327527.h5ad
Part1_B_GSE131928_PMID31327527_part2.h5ad
Part1_C_PMID33782623_GSE163120.h5ad
Part1_D_PMID32385277_GSE131907.h5ad
Part1_E_ImportDataFromGliomaLocati.h5ad
Part1_F_ImportDataFromGliomaMetastasis.h5ad
Part1_G_ImportDataFromJoyce.h5ad
Part1_H_PMID30545854.h5ad
Part1_I_Winkler.h5ad
Part1_L_PMID35177622.h5ad
Part1_M_GSE174401_PMID34035069.h5ad
Part1_N_GSE147275_PMID34138753.h5ad
Part1_O_GSE182109_PMID35140215.h5ad
Part1_P_GSE173278PMID35303420.h5ad
Part1_Q_PRJNA579593_PMID31901251.h5ad
Part1_R_SCP503_PMID35122077.h5ad


In [3]:
adata_B = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_B_GSE131928_PMID31327527.h5ad")
adata_C = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_C_PMID33782623_GSE163120.h5ad")
adata_D = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_D_PMID32385277_GSE131907.h5ad")
adata_E = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_E_ImportDataFromGliomaLocati.h5ad")
adata_F = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_F_ImportDataFromGliomaMetastasis.h5ad")
adata_G = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_G_ImportDataFromJoyce.h5ad")
adata_H = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_H_PMID30545854.h5ad")
adata_I = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_I_Winkler.h5ad")
adata_L = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_L_PMID35177622.h5ad")
adata_M = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_M_GSE174401_PMID34035069.h5ad")
adata_N = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_N_GSE147275_PMID34138753.h5ad")
adata_O = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_O_GSE182109_PMID35140215.h5ad")
adata_P = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_P_GSE173278PMID35303420.h5ad")
adata_Q = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_Q_PRJNA579593_PMID31901251.h5ad")
adata_R = sc.read("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_R_SCP503_PMID35122077.h5ad")

# Update dataset B

In [4]:
adata_B.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'leiden', 'majority_voting', 'cell_type'],
      dtype='object')

In [5]:
adata_B.obs['Sorted']

MGH102-0      CD45
MGH102-1      CD45
MGH102-2      CD45
MGH102-3      CD45
MGH102-4      CD45
              ... 
MGH126-224    CD45
MGH126-225    CD45
MGH126-226    CD45
MGH126-227    CD45
MGH126-228    CD45
Name: Sorted, Length: 11786, dtype: category
Categories (1, object): ['CD45']

In [6]:
adata_B.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_B.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [7]:
adata_B.obs['Sample'].value_counts()

8     4424
10    2415
7     1822
11    1613
9     1283
12     229
Name: Sample, dtype: int64

In [8]:
adata_B.obs['Sample'] = adata_B.obs['Sample'].astype("str")

In [9]:
ptz_dict = {'7':"Patient1",
            '8':"Patient2",
            '9':"Patient3",
            '10':"Patient4",
            '11':"Patient5",
            '12':"Patient6"}

In [10]:
# Add cell type column based on annotation
adata_B.obs['Sample'] = [ptz_dict[clust] for clust in adata_B.obs['Sample']]

In [11]:
del adata_B.obs['batch']

# Update dataset C

In [12]:
adata_C.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [13]:
adata_C.obs['Sorted']

ND1-0       CD45
ND1-1       CD45
ND1-2       CD45
ND1-3       CD45
ND1-4       CD45
            ... 
ND5-1714    CD45
ND5-1715    CD45
ND5-1716    CD45
ND5-1717    CD45
ND5-1718    CD45
Name: Sorted, Length: 8096, dtype: category
Categories (1, object): ['CD45']

In [14]:
adata_C.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_C.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [15]:
adata_C.obs['Sample'].value_counts()

16    3572
14    1749
17    1712
13     714
15     349
Name: Sample, dtype: int64

In [16]:
adata_C.obs['Sample'] = adata_C.obs['Sample'].astype("str")

In [17]:
ptz_dict = {'13':"Patient7",
            '14':"Patient8",
            '15':"Patient9",
            '16':"Patient10",
            '17':"Patient11"}

In [18]:
# Add cell type column based on annotation
adata_C.obs['Sample'] = [ptz_dict[clust] for clust in adata_C.obs['Sample']]

In [19]:
del adata_C.obs['batch']

# Update dataset D

In [20]:
adata_D.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [21]:
adata_D.obs['Sorted'] = "WholeTissue"

In [22]:
adata_D.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_D.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [23]:
adata_D.obs['Sample'].value_counts()

Patient23    5637
Patient25    4759
Patient28    3245
Patient24    3122
Patient20    2936
Patient27    2365
Patient19    2274
Patient21    1879
Patient26    1292
Patient22    1056
Name: Sample, dtype: int64

In [24]:
adata_D.obs['Sample'] = adata_D.obs['Sample'].astype("str")

In [25]:
ptz_dict = {'Patient19':"Patient12",
            'Patient20':"Patient13",
            'Patient21':"Patient14",
            'Patient22':"Patient15",
            'Patient23':"Patient16",
            'Patient24':"Patient17",
            'Patient25':"Patient18",
            'Patient26':"Patient19",
            'Patient27':"Patient20",
            'Patient28':"Patient21"}

In [26]:
# Add cell type column based on annotation
adata_D.obs['Sample'] = [ptz_dict[clust] for clust in adata_D.obs['Sample']]

In [27]:
del adata_D.obs['batch']

# Update dataset E

In [28]:
adata_E.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [29]:
adata_E.obs['Sorted']

GLIO1_T-0       CD45
GLIO1_T-1       CD45
GLIO1_T-2       CD45
GLIO1_T-3       CD45
GLIO1_T-4       CD45
                ... 
GLIO7_T-4523    CD45
GLIO7_T-4524    CD45
GLIO7_T-4525    CD45
GLIO7_T-4527    CD45
GLIO7_T-4528    CD45
Name: Sorted, Length: 31706, dtype: category
Categories (1, object): ['CD45']

In [30]:
adata_E.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_E.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [31]:
adata_E.obs['Sample'].value_counts()

Patient32    8421
Patient30    6053
Patient31    4728
Patient35    4300
Patient33    4123
Patient34    2788
Patient29    1293
Name: Sample, dtype: int64

In [32]:
adata_E.obs['Sample'] = adata_E.obs['Sample'].astype("str")

In [33]:
ptz_dict = {'Patient29':"Patient22",
            'Patient30':"Patient23",
            'Patient31':"Patient24",
            'Patient32':"Patient25",
            'Patient33':"Patient26",
            'Patient34':"Patient27",
            'Patient35':"Patient28"}

In [34]:
# Add cell type column based on annotation
adata_E.obs['Sample'] = [ptz_dict[clust] for clust in adata_E.obs['Sample']]

In [35]:
del adata_E.obs['batch']

# Update dataset F

In [36]:
adata_F.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [37]:
adata_F.obs['Sorted'] = "WholeTissue"

In [38]:
adata_F.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_F.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [39]:
adata_F.obs['Sample'].value_counts(dropna=False)

Patient37    10827
Patient39     9848
Patient45     9294
Patient41     6894
Patient36     6241
Patient43     5606
Patient38     3643
Patient40     3431
Patient46     2052
Patient44     1980
Patient42     1291
Name: Sample, dtype: int64

In [40]:
adata_F.obs['Sample'] = adata_F.obs['Sample'].astype("str")

In [41]:
ptz_dict = {'Patient36':"Patient29",
            'Patient37':"Patient30",
            'Patient38':"Patient31",
            'Patient39':"Patient32",
            'Patient40':"Patient33",
            'Patient41':"Patient34",
            'Patient42':"Patient35",
            'Patient43':"Patient36",
            'Patient44':"Patient37",
            'Patient45':"Patient38",
            'Patient46':"Patient39"}

In [42]:
# Add cell type column based on annotation
adata_F.obs['Sample'] = [ptz_dict[clust] for clust in adata_F.obs['Sample']]

In [43]:
del adata_F.obs['batch']

# Update dataset G

In [44]:
adata_G.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [45]:
adata_G.obs['Sorted'] = "CD3"

In [46]:
adata_G.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_G.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [47]:
adata_G.obs['Sample'].value_counts(dropna=False)

Patient50    7302
Patient47    4754
Patient48    4222
Patient52    4102
Patient54    1825
Patient51    1729
Patient53     514
Patient49     165
Patient55      93
Name: Sample, dtype: int64

In [48]:
adata_G.obs['Sample'] = adata_G.obs['Sample'].astype("str")

In [49]:
ptz_dict = {'Patient47':"Patient40",
            'Patient48':"Patient41",
            'Patient49':"Patient42",
            'Patient50':"Patient43",
            'Patient51':"Patient44",
            'Patient52':"Patient45",
            'Patient53':"Patient46",
            'Patient54':"Patient47",
            'Patient55':"Patient48"}

In [50]:
# Add cell type column based on annotation
adata_G.obs['Sample'] = [ptz_dict[clust] for clust in adata_G.obs['Sample']]

In [51]:
del adata_G.obs['batch']

# Update dataset H

In [52]:
adata_H.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [53]:
adata_H.obs['Sorted'] = "WholeTissue"

In [54]:
adata_H.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_H.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [55]:
adata_H.obs['Sample'].value_counts(dropna=False)

PatientPatient56    6753
PatientPatient58    5470
PatientPatient57    4421
Name: Sample, dtype: int64

In [56]:
adata_H.obs['Sample'] = adata_H.obs['Sample'].astype("str")

In [57]:
ptz_dict = {'PatientPatient56':"Patient49",
            'PatientPatient58':"Patient50",
            'PatientPatient57':"Patient51"}

In [58]:
# Add cell type column based on annotation
adata_H.obs['Sample'] = [ptz_dict[clust] for clust in adata_H.obs['Sample']]

In [59]:
del adata_H.obs['batch']

# Update dataset I

In [60]:
adata_I.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [61]:
adata_I.obs['Sorted'] = "WholeTissue"

In [62]:
adata_I.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_I.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [63]:
adata_I.obs['Sample'].value_counts(dropna=False)

Patient61    5945
Patient60    3223
Patient59     756
Patient62     296
Name: Sample, dtype: int64

In [64]:
ptz_dict = {'Patient59':"Patient52",
            'Patient60':"Patient53",
            'Patient61':"Patient54",
            'Patient62':"Patient55"}

In [65]:
# Add cell type column based on annotation
adata_I.obs['Sample'] = [ptz_dict[clust] for clust in adata_I.obs['Sample']]

In [66]:
del adata_I.obs['batch']

# Update dataset L

In [67]:
adata_L.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [68]:
adata_L.obs['Sorted'] = "WholeTissue"

In [69]:
adata_L.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_L.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [70]:
adata_L.obs['Sample'].value_counts(dropna=False)

Patient67    10740
Patient66     6689
Patient70     5691
Patient65     5632
Patient68     5360
Patient64     4220
Patient63     3750
Patient69     3354
Name: Sample, dtype: int64

In [71]:
ptz_dict = {'Patient63':"Patient56",
            'Patient64':"Patient57",
            'Patient65':"Patient58",
            'Patient66':"Patient59",
            'Patient67':"Patient60",
            'Patient68':"Patient61",
            'Patient69':"Patient62",
            'Patient70':"Patient63"}

In [72]:
# Add cell type column based on annotation
adata_L.obs['Sample'] = [ptz_dict[clust] for clust in adata_L.obs['Sample']]

In [73]:
del adata_L.obs['batch']

# Update dataset M

In [74]:
adata_M.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [75]:
adata_M.obs['Sorted'] = "WholeTissue"

In [76]:
adata_M.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_M.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [77]:
adata_M.obs['Sample'].value_counts(dropna=False)

Patient72    1986
Patient74    1733
Patient78    1725
Patient80    1598
Patient77    1481
Patient79    1238
Patient76    1210
Patient73    1078
Patient71     893
Name: Sample, dtype: int64

In [78]:
ptz_dict = {'Patient71':"Patient64",
            'Patient72':"Patient65",
            'Patient73':"Patient66",
            'Patient74':"Patient67",
            'Patient75':"Patient68",
            'Patient76':"Patient69",
            'Patient77':"Patient70",
            'Patient78':"Patient71",
            'Patient79':"Patient72",
            'Patient80':"Patient73"}

In [79]:
# Add cell type column based on annotation
adata_M.obs['Sample'] = [ptz_dict[clust] for clust in adata_M.obs['Sample']]

In [80]:
del adata_M.obs['batch']

# Update dataset N

In [81]:
adata_N.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [82]:
adata_N.obs['Sorted']

2_0       CD45
2_1       CD45
2_2       CD45
2_3       CD45
2_4       CD45
          ... 
6_2116    CD45
6_2117    CD45
6_2119    CD45
6_2120    CD45
6_2121    CD45
Name: Sorted, Length: 3848, dtype: category
Categories (1, object): ['CD45']

In [83]:
adata_N.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_N.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [84]:
adata_N.obs['Sample'].value_counts(dropna=False)

Patient84    2004
Patient81    1616
Patient83     228
Name: Sample, dtype: int64

In [85]:
ptz_dict = {'Patient81':"Patient74",
            'Patient83':"Patient75",
            'Patient84':"Patient76"}

In [86]:
# Add cell type column based on annotation
adata_N.obs['Sample'] = [ptz_dict[clust] for clust in adata_N.obs['Sample']]

In [87]:
del adata_N.obs['batch']

# Update dataset O

In [88]:
adata_O.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [89]:
adata_O.obs['Sorted'] = "WholeTissue"

In [90]:
adata_O.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_O.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [91]:
adata_O.obs['Sample'].value_counts(dropna=False)

Patient86    29606
Patient85    28232
Patient89    28180
Patient87    24221
Patient91    10382
Patient90     9462
Patient88     7749
Name: Sample, dtype: int64

In [92]:
ptz_dict = {'Patient85':"Patient77",
            'Patient86':"Patient78",
            'Patient87':"Patient79",
            'Patient88':"Patient80",
            'Patient89':"Patient81",
            'Patient90':"Patient82",
            'Patient91':"Patient83"}

In [93]:
# Add cell type column based on annotation
adata_O.obs['Sample'] = [ptz_dict[clust] for clust in adata_O.obs['Sample']]

In [94]:
del adata_O.obs['batch']

# Update dataset P

In [95]:
adata_P.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [96]:
adata_P.obs['Sorted'] = "WholeTissue"

In [97]:
adata_P.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_P.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [98]:
adata_P.obs['Sample'].value_counts(dropna=False)

Patient99     6950
Patient104    4459
Patient103    4334
Patient106    3263
Patient100    3138
Patient108    2906
Patient101    2827
Patient105    2563
Patient102    2323
Name: Sample, dtype: int64

In [99]:
ptz_dict = {'Patient99':"Patient84",
            'Patient100':"Patient85",
            'Patient101':"Patient86",
            'Patient102':"Patient87",
            'Patient103':"Patient88",
            'Patient104':"Patient89",
            'Patient105':"Patient90",
            'Patient106':"Patient91",
            'Patient108':"Patient92"}

In [100]:
# Add cell type column based on annotation
adata_P.obs['Sample'] = [ptz_dict[clust] for clust in adata_P.obs['Sample']]

In [101]:
del adata_P.obs['batch']

# Update dataset Q

In [102]:
adata_Q.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [103]:
adata_Q.obs['Sorted'] = "WholeTissue"

In [104]:
adata_Q.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_Q.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [105]:
adata_Q.obs['Sample'].value_counts(dropna=False)

Patient113    11686
Patient110     3617
Patient114     3577
Patient109     2706
Patient112     2599
Patient111     1120
Name: Sample, dtype: int64

In [106]:
ptz_dict = {'Patient109':"Patient93",
            'Patient110':"Patient94",
            'Patient111':"Patient95",
            'Patient112':"Patient96",
            'Patient113':"Patient97",
            'Patient114':"Patient98"}

In [107]:
# Add cell type column based on annotation
adata_Q.obs['Sample'] = [ptz_dict[clust] for clust in adata_Q.obs['Sample']]

In [108]:
del adata_Q.obs['batch']

# Update dataset R

In [109]:
adata_R.obs.columns

Index(['SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis',
       'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount',
       'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2',
       'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'predicted_doublets', 'doublet_info',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score',
       'leiden', 'cell_type'],
      dtype='object')

In [110]:
adata_R.obs['Sorted'] = "WholeTissue"

In [111]:
adata_R.obs.rename(columns={"majority_voting": "Celltypist_Classification"}, inplace=True)
adata_R.obs.rename(columns={"cell_type": "PanglaoDB_Classification"}, inplace=True)

In [112]:
adata_R.obs['Sample'].value_counts(dropna=False)

Patient119    11233
Patient121     9529
Patient118     8112
Patient120     6021
Patient117     3701
Patient116     3314
Patient115      909
Name: Sample, dtype: int64

In [113]:
ptz_dict = {'Patient115':"Patient99",
            'Patient116':"Patient100",
            'Patient117':"Patient101",
            'Patient118':"Patient102",
            'Patient119':"Patient103",
            'Patient120':"Patient104",
            'Patient121':"Patient105"}

In [114]:
# Add cell type column based on annotation
adata_R.obs['Sample'] = [ptz_dict[clust] for clust in adata_R.obs['Sample']]

# Set raw as X matrix

In [115]:
adata_B
adata_C
adata_D
adata_E
adata_F
adata_G
adata_H
adata_I
adata_L
adata_M
adata_N
adata_O
adata_P
adata_Q
adata_R

AnnData object with n_obs × n_vars = 42819 × 17301
    obs: 'SampleID', 'GEO_RNA', 'Cohort', 'Type', 'Subtype', 'Diagnosis', 'Grade', 'Location', 'IDH1_Status', 'EGFR_Status', 'MGMT', 'CellCount', 'Sample', 'Sex', 'Age', 'Sorted', 'MET', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'S_score', 'G2M_score', 'phase', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'predicted_labels', 'over_clustering', 'Celltypist_Classification', 'conf_score', 'leiden', 'PanglaoDB_Classification'
    var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'
    uns: 'SampleID_colors', 'cell_type_colors', 'doublet_info_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'majority_voting_colors', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'X_umap_harmony', 'ora_estima

In [116]:
adata_B= adata_B.raw.to_adata()
adata_C= adata_C.raw.to_adata()
adata_D= adata_D.raw.to_adata()
adata_E= adata_E.raw.to_adata()
adata_F= adata_F.raw.to_adata()
adata_G= adata_G.raw.to_adata()
adata_H= adata_H.raw.to_adata()
adata_I= adata_I.raw.to_adata()
adata_L= adata_L.raw.to_adata()
adata_M= adata_M.raw.to_adata()
adata_N= adata_N.raw.to_adata()
adata_O= adata_O.raw.to_adata()
adata_P= adata_P.raw.to_adata()
adata_Q= adata_Q.raw.to_adata()
#adata_R=  adata_R.raw.to_adata()

In [117]:
adata_B.shape

(11786, 21459)

In [118]:
adata_C.shape

(8096, 18745)

In [119]:
adata_D.shape

(28565, 23915)

In [120]:
adata_E.shape

(31706, 22310)

In [121]:
adata_F.shape

(61107, 27896)

In [122]:
adata_G.shape

(24706, 21180)

In [123]:
adata_H.shape

(16644, 23220)

In [124]:
adata_I.shape

(10220, 31134)

In [125]:
adata_L.shape

(45436, 26455)

In [126]:
adata_M.shape

(12942, 20211)

In [127]:
adata_N.shape

(3848, 17927)

In [128]:
adata_O.shape

(137832, 27171)

In [129]:
adata_P.shape

(32763, 24488)

In [130]:
adata_Q.shape

(25305, 25115)

In [131]:
adata_R.shape

(42819, 17301)

adata = adata_C.concatenate([adata_D,adata_E,adata_F,adata_G,adata_H,adata_I,adata_L,adata_M,adata_N,adata_O,
                             adata_P,adata_Q,adata_R],axis=1)

In [132]:
import anndata as ad

In [133]:
adata1 = ad.concat([adata_O,adata_P,adata_Q,adata_R],join="outer",fill_value=0,
                    index_unique=None)

In [134]:
adata2 = ad.concat([adata_C,adata_D,adata_E,adata_F,adata_G,adata_H,adata_I,adata_L,adata_M,adata_N],join="outer",fill_value=0,
                    index_unique=None)

In [135]:
adata = ad.concat([adata1,adata2],join="outer",fill_value=0,
                    index_unique=None)

In [136]:
adata_B.write("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Part1_B_GSE131928_PMID31327527_part2.h5ad")

... storing 'Sample' as categorical


In [137]:
adata.obs['Age'] =adata.obs['Age'].astype("str")

In [138]:
adata.write("/home/lugli/spuccio/Projects/SP039/FileH5AD_concatenated/Adata_fromC_toR.h5ad")

... storing 'SampleID' as categorical
... storing 'GEO_RNA' as categorical
... storing 'Cohort' as categorical
... storing 'Type' as categorical
... storing 'Diagnosis' as categorical
... storing 'Grade' as categorical
... storing 'Location' as categorical
... storing 'IDH1_Status' as categorical
... storing 'EGFR_Status' as categorical
... storing 'MGMT' as categorical
... storing 'Sample' as categorical
... storing 'Sex' as categorical
... storing 'Age' as categorical
... storing 'Sorted' as categorical
... storing 'doublet_info' as categorical
... storing 'predicted_labels' as categorical
... storing 'over_clustering' as categorical
... storing 'Celltypist_Classification' as categorical
... storing 'leiden' as categorical
... storing 'PanglaoDB_Classification' as categorical


# Create Metadata

In [139]:
adatatmp = adata_B.concatenate([adata_C,adata_D,adata_E,adata_F,adata_G,adata_H,adata_I,adata_L,adata_M,adata_N,adata_O,adata_P,adata_Q,adata_R],index_unique=None)

In [140]:
df = pd.DataFrame(adatatmp.obs)

In [141]:
#pd.DataFrame(adatatmp.obs).fillna('none')

In [142]:
df = df.replace(np.nan,"NotAvailable")

In [143]:
df['Sample'].value_counts(dropna=False,sort=False)

Patient1       1822
Patient2       4424
Patient3       1283
Patient4       2415
Patient5       1613
              ...  
Patient101     3701
Patient102     8112
Patient103    11233
Patient104     6021
Patient105     9529
Name: Sample, Length: 104, dtype: int64

In [144]:
metadata = df.drop_duplicates(subset=['Sample'])

In [145]:
metadata.to_excel("/home/lugli/spuccio/Projects/SP039/Metadata3.xlsx")