In [26]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import scipy.sparse as sp

GSE239626_PATH = 'GSE239626'
GSE138266_PATH = 'GSE138266'
GSE194078_PATH = 'GSE194078'

GSE239626_LABELS_PATH = 'results/celltypist_labels_from_protein_Immune_All_High.csv'
GSE138266_LABELS_PATH = 'results/celltypist_labels_from_transcriptomic_GSE138266.csv'
GSE194078_LABELS_PATH = 'results/celltypist_labels_from_transcriptomic_GSE194078.csv'

### GSE239626

In [2]:
adata_GSE239626 = sc.read(GSE239626_PATH)

GSE239626_labels = pd.read_csv(GSE239626_LABELS_PATH)
adata_GSE239626.obs['cell_types_labels'] = GSE239626_labels['predicted_labels'].values

adata_GSE239626

AnnData object with n_obs × n_vars = 72317 × 36601
    obs: 'dataset', 'disease', 'cell_type', 'patiens', 'patiens_id', 'time', 'cell_types_labels'
    var: 'gene_ids', 'feature_types'

In [3]:
# rename columns 'patiens' to 'patient' and 'patiens_id' to 'patient_id'
adata_GSE239626.obs.rename(columns={'patiens': 'sample', 'patiens_id': 'patient_id'}, inplace=True)

In [4]:
adata_GSE239626.obs

Unnamed: 0,dataset,disease,cell_type,sample,patient_id,time,cell_types_labels
AAACCCAAGACTGTTC-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells
AAACCCAAGGATCATA-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells
AAACCCAGTTATTCTC-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,B cells
AAACCCATCATGAGGG-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells
AAACGAAAGCCAGTAG-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,B cells
...,...,...,...,...,...,...,...
TTTGTTGAGCGTCGAA-20,GSE239626,MS,PBMC,GSM7669084,N5,M3,T cells
TTTGTTGAGTGTACCT-20,GSE239626,MS,PBMC,GSM7669084,N5,M3,T cells
TTTGTTGCAATTGCAC-20,GSE239626,MS,PBMC,GSM7669084,N5,M3,T cells
TTTGTTGCATTCAGCA-20,GSE239626,MS,PBMC,GSM7669084,N5,M3,T cells


### GSE138266

In [5]:
adata_GSE138266 = sc.read(GSE138266_PATH)
adata_GSE138266 = adata_GSE138266[(adata_GSE138266.obs['cell_type']!='CSF')|(adata_GSE138266.obs['disease']!='PST')]
adata_GSE138266

View of AnnData object with n_obs × n_vars = 68038 × 33694
    obs: 'sample', 'patient', 'dataset', 'disease', 'cell_type', 'batch'
    var: 'gene_symbols', 'gene_ids'

In [6]:
adata_GSE138266.obs.head(2)

Unnamed: 0,sample,patient,dataset,disease,cell_type,batch
AAACCTGAGTGGGCTA-1-MS19270-CSF,GSM4104122,MS19270,GSE138266,MS,CSF,0
AAACCTGAGTGTTAGA-1-MS19270-CSF,GSM4104122,MS19270,GSE138266,MS,CSF,0


In [7]:
GSE138266_labels = pd.read_csv(GSE138266_LABELS_PATH, index_col=0)
GSE138266_labels.head(2)

Unnamed: 0,cell_types_labels
AAACCTGAGTGGGCTA-1-MS19270-CSF-1-0-0-0-0,T cells
AAACCTGAGTGTTAGA-1-MS19270-CSF-1-0-0-0-0,T cells


In [8]:
labels = []
for label in GSE138266_labels.index:
    # drop the last character of the label until it's not a letter
    while not label[-1].isalpha():
        label = label[:-1]
    labels.append(label)
GSE138266_labels.index = labels

GSE138266_labels.head(2)

Unnamed: 0,cell_types_labels
AAACCTGAGTGGGCTA-1-MS19270-CSF,T cells
AAACCTGAGTGTTAGA-1-MS19270-CSF,T cells


In [9]:
for label in GSE138266_labels.index:
    if label not in adata_GSE138266.obs.index:
        print(label)

In [10]:
for label in adata_GSE138266.obs.index:
    if label not in GSE138266_labels.index:
        print(label)

In [11]:
def assigne_labels(row):
    row['cell_types_labels'] = GSE138266_labels[GSE138266_labels.index==row.name]['cell_types_labels'].values[0]
    return row

adata_GSE138266.obs = adata_GSE138266.obs.apply(assigne_labels, axis=1)

In [12]:
# drop columns batch
adata_GSE138266.obs.drop(columns=['batch'], inplace=True)

In [13]:
adata_GSE138266.obs

Unnamed: 0,sample,patient,dataset,disease,cell_type,cell_types_labels
AAACCTGAGTGGGCTA-1-MS19270-CSF,GSM4104122,MS19270,GSE138266,MS,CSF,T cells
AAACCTGAGTGTTAGA-1-MS19270-CSF,GSM4104122,MS19270,GSE138266,MS,CSF,T cells
AAACCTGGTCGCGTGT-1-MS19270-CSF,GSM4104122,MS19270,GSE138266,MS,CSF,Monocytes
AAACCTGGTCTCCACT-1-MS19270-CSF,GSM4104122,MS19270,GSE138266,MS,CSF,T cells
AAACCTGGTTTACTCT-1-MS19270-CSF,GSM4104122,MS19270,GSE138266,MS,CSF,T cells
...,...,...,...,...,...,...
TTTGCGCGTGCAACGA-1-PTC85037-PBMC,GSM4104143,PTC85037,GSE138266,PTC,PBMC,T cells
TTTGCGCTCCGTAGTA-1-PTC85037-PBMC,GSM4104143,PTC85037,GSE138266,PTC,PBMC,T cells
TTTGGTTCACCACGTG-1-PTC85037-PBMC,GSM4104143,PTC85037,GSE138266,PTC,PBMC,ILC
TTTGTCACACTGAAGG-1-PTC85037-PBMC,GSM4104143,PTC85037,GSE138266,PTC,PBMC,B cells


In [14]:
adata_GSE138266

AnnData object with n_obs × n_vars = 68038 × 33694
    obs: 'sample', 'patient', 'dataset', 'disease', 'cell_type', 'cell_types_labels'
    var: 'gene_symbols', 'gene_ids'

### GSE194078

In [15]:
adata_GSE194078 = sc.read(GSE194078_PATH)

adata_GSE194078

AnnData object with n_obs × n_vars = 110565 × 36601
    obs: 'patient', 'patient_id', 'disease', 'batch', 'dataset', 'cell_types'
    var: 'gene_ids', 'feature_types'

In [16]:
adata_GSE194078.obs.head(2)

Unnamed: 0,patient,patient_id,disease,batch,dataset,cell_types
AAACCCACAAGACCGA-1-PBMC,GSM5827375,CYG,Ab-mediated IDD,0,GSE194078,PBMC
AAACCCACACGGCACT-1-PBMC,GSM5827375,CYG,Ab-mediated IDD,0,GSE194078,PBMC


In [17]:
GSE194078_labels = pd.read_csv(GSE194078_LABELS_PATH, index_col=0)
GSE194078_labels.head(2)

Unnamed: 0,cell_types_labels
AAACCCACAAGACCGA-1-PBMC-GSE194078,B cells
AAACCCACACGGCACT-1-PBMC-GSE194078,T cells


In [18]:
def assigne_labels_GSE194078(row):
    cell_name = row.name + '-GSE194078'
    row['cell_types_labels'] = GSE194078_labels[GSE194078_labels.index==cell_name]['cell_types_labels'].values[0]
    return row

adata_GSE194078.obs = adata_GSE194078.obs.apply(assigne_labels_GSE194078, axis=1)

In [19]:
# rename columns 'patiens' to 'sample' and 'patiens_id' to 'patient_id'
adata_GSE194078.obs.rename(columns={'patiens': 'sample', 'patiens_id': 'patient_id', 'cell_types': 'cell_type'}, inplace=True)

# drop columns batch
adata_GSE194078.obs.drop(columns=['batch'], inplace=True)

In [20]:
adata_GSE194078.obs

Unnamed: 0,patient,patient_id,disease,dataset,cell_type,cell_types_labels
AAACCCACAAGACCGA-1-PBMC,GSM5827375,CYG,Ab-mediated IDD,GSE194078,PBMC,B cells
AAACCCACACGGCACT-1-PBMC,GSM5827375,CYG,Ab-mediated IDD,GSE194078,PBMC,T cells
AAACCCATCAGAATAG-1-PBMC,GSM5827375,CYG,Ab-mediated IDD,GSE194078,PBMC,T cells
AAACGAAAGGTAAAGG-1-PBMC,GSM5827375,CYG,Ab-mediated IDD,GSE194078,PBMC,T cells
AAACGAACACAGCCAC-1-PBMC,GSM5827375,CYG,Ab-mediated IDD,GSE194078,PBMC,T cells
...,...,...,...,...,...,...
TTTGGTTGTGTTCAGT-11-CSF,GSM5827385,YYW,MS,GSE194078,CSF,T cells
TTTGGTTTCAAGAAAC-11-CSF,GSM5827385,YYW,MS,GSE194078,CSF,T cells
TTTGTTGGTATACCCA-11-CSF,GSM5827385,YYW,MS,GSE194078,CSF,T cells
TTTGTTGGTTATGACC-11-CSF,GSM5827385,YYW,MS,GSE194078,CSF,ILC


In [21]:
adata_GSE194078

AnnData object with n_obs × n_vars = 110565 × 36601
    obs: 'patient', 'patient_id', 'disease', 'dataset', 'cell_type', 'cell_types_labels'
    var: 'gene_ids', 'feature_types'

### Concatenate

In [36]:
adata_GSE138266.var.head(2)

Unnamed: 0,gene_symbols,gene_ids
0,ENSG00000243485,RP11-34P13.3
1,ENSG00000237613,FAM138A


In [37]:
adata_GSE138266.var.index = adata_GSE138266.var['gene_ids'].values
adata_GSE138266.var['gene_ids'] = adata_GSE138266.var['gene_symbols']
adata_GSE138266.var.drop(columns=['gene_symbols'], inplace=True)
adata_GSE138266.var['feature_types'] = 'Gene Expression'

In [38]:
adata_GSE138266.var.head(2)

Unnamed: 0,gene_ids,feature_types
RP11-34P13.3,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression


adata_GSE239626 adata_GSE194078

In [22]:
adata_GSE239626.var.head(2)

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression


In [27]:
adata_GSE239626.var['feature_types'].unique() # 'Gene Expression' only

['Gene Expression']
Categories (1, object): ['Gene Expression']

In [23]:
adata_GSE194078.var.head(2)

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression


In [25]:
sum(adata_GSE239626.var.index == adata_GSE194078.var.index) / len(adata_GSE239626.var.index)

1.0

In [24]:
adata_GSE239626

AnnData object with n_obs × n_vars = 72317 × 36601
    obs: 'dataset', 'disease', 'cell_type', 'sample', 'patient_id', 'time', 'cell_types_labels'
    var: 'gene_ids', 'feature_types'

In [56]:
new_genes_ref = adata_GSE138266.var['gene_ids'][~adata_GSE138266.var['gene_ids'
    ].isin(adata_GSE239626.var['gene_ids'])]

a = sc.AnnData(np.zeros((adata_GSE239626.X.shape[0], len(new_genes_ref))))
a.var.index = new_genes_ref.index
a.var['gene_ids'] = new_genes_ref
a.var['feature_types'] = 'Gene Expression'

a.obs.index = adata_GSE239626.obs.index
a.obs = adata_GSE239626.obs

adata_GSE239626_final = anndata.concat([adata_GSE239626, a], axis=1, merge='first')
adata_GSE239626_final

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 72317 × 37944
    obs: 'dataset', 'disease', 'cell_type', 'sample', 'patient_id', 'time', 'cell_types_labels'
    var: 'gene_ids', 'feature_types'

In [57]:
adata_GSE239626_final.obs.head(2)

Unnamed: 0,dataset,disease,cell_type,sample,patient_id,time,cell_types_labels
AAACCCAAGACTGTTC-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells
AAACCCAAGGATCATA-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells


In [58]:
adata_GSE239626_final.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AP000304.12,ENSG00000249209,Gene Expression
AP000687.1,ENSG00000280170,Gene Expression
AF064858.11,ENSG00000237721,Gene Expression
AL773572.7,ENSG00000225745,Gene Expression


adata_GSE194078

In [60]:
a = sc.AnnData(np.zeros((adata_GSE194078.X.shape[0], len(new_genes_ref))))
a.var.index = new_genes_ref.index
a.var['gene_ids'] = new_genes_ref
a.var['feature_types'] = 'Gene Expression'

a.obs.index = adata_GSE194078.obs.index
a.obs = adata_GSE194078.obs

adata_GSE194078_final = anndata.concat([adata_GSE194078, a], axis=1, merge='first')
adata_GSE194078_final

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 110565 × 37944
    obs: 'patient', 'patient_id', 'disease', 'dataset', 'cell_type', 'cell_types_labels'
    var: 'gene_ids', 'feature_types'

In [61]:
adata_GSE194078_final

AnnData object with n_obs × n_vars = 110565 × 37944
    obs: 'patient', 'patient_id', 'disease', 'dataset', 'cell_type', 'cell_types_labels'
    var: 'gene_ids', 'feature_types'

In [62]:
adata_GSE194078_final.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AP000304.12,ENSG00000249209,Gene Expression
AP000687.1,ENSG00000280170,Gene Expression
AF064858.11,ENSG00000237721,Gene Expression
AL773572.7,ENSG00000225745,Gene Expression


adata_GSE138266

In [63]:
adata_GSE138266.var.head(2)

Unnamed: 0,gene_ids,feature_types
RP11-34P13.3,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression


In [64]:
new_genes_13 = adata_GSE239626.var['gene_ids'][~adata_GSE239626.var['gene_ids'
    ].isin(adata_GSE138266.var['gene_ids'])]

a = sc.AnnData(sp.csr_matrix((adata_GSE138266.X.shape[0], len(new_genes_13))))

a.var.index = new_genes_13.index
a.var['gene_ids'] = new_genes_13
a.var['feature_types'] = 'Gene Expression'

a.obs.index = adata_GSE138266.obs.index
a.obs = adata_GSE138266.obs

adata_GSE138266_final = anndata.concat([adata_GSE138266, a], axis=1, merge='first')
adata_GSE138266_final

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 68038 × 37944
    obs: 'sample', 'patient', 'dataset', 'disease', 'cell_type', 'cell_types_labels'
    var: 'gene_ids', 'feature_types'

In [65]:
adata_GSE138266_final.var

Unnamed: 0,gene_ids,feature_types
RP11-34P13.3,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
RP11-34P13.7,ENSG00000238009,Gene Expression
RP11-34P13.8,ENSG00000239945,Gene Expression
...,...,...
AC010889.2,ENSG00000288049,Gene Expression
AC009494.2,ENSG00000286247,Gene Expression
AC010086.3,ENSG00000288057,Gene Expression
AC024236.1,ENSG00000286187,Gene Expression


In [66]:
adata_GSE138266_final.obs.head(2)

Unnamed: 0,sample,patient,dataset,disease,cell_type,cell_types_labels
AAACCTGAGTGGGCTA-1-MS19270-CSF,GSM4104122,MS19270,GSE138266,MS,CSF,T cells
AAACCTGAGTGTTAGA-1-MS19270-CSF,GSM4104122,MS19270,GSE138266,MS,CSF,T cells


In [67]:
adata_GSE239626_final.n_vars, adata_GSE194078_final.n_vars, adata_GSE138266_final.n_vars

(37944, 37944, 37944)

Concat

In [74]:
adata_GSE239626_final.obs_names_make_unique()
adata_GSE194078_final.obs_names_make_unique()
adata_GSE138266_final.obs_names_make_unique()

In [83]:
[adata_GSE239626_final.var.value_counts().max(), adata_GSE194078_final.var.value_counts().max(),
    adata_GSE138266_final.var.value_counts().max()]

[1, 1, 1]

In [103]:
concat_obs = pd.concat([adata_GSE239626_final.obs, adata_GSE194078_final.obs, adata_GSE138266_final.obs])
concat_obs # shape: 72317 + 110565 + 68038 = 250920

Unnamed: 0,dataset,disease,cell_type,sample,patient_id,time,cell_types_labels,patient
AAACCCAAGACTGTTC-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells,
AAACCCAAGGATCATA-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells,
AAACCCAGTTATTCTC-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,B cells,
AAACCCATCATGAGGG-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells,
AAACGAAAGCCAGTAG-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,B cells,
...,...,...,...,...,...,...,...,...
TTTGCGCGTGCAACGA-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,T cells,PTC85037
TTTGCGCTCCGTAGTA-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,T cells,PTC85037
TTTGGTTCACCACGTG-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,ILC,PTC85037
TTTGTCACACTGAAGG-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,B cells,PTC85037


In [104]:
X_concat = sp.vstack([adata_GSE239626_final.X, adata_GSE194078_final.X, adata_GSE138266_final.X])
X_concat # shape: 250920 x 37944

<250920x37944 sparse matrix of type '<class 'numpy.float64'>'
	with 385236280 stored elements in Compressed Sparse Row format>

In [105]:
adata = sc.AnnData(X_concat, obs=concat_obs, var=adata_GSE239626_final.var)

  utils.warn_names_duplicates("var")


In [106]:
72317 + 110565 + 68038

250920

In [107]:
adata # expected shape: (72317+110565+68038) × 37944 = 250920 x 37944

AnnData object with n_obs × n_vars = 250920 × 37944
    obs: 'dataset', 'disease', 'cell_type', 'sample', 'patient_id', 'time', 'cell_types_labels', 'patient'
    var: 'gene_ids', 'feature_types'

In [108]:
adata.obs

Unnamed: 0,dataset,disease,cell_type,sample,patient_id,time,cell_types_labels,patient
AAACCCAAGACTGTTC-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells,
AAACCCAAGGATCATA-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells,
AAACCCAGTTATTCTC-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,B cells,
AAACCCATCATGAGGG-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells,
AAACGAAAGCCAGTAG-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,B cells,
...,...,...,...,...,...,...,...,...
TTTGCGCGTGCAACGA-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,T cells,PTC85037
TTTGCGCTCCGTAGTA-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,T cells,PTC85037
TTTGGTTCACCACGTG-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,ILC,PTC85037
TTTGTCACACTGAAGG-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,B cells,PTC85037


In [110]:
adata.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AP000304.12,ENSG00000249209,Gene Expression
AP000687.1,ENSG00000280170,Gene Expression
AF064858.11,ENSG00000237721,Gene Expression
AL773572.7,ENSG00000225745,Gene Expression


In [109]:
adata.var.value_counts().max()

1

In [111]:
adata.obs['disease'].unique()

array(['MS', 'Ab-mediated IDD', 'Normal', 'YYW', 'PTC', 'PST'],
      dtype=object)

In [112]:
adata.obs['MS/HC'] = adata.obs['disease'].apply(lambda x: 'MS' if x=='MS' else 'HC')

In [113]:
adata.obs

Unnamed: 0,dataset,disease,cell_type,sample,patient_id,time,cell_types_labels,patient,MS/HC
AAACCCAAGACTGTTC-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells,,MS
AAACCCAAGGATCATA-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells,,MS
AAACCCAGTTATTCTC-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,B cells,,MS
AAACCCATCATGAGGG-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,T cells,,MS
AAACGAAAGCCAGTAG-1,GSE239626,MS,PBMC,GSM7669046,N1,J0,B cells,,MS
...,...,...,...,...,...,...,...,...,...
TTTGCGCGTGCAACGA-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,T cells,PTC85037,HC
TTTGCGCTCCGTAGTA-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,T cells,PTC85037,HC
TTTGGTTCACCACGTG-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,ILC,PTC85037,HC
TTTGTCACACTGAAGG-1-PTC85037-PBMC,GSE138266,PTC,PBMC,GSM4104143,,,B cells,PTC85037,HC


### Save new dataset

In [114]:
adata.write('write/transcrittomic.h5ad')