In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
#from gprofiler import GProfiler
#import loompy as lp
import rpy2.rinterface_lib.callbacks
import logging
import scrublet as scr
#from rpy2.robjects import pandas2ri
#import anndata2ri
import warnings
warnings.filterwarnings('ignore')
import os              
os.environ['PYTHONHASHSEED'] = '0'
sc.settings.set_figure_params(dpi=100, facecolor='white')

#### Load Dataset Lung

In [2]:
path_lung="/home/lugli/spuccio/Projects/SP025_NaClTcell/PangenomeBlueprint/LC_counts/"
adata_lung = sc.read("".join([path_lung,"matrix.mtx"]), cache=True)
adata_lung = adata_lung.transpose()
adata_lung.X = adata_lung.X.toarray()
barcodes = pd.read_csv("".join([path_lung,"barcodes.tsv"]), header=None, sep='\t')
genes = pd.read_csv("".join([path_lung,"genes.tsv"]), header=None, sep='\t')
#Annotate data
barcodes.rename(columns={0:'barcode'}, inplace=True)
barcodes.set_index('barcode', inplace=True)
adata_lung.obs = barcodes
genes.rename(columns={0:'gene_id', 1:'gene_symbol'}, inplace=True)
genes.set_index('gene_symbol', inplace=True)
adata_lung.var = genes
Metadata_lung = pd.read_csv("/home/lugli/spuccio/Projects/SP025_NaClTcell/PangenomeBlueprint/LC_metadata.csv",header=0,low_memory=False)

In [3]:
adata_lung.obs['CellId'] = Metadata_lung["Cell"].astype('category')
adata_lung.obs['CellFromTumor'] = Metadata_lung["CellFromTumor"].to_list()
adata_lung.obs['PatientNumber'] = Metadata_lung["PatientNumber"].to_list()
adata_lung.obs['TumorType'] = Metadata_lung["TumorType"].to_list()
adata_lung.obs['TumorSite'] = Metadata_lung["TumorSite"].to_list()
adata_lung.obs['CellType'] = Metadata_lung["CellType"].to_list()

Filter Adenocarcinoma Ptz

In [4]:
adenocarcinomacell = (adata_lung.obs['PatientNumber'] == "Ptz3") | (adata_lung.obs['PatientNumber'] == "Ptz4") | (adata_lung.obs['PatientNumber'] == "Ptz6")

In [5]:
adata_lung.obs['PatientNumber'].unique()

array(['Ptz1', 'Ptz2', 'Ptz3', 'Ptz4', 'Ptz5', 'Ptz6', 'Ptz7', 'Ptz8'],
      dtype=object)

In [6]:
adata_lung = adata_lung[adenocarcinomacell, :]

In [7]:
mod = {'Ptz3': 'Ptz1',
     'Ptz4': 'Ptz2',
     'Ptz6' : 'Ptz3'}
adata_lung.obs['PatientNumber'] = adata_lung.obs['PatientNumber'].map(mod).astype('category')

In [8]:
adata_lung.obs['PatientNumber'].unique()

['Ptz1', 'Ptz2', 'Ptz3']
Categories (3, object): ['Ptz1', 'Ptz2', 'Ptz3']

adata_lung_Tcell = adata_lung[adata_lung.obs['CellType']  == "T_cell",:]
adata_lung_Cancer = adata_lung[adata_lung.obs['CellType']  == "Cancer",:]

#### Load Dataset BC

In [9]:
path_bc="/home/lugli/spuccio/Projects/SP025_NaClTcell/PangenomeBlueprint/BC_counts/"
adata_bc = sc.read("".join([path_bc,"matrix.mtx"]), cache=True)
adata_bc = adata_bc.transpose()
adata_bc.X = adata_bc.X.toarray()
barcodes = pd.read_csv("".join([path_bc,"barcodes.tsv"]), header=None, sep='\t')
genes = pd.read_csv("".join([path_bc,"genes.tsv"]), header=None, sep='\t')
#Annotate data
barcodes.rename(columns={0:'barcode'}, inplace=True)
barcodes.set_index('barcode', inplace=True)
adata_bc.obs = barcodes
genes.rename(columns={0:'gene_id', 1:'gene_symbol'}, inplace=True)
genes.set_index('gene_symbol', inplace=True)
adata_bc.var = genes
Metadata_bc = pd.read_csv("/home/lugli/spuccio/Projects/SP025_NaClTcell/PangenomeBlueprint/BC_metadata.csv",header=0,low_memory=False)
adata_bc.obs['CellId'] = Metadata_bc["Cell"].to_list()
adata_bc.obs['CellFromTumor'] = Metadata_bc["CellFromTumor"].to_list()
adata_bc.obs['PatientNumber'] = Metadata_bc["PatientNumber"].to_list()
adata_bc.obs['TumorType'] = Metadata_bc["TumorType"].to_list()
adata_bc.obs['TumorSite'] = Metadata_bc["TumorSite"].to_list()
adata_bc.obs['CellType'] = Metadata_bc["CellType"].to_list()

In [10]:
adata_bc.obs['PatientNumber'].unique()

array(['Ptz41', 'Ptz42', 'Ptz43', 'Ptz44', 'Ptz45', 'Ptz46', 'Ptz47',
       'Ptz48', 'Ptz49', 'Ptz50', 'Ptz51', 'Ptz52', 'Ptz53', 'Ptz54'],
      dtype=object)

In [11]:
mod = {'Ptz41': 'Ptz4',
     'Ptz42': 'Ptz5',
     'Ptz43' : 'Ptz6',
     'Ptz44' : 'Ptz7',
     'Ptz45': 'Ptz8',
     'Ptz46': 'Ptz9',
     'Ptz47' : 'Ptz10',
     'Ptz48' : 'Ptz11',
     'Ptz49': 'Ptz12',
     'Ptz50': 'Ptz13',
     'Ptz51' : 'Ptz14',
     'Ptz52' : 'Ptz15',
     'Ptz53': 'Ptz16',
     'Ptz54': 'Ptz17'}
adata_bc.obs['PatientNumber'] = adata_bc.obs['PatientNumber'].map(mod).astype('category')

In [12]:
adata_bc.obs['PatientNumber'].unique()

['Ptz4', 'Ptz5', 'Ptz6', 'Ptz7', 'Ptz8', ..., 'Ptz13', 'Ptz14', 'Ptz15', 'Ptz16', 'Ptz17']
Length: 14
Categories (14, object): ['Ptz10', 'Ptz11', 'Ptz12', 'Ptz13', ..., 'Ptz6', 'Ptz7', 'Ptz8', 'Ptz9']

#### Load Dataset OC

In [13]:
path_oc="/home/lugli/spuccio/Projects/SP025_NaClTcell/PangenomeBlueprint/OvC_counts/"
adata_oc = sc.read("".join([path_oc,"matrix.mtx"]), cache=True)
adata_oc = adata_oc.transpose()
adata_oc.X = adata_oc.X.toarray()
barcodes = pd.read_csv("".join([path_oc,"barcodes.tsv"]), header=None, sep='\t')
genes = pd.read_csv("".join([path_oc,"genes.tsv"]), header=None, sep='\t')
#Annotate data
barcodes.rename(columns={0:'barcode'}, inplace=True)
barcodes.set_index('barcode', inplace=True)
adata_oc.obs = barcodes
genes.rename(columns={0:'gene_id', 1:'gene_symbol'}, inplace=True)
genes.set_index('gene_symbol', inplace=True)
adata_oc.var = genes
Metadata_oc = pd.read_csv("/home/lugli/spuccio/Projects/SP025_NaClTcell/PangenomeBlueprint/OvC_metadata.csv",header=0,low_memory=False)
adata_oc.obs['CellId'] = Metadata_oc["Cell"].to_list()
adata_oc.obs['CellFromTumor'] = Metadata_oc["CellFromTumor"].to_list()
adata_oc.obs['PatientNumber'] = Metadata_oc["PatientNumber"].to_list()
adata_oc.obs['TumorType'] = Metadata_oc["TumorType"].to_list()
adata_oc.obs['TumorSite'] = Metadata_oc["TumorSite"].to_list()
adata_oc.obs['CellType'] = Metadata_oc["CellType"].to_list()

In [14]:
adata_oc.obs['PatientNumber'].unique()

array(['Ptz11', 'Ptz12', 'Ptz13', 'Ptz14', 'Ptz15'], dtype=object)

In [15]:
mod = {'Ptz11': 'Ptz18',
     'Ptz12': 'Ptz19',
     'Ptz13' : 'Ptz20',
     'Ptz14' : 'Ptz21',
     'Ptz15': 'Ptz22'}
adata_oc.obs['PatientNumber'] = adata_oc.obs['PatientNumber'].map(mod).astype('category')

In [16]:
adata_oc.obs['PatientNumber'].unique()

['Ptz18', 'Ptz19', 'Ptz20', 'Ptz21', 'Ptz22']
Categories (5, object): ['Ptz18', 'Ptz19', 'Ptz20', 'Ptz21', 'Ptz22']

#### Load Dataset CRC

In [17]:
path_CRC="/home/lugli/spuccio/Projects/SP025_NaClTcell/PangenomeBlueprint/CRC_counts/"
adata_CRC = sc.read("".join([path_CRC,"matrix.mtx"]), cache=True)
adata_CRC = adata_CRC.transpose()
adata_CRC.X = adata_CRC.X.toarray()
barcodes = pd.read_csv("".join([path_CRC,"barcodes.tsv"]), header=None, sep='\t')
genes = pd.read_csv("".join([path_CRC,"genes.tsv"]), header=None, sep='\t')
#Annotate data
barcodes.rename(columns={0:'barcode'}, inplace=True)
barcodes.set_index('barcode', inplace=True)
adata_CRC.obs = barcodes
genes.rename(columns={0:'gene_id', 1:'gene_symbol'}, inplace=True)
genes.set_index('gene_symbol', inplace=True)
adata_CRC.var = genes
Metadata_CRC = pd.read_csv("/home/lugli/spuccio/Projects/SP025_NaClTcell/PangenomeBlueprint/CRC_metadata_2.csv",header=0,low_memory=False)
adata_CRC.obs['CellId'] = Metadata_CRC["Cell"].to_list()
adata_CRC.obs['CellFromTumor'] = Metadata_CRC["CellFromTumor"].to_list()
adata_CRC.obs['PatientNumber'] = Metadata_CRC["PTZ_PatientNumber"].to_list()
adata_CRC.obs['TumorType'] = Metadata_CRC["TumorType"].to_list()
adata_CRC.obs['TumorSite'] = Metadata_CRC["TumorSite"].to_list()
adata_CRC.obs['CellType'] = Metadata_CRC["CellType"].to_list()

In [18]:
adata_CRC.obs['PatientNumber'].unique()

array(['PTZ_31', 'PTZ_32', 'PTZ_33', 'PTZ_35', 'PTZ_36', 'PTZ_37',
       'PTZ_38'], dtype=object)

In [19]:
mod = {'PTZ_31': 'Ptz23',
     'PTZ_32': 'Ptz24',
     'PTZ_33' : 'Ptz25',
     'PTZ_35': 'Ptz26',
     'PTZ_36' : 'Ptz27',
     'PTZ_37': 'Ptz28',
     'PTZ_38': 'Ptz29'}
adata_CRC.obs['PatientNumber'] = adata_CRC.obs['PatientNumber'].map(mod).astype('category')

In [20]:
adata_CRC.obs['PatientNumber'].unique()

['Ptz23', 'Ptz24', 'Ptz25', 'Ptz26', 'Ptz27', 'Ptz28', 'Ptz29']
Categories (7, object): ['Ptz23', 'Ptz24', 'Ptz25', 'Ptz26', 'Ptz27', 'Ptz28', 'Ptz29']

#### Data concatenation

In [21]:
adata = adata_lung.concatenate(adata_bc,adata_CRC,adata_oc,index_unique=None)

In [24]:
adata.write("/home/lugli/spuccio/Projects/SP025_NaClTcell/Analysis/Allcell.h5ad")

In [25]:
adata.raw = adata

In [26]:
adata.shape

(174834, 33694)

In [27]:
del adata_lung
del adata_bc
del adata_CRC
del adata_oc

#### Remove Ribo/Mito genes

In [28]:
print(adata.n_obs, adata.n_vars)
mito_genes = adata.var_names.str.startswith('MT')
ribo_genes = adata.var_names.str.startswith(("RPS","RPL"))
remove = np.add(ribo_genes, mito_genes)
keep = np.invert(remove)

adata = adata[:,keep]

print(adata.n_obs, adata.n_vars)

174834 33694
174834 33487


In [30]:
adata.raw = adata

In [31]:
adata.layers['counts'] = adata.X

In [32]:
sc.pp.normalize_total(adata, target_sum=1e4,exclude_highly_expressed=True)
sc.pp.log1p(adata)
#adata.raw = adata

In [33]:
adata.write("/home/lugli/spuccio/Projects/SP025_NaClTcell/Analysis/adata.h5ad")