In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os,sys
import scanpy as sc 
import pandas as pd
import numpy as np
import scipy
import anndata

In [3]:
# this code chunk is from Github https://github.com/MarioniLab/oor_design_reproducibility
# check X stores raw counts 
def _check_counts_in_X(adata):
    return(all(np.random.choice(adata.X.data, 100) % 1 == 0))

def _clean_adata(a):
    ## Make obs_names unique
    a.obs_names = a.obs['dataset_id'].astype('str') + '-' + a.obs_names.astype("str")
    assert _check_counts_in_X(a)

    sc.pp.calculate_qc_metrics(a, inplace=True)
    sc.pp.filter_cells(a, min_counts=1000)
    return(a)

### COVID dataset

In [4]:
h5ad_files_covid = 'PBMC_COVID.subsample500cells.covid.h5ad'

adata_covid = sc.read_h5ad(h5ad_files_covid)
cleaned_covid = _clean_adata(adata_covid)

### Control Dataset

In [5]:
h5ad_files_ctrl = 'PBMC_COVID.subsample500cells.ctrl.h5ad'

adata_ctrl = sc.read_h5ad(h5ad_files_ctrl)
cleaned_ctrl = _clean_adata(adata_ctrl)

# Get raw data 
Before cell-type selection and gene selection

In [6]:
# sparse matrix (main) 
expression_covid = cleaned_covid.X
expression_control = cleaned_ctrl.X

# for observation (e.g., cells) and annotation
annotations_covid = cleaned_covid.obs
annotations_control = cleaned_ctrl.obs

In [7]:
# turn sparse matrix into dataframe
expression_covid_array = expression_covid.toarray()
expression_covid_df = pd.DataFrame(expression_covid_array)

expression_control_array = expression_control.toarray()
expression_control_df = pd.DataFrame(expression_control_array)

raw_X = expression_covid_df
raw_Xt = expression_control_df

cell type label responses

In [8]:
raw_Y = annotations_covid['cell_type'].values     # categorical datatype 
raw_Yt = annotations_control['cell_type']

raw_Y = pd.Series(raw_Y)
raw_Yt = pd.Series(raw_Yt)

raw_Y = pd.DataFrame(raw_Y)
raw_Yt = pd.DataFrame(raw_Yt)

In [9]:
print(raw_X.shape)
print(raw_Xt.shape)
print(raw_Y.shape)
print(raw_Yt.shape)

(48083, 24727)
(14426, 24727)
(48083, 1)
(14426, 1)


In [10]:
# raw_X.to_csv("COVID_raw_foreground_X.csv") 
# raw_Xt.to_csv("COVID_raw_background_Xt.csv")
# raw_Y.to_csv("COVID_raw_foreground_Y.csv")
# raw_Yt.to_csv("COVID_raw_background_Yt.csv") 

# Foreground (COVID dataset)

#### Expression_matrix_covid and annotations_covid

rows represent individual cells
columns represent genes 

In [11]:
expression_covid = cleaned_covid.X
annotations_covid = cleaned_covid.obs   # for observations (e.g., cells)
var_annotations_covid = cleaned_covid.var   # for variables (e.g., genes)

In [12]:
expression_covid_array = expression_covid.toarray()
expression_covid_df = pd.DataFrame(expression_covid_array)

#### Get Same Cell Type for fg and bg

In [13]:
cell_type_covid = annotations_covid['cell_type']

# check the unique cell type
cell_type_covid_unique = set(cell_type_covid)

# check the counts of each cell type
cell_type_covid_total = cell_type_covid.value_counts()

# get cell type with counts > 250 counts
cell_type_covid_greater_than_250 = cell_type_covid_total[cell_type_covid_total > 250]

#### background cell type

In [14]:
annotations_control = cleaned_ctrl.obs 
cell_type_control = annotations_control['cell_type']
cell_type_control_total = cell_type_control.value_counts()

In [15]:
cell_type_control_greater_than_250 = cell_type_control_total[cell_type_control_total > 250] 

intersection_cell_type = pd.Series(list(set(cell_type_covid_greater_than_250.index) & set(cell_type_control_greater_than_250.index)))
intersection_cell_type.values

array(['mature NK T cell', 'gamma-delta T cell', 'CD14-positive monocyte',
       'platelet', 'effector CD8-positive, alpha-beta T cell',
       'T-helper 22 cell',
       'CD16-negative, CD56-bright natural killer cell, human',
       'CD16-positive, CD56-dim natural killer cell, human',
       'central memory CD4-positive, alpha-beta T cell',
       'effector memory CD8-positive, alpha-beta T cell',
       'mucosal invariant T cell', 'naive B cell',
       'naive thymus-derived CD4-positive, alpha-beta T cell',
       'naive thymus-derived CD8-positive, alpha-beta T cell'],
      dtype=object)

### Get the Foreground X (COVID)

In [16]:
annotations_covid_df = pd.DataFrame(annotations_covid)

# Reset the index to get numeric indices
annotations_covid_df.reset_index(drop=True, inplace=True)

# check that it returns sets of indices for the certain cell type
annotations_covid_df[annotations_covid_df['cell_type'] == 'dendritic cell'].index

Index([ 6656,  6661, 11088, 11243, 13509, 14630, 18324, 21726, 22677, 24405,
       24499, 24507, 24581, 25135, 25584, 26376, 27008, 28504, 28823, 28876,
       30273, 34569, 34837, 35525, 36196, 36683, 42346, 42460, 44282, 44729,
       45355, 46614, 47239, 47567],
      dtype='int64')

In [17]:
filtered_annotations_covid = annotations_covid_df[annotations_covid_df['cell_type'].isin(intersection_cell_type.values)]

filtered_covid_indices = filtered_annotations_covid.index.to_list()
# print(len(filtered_covid_indices))

In [18]:
# print(set(filtered_annotations_covid['cell_type']))

In [19]:
filtered_expression_covid_df = expression_covid_df.iloc[filtered_covid_indices]

### Get the foreground Y (COVID)

In [26]:
filtered_cell_type = filtered_annotations_covid['cell_type']
unique_filtered_cell_type = set(filtered_cell_type)

# print(len(filtered_cell_type))
# print(len(unique_filtered_cell_type))

In [27]:
COVID_cell_type = filtered_annotations_covid['cell_type']
COVID_cell_type = pd.Categorical(COVID_cell_type)
COVID_cell_type = COVID_cell_type.rename_categories({
    'CD16-positive, CD56-dim natural killer cell, human': '1',
    'mucosal invariant T cell': '2', 
    'effector memory CD8-positive, alpha-beta T cell': '3',
    'platelet': '4',
    'naive B cell': '5', 
    'T-helper 22 cell': '6',
    'gamma-delta T cell': '7',
    'CD16-negative, CD56-bright natural killer cell, human': '8',
    'effector CD8-positive, alpha-beta T cell': '9',
    'naive thymus-derived CD4-positive, alpha-beta T cell': '10',
    'CD14-positive monocyte': '11',
    'central memory CD4-positive, alpha-beta T cell': '12',
    'naive thymus-derived CD8-positive, alpha-beta T cell': '13',
    'mature NK T cell': '14'
})
    
    

# print(COVID_cell_type)

In [28]:
COVID_cell_type = pd.Series(COVID_cell_type)
COVID_cell_type_df = COVID_cell_type.to_frame(name='Cell_Type_COVID')
# print(COVID_cell_type_df)

# Background (Control dataset)

In [29]:
expression_control = cleaned_ctrl.X
annotations_control = cleaned_ctrl.obs   # for observations (e.g., cells)
var_annotations_control = cleaned_ctrl.var   # for variables (e.g., genes)

In [30]:
expression_control_array = expression_control.toarray()
expression_control_df = pd.DataFrame(expression_control_array)

background (Control) has 41 cell types

In [31]:
cell_type_control = annotations_control['cell_type']
cell_type_control_unique = set(cell_type_control)
cell_type_control_total = cell_type_control.value_counts()

### Get the Background X (Control) 

In [32]:
annotations_control_df = pd.DataFrame(annotations_control)

In [33]:
# reset the index to get numeric indices
annotations_control_df.reset_index(drop=True, inplace=True)
filtered_annotations_control = annotations_control_df[annotations_control_df['cell_type'].isin(intersection_cell_type.values)]

check the indices for cell_type less than 250 (to ensure the filtered_control_indices remove these indices)

In [34]:
annotations_control_df[annotations_control_df['cell_type'] == 'plasmablast'].index

Index([ 1239,  2216,  2570,  6617,  9012,  9048,  9651,  9657, 10096, 10164,
       10657, 11574, 12442, 13396, 14210],
      dtype='int64')

In [35]:
filtered_control_indices = filtered_annotations_control.index.to_list()
# print(len(filtered_control_indices))

In [36]:
filter_expression_control_df = expression_control_df.iloc[filtered_control_indices]
# print(filter_expression_control_df.shape)

### Get the background Y

In [37]:
# pd.set_option('display.max_rows', None)
filtered_cell_type_control = filtered_annotations_control['cell_type'] 
unique_filtered_cell_type_control = set(filtered_cell_type_control)

In [38]:
control_cell_type = filtered_annotations_control['cell_type']
control_cell_type = pd.Categorical(control_cell_type)
control_cell_type = control_cell_type.rename_categories({
    'CD16-positive, CD56-dim natural killer cell, human': '1',
    'mucosal invariant T cell': '2', 
    'effector memory CD8-positive, alpha-beta T cell': '3',
    'platelet': '4',
    'naive B cell': '5', 
    'T-helper 22 cell': '6',
    'gamma-delta T cell': '7',
    'CD16-negative, CD56-bright natural killer cell, human': '8',
    'effector CD8-positive, alpha-beta T cell': '9',
    'naive thymus-derived CD4-positive, alpha-beta T cell': '10',
    'CD14-positive monocyte': '11',
    'central memory CD4-positive, alpha-beta T cell': '12',
    'naive thymus-derived CD8-positive, alpha-beta T cell': '13',
    'mature NK T cell': '14'
})
    

# print(control_cell_type)

In [39]:
control_cell_type = pd.Series(control_cell_type)
control_cell_type_df = control_cell_type.to_frame(name='Cell_Type_Control')

# print(control_cell_type_df)

# Feature Preprocessing (Genes)

In [40]:
fg = filtered_expression_covid_df
Y = COVID_cell_type_df

bg = filter_expression_control_df
Yt = control_cell_type_df


print(fg.shape)
print(Y.shape)
print(bg.shape)
print(Yt.shape)

(40411, 24727)
(40411, 1)
(13041, 24727)
(13041, 1)


In [41]:
combined_fg_bg = np.vstack((fg, bg))

In [42]:
# select the top p highly variable genes

p = 500     # p can vary from 100 to 500
col_var = np.var(combined_fg_bg, axis=0)
col_var_sorted_idx = np.argsort(-col_var)

fg = fg.iloc[:, col_var_sorted_idx[:p]]
bg = bg.iloc[:, col_var_sorted_idx[:p]]

In [43]:
print(fg.shape)
print(bg.shape)
print(Y.shape)
print(Yt.shape)

(40411, 500)
(13041, 500)
(40411, 1)
(13041, 1)


In [38]:
# fg.to_csv("covid_preprocessed_fg.csv")
# bg.to_csv("covid_preprocessed_bg.csv")
# Y.to_csv("covid_preprocessed_Y.csv")
# Yt.to_csv("covid_preprocessed_Yt.csv")