### BMMC Data Preprocessing

The data preprocessing code is adapted from the suinleelab **contrastiveVI** repository (https://github.com/suinleelab/contrastiveVI/blob/main/contrastive_vi/data/datasets/zheng_2017.py) and from the andrewcharlesjones **pcpca** repository (https://github.com/andrewcharlesjones/pcpca/blob/main/data/preprocess_singlecell_data.py).

The function `get_data_df` is adapted from the pcpca experiment preprocessing file:
 https://github.com/andrewcharlesjones/pcpca/blob/main/data/preprocess_singlecell_data.py

The python **anndata** package is found here: https://github.com/scverse/anndata?tab=BSD-3-Clause-1-ov-file

The **scanpy** package is found here: https://github.com/scverse/scanpy

The **requests** package is found here: https://github.com/psf/requests

In [2]:
"""
Download, read, and preprocess Zheng et al. (2017) expression data.

Single-cell expression data from Zheng et al. Massively parallel digital
transcriptional profiling of single cells. Nature Communications (2017).
"""
import os
import shutil
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
from scipy.io import mmread
import requests
import seaborn as sns
from os.path import join as pjoin
from scipy.io import mmread

# Number of highest-dispersion genes kept when the cleaned CSV files are written.
N_GENES = 500


# Placeholder path: replace it with a local folder named "singlecell_bmmc".
# The unpacked sample directories are read from here and the cleaned CSVs are
# written to its "clean" subfolder.
DATA_DIR = ".../singlecell_bmmc"


# The following function (download_binary_file) is from contrastive_vi.data.utils
# (https://github.com/suinleelab/contrastiveVI/blob/main/contrastive_vi/data/utils.py)
def download_binary_file(file_url: str, output_path: str) -> None:
    """
    Download binary data file from a URL.

    The response body is streamed to disk in chunks, so a large archive is
    never held in memory as a whole. A non-success HTTP status raises instead
    of silently saving the server's error page as if it were the data file.

    Args:
    ----
        file_url: URL where the file is hosted.
        output_path: Output path for the downloaded file.

    Returns
    -------
        None.

    Raises
    ------
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # (connect, read) timeouts in seconds: without them a stalled server would
    # block the call indefinitely.
    with requests.get(file_url, stream=True, timeout=(10, 300)) as response:
        # Fail before touching the output file if the download was rejected.
        response.raise_for_status()
        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    print(f"Downloaded data from {file_url} at {output_path}")


def download_zheng_2017(output_path: str) -> None:
    """
    Download Zheng et al. 2017 data from the hosting URLs.

    Each ``*.tar.gz`` archive is saved in ``output_path`` and unpacked into a
    sibling directory named after the archive without its ``.tar.gz`` suffix.

    Args:
    ----
        output_path: Output path to store the downloaded and unzipped
        directories. It is created if it does not exist yet.

    Returns
    -------
        None. File directories are downloaded and unzipped in output_path.
    """
    host = "https://cf.10xgenomics.com/samples/cell-exp/1.1.0/"
    archive_suffix = ".tar.gz"
    host_directories = [
        (
            "aml027_post_transplant/"
            "aml027_post_transplant_filtered_gene_bc_matrices.tar.gz"
        ),
        (
            "aml027_pre_transplant/"
            "aml027_pre_transplant_filtered_gene_bc_matrices.tar.gz"
        ),
        (
            "aml035_post_transplant/"
            "aml035_post_transplant_filtered_gene_bc_matrices.tar.gz"
        ),
        (
            "aml035_pre_transplant/"
            "aml035_pre_transplant_filtered_gene_bc_matrices.tar.gz"
        ),
        (
            "frozen_bmmc_healthy_donor1/"
            "frozen_bmmc_healthy_donor1_filtered_gene_bc_matrices.tar.gz"
        ),
        (
            "frozen_bmmc_healthy_donor2/"
            "frozen_bmmc_healthy_donor2_filtered_gene_bc_matrices.tar.gz"
        ),
    ]

    # Writing the first archive would fail if the target directory is missing.
    # An empty output_path means the current directory, which already exists.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for host_directory in host_directories:
        url = host + host_directory
        output_filename = os.path.join(output_path, url.split("/")[-1])
        download_binary_file(url, output_filename)
        # Strip only the trailing suffix; str.replace would also rewrite a
        # ".tar.gz" that happens to occur inside output_path itself.
        output_dir = output_filename[: -len(archive_suffix)]
        shutil.unpack_archive(output_filename, output_dir)


# The following function (get_data_df) is adapted from the pcpca experiment preprocessing file:
# https://github.com/andrewcharlesjones/pcpca/blob/main/data/preprocess_singlecell_data.py
# The path to the data is modified because the two functions above unpack the archives directly under the root data directory.
def get_data_df(file_directory: str) -> pd.DataFrame:
    """
    Load one unpacked sample directory as a log-transformed expression table.

    Reads ``matrix.mtx``, ``genes.tsv`` and ``barcodes.tsv`` from the
    ``filtered_matrices_mex/hg19`` subfolder, applies ``log(count + 1)``,
    drops barcodes whose transformed values sum to zero, and then drops genes
    whose transformed values sum to zero.

    Args:
    ----
        file_directory: A downloaded and unzipped file directory.

    Returns
    -------
        A data frame of log-transformed expression with one row per cell
        barcode and one column per gene (labels come from the first column of
        barcodes.tsv and genes.tsv respectively).
    """
    matrix_path = os.path.join(file_directory, "filtered_matrices_mex/hg19/matrix.mtx")
    genes_path = os.path.join(file_directory, "filtered_matrices_mex/hg19/genes.tsv")
    barcodes_path = os.path.join(file_directory, "filtered_matrices_mex/hg19/barcodes.tsv")

    # Densify the matrix, then apply the log(x + 1) transform (taken from pcpca).
    raw_counts = mmread(matrix_path).toarray()
    log_counts = np.log(raw_counts + 1)

    gene_table = pd.read_table(genes_path, header=None)
    barcode_table = pd.read_table(barcodes_path, header=None)

    # The matrix is stored genes x barcodes, so genes label the rows here.
    expression = pd.DataFrame(
        log_counts,
        index=gene_table.iloc[:, 0].values,
        columns=barcode_table.iloc[:, 0].values,
    )

    # Drop empty barcodes first; the gene filter then works on what is left.
    nonempty_barcodes = expression.values.sum(axis=0) != 0
    expression = expression.iloc[:, nonempty_barcodes]
    expressed_genes = expression.values.sum(axis=1) != 0
    expression = expression.iloc[expressed_genes, :]

    # Callers expect cells as rows and genes as columns.
    return expression.T

In [3]:
# Placeholder: set this to the local directory where the archives should be
# downloaded and unpacked.
# NOTE(review): the loading cell reads the unpacked sample directories from
# DATA_DIR, so this presumably has to point at the same folder — confirm.
root_data_path = "..."   # local computer data path to download the dataset.

download_zheng_2017(root_data_path)

In [4]:
# Preprocess the single-cell data: read every unpacked sample directory under
# DATA_DIR with get_data_df, in the order listed below.
sample_dirs = (
    "aml027_pre_transplant_filtered_gene_bc_matrices",
    "aml027_post_transplant_filtered_gene_bc_matrices",
    "aml035_pre_transplant_filtered_gene_bc_matrices",
    "aml035_post_transplant_filtered_gene_bc_matrices",
    "frozen_bmmc_healthy_donor1_filtered_gene_bc_matrices",
    "frozen_bmmc_healthy_donor2_filtered_gene_bc_matrices",
)

(
    pretransplant1,
    posttransplant1,
    pretransplant2,
    posttransplant2,
    healthy1,
    healthy2,
) = [get_data_df(pjoin(DATA_DIR, sample_dir)) for sample_dir in sample_dirs]

In [None]:
# The following code is adapted from the pcpca experiment preprocessing file:
# https://github.com/andrewcharlesjones/pcpca/blob/main/data/preprocess_singlecell_data.py

## Subset to shared genes
# Each sample dropped its own all-zero genes, so the column sets differ;
# keep only genes present in every sample (healthy donors included).
shared_genes = pretransplant1.columns.values
for curr_df in [posttransplant1, pretransplant2, posttransplant2, healthy1, healthy2]:
    shared_genes = np.intersect1d(shared_genes, curr_df.columns.values)

## Combine into one dataframe
# Only the four transplant samples are stacked; the healthy donors are left
# out, so gene dispersion below is ranked on the transplant cells alone.
# A single concat avoids re-copying the growing frame on every iteration.
stacked_df = pd.concat(
    [
        curr_df[shared_genes]
        for curr_df in [pretransplant1, posttransplant1, pretransplant2, posttransplant2]
    ],
    axis=0,
)
print(f"Total of {stacked_df.shape[0]} cells and {stacked_df.shape[1]} genes")

## Subset to most variable genes
# Dispersion is the variance-to-mean ratio of each gene across stacked cells.
gene_means = np.mean(stacked_df.values, axis=0)
gene_vars = np.var(stacked_df.values, axis=0)
gene_dispersions = gene_vars / gene_means
# Negate so that argsort yields the largest dispersions first.
top_idx = np.argsort(-gene_dispersions)[:N_GENES]
top_genes = stacked_df.columns.values[top_idx]

print(f"Saving {top_genes.shape[0]} genes")


## Save
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
clean_dir = pjoin(DATA_DIR, "clean")
os.makedirs(clean_dir, exist_ok=True)

samples_to_save = {
    "pretransplant1": pretransplant1,
    "posttransplant1": posttransplant1,
    "pretransplant2": pretransplant2,
    "posttransplant2": posttransplant2,
    "healthy1": healthy1,
    "healthy2": healthy2,
}
for sample_name, sample_df in samples_to_save.items():
    sample_df[top_genes].to_csv(pjoin(clean_dir, sample_name + ".csv"))

Total of 12399 cells and 12079 genes
Saving 500 genes


In [None]:
# the following code is from pcpca experiment single_cell_bmmc.py file: 
# https://github.com/andrewcharlesjones/pcpca/blob/main/experiments/realworld/scrnaseq/single_cell_bmmc.py

import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join as pjoin
from scipy.io import mmread
from sklearn.decomposition import PCA


# Placeholder data directory (same value as in the preprocessing cells); the
# cleaned CSV files are read from its "clean" subfolder below.
DATA_DIR = ".../singlecell_bmmc"
# Not referenced in the visible part of this notebook; presumably the number
# of components for a later dimensionality-reduction fit — TODO confirm.
N_COMPONENTS = 10


if __name__ == "__main__":

    # Load the cleaned per-sample tables; the first CSV column holds the row
    # labels and becomes the index.
    pretransplant2 = pd.read_csv(
        pjoin(DATA_DIR, "clean", "pretransplant2.csv"), index_col=0
    )
    posttransplant2 = pd.read_csv(
        pjoin(DATA_DIR, "clean", "posttransplant2.csv"), index_col=0
    )

    healthy1 = pd.read_csv(pjoin(DATA_DIR, "clean", "healthy1.csv"), index_col=0)

    # Background is made up of healthy cells (only healthy1 is used here).
    # Take an explicit float copy: `.values` may be a view of the frame's
    # data (or read-only under pandas Copy-on-Write), so standardizing it in
    # place could modify `healthy1` or raise.
    Y = healthy1.to_numpy(dtype=float, copy=True)

    # Foreground is the pre-transplant rows followed by the post-transplant
    # rows; the labels below follow the same order.
    X = pd.concat([pretransplant2, posttransplant2], axis=0).to_numpy(
        dtype=float, copy=True
    )
    X_labels = np.array(
        ["Pretransplant"] * pretransplant2.shape[0]
        + ["Posttransplant"] * posttransplant2.shape[0]
    )
    assert X_labels.shape[0] == X.shape[0]

    # Standardize every column to zero mean and unit standard deviation,
    # separately for background and foreground.
    Y = (Y - Y.mean(0)) / Y.std(0)  # background
    X = (X - X.mean(0)) / X.std(0)  # foreground

In [None]:
# Row counts of the foreground (X) and background (Y) matrices; p ends up as
# the column count of Y.
(n, p), (m, p) = X.shape, Y.shape
print(n, m, sep="\n")

4501
1985
