#  Data Preparation NSCLCs (donor 1)

This notebook prepares the NSCLC dataset for further analysis.

In [9]:
# loading packages
from scipy.io import mmread
import pandas as pd
import anndata
import matplotlib.pyplot as plt
import numpy as np
import gzip
import shutil
import os
import scanpy as sc

## Loading data

Use `sc.read_10x_mtx` to load the 10x-Genomics-formatted data as an AnnData object.

In [22]:
# directory containing the gzipped 10x Genomics files
# (matrix.mtx.gz, features.tsv.gz, barcodes.tsv.gz)
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere
folder_path = "/home/cog/nlandzaat/rep_learning_sc_foundation/raw/donor_1/sample_feature_bc_matrix"

# function to read gzipped files
def read_gzip(file_path):
    """Return the full contents of a gzip-compressed file as text.

    The file is decompressed in a single read and the resulting bytes are
    decoded as UTF-8.

    Parameters
    ----------
    file_path : str or path-like
        Location of the gzip file to read.

    Returns
    -------
    str
        The decompressed, UTF-8-decoded contents of the file.
    """
    with gzip.open(file_path, mode='rb') as handle:
        raw_bytes = handle.read()
    return str(raw_bytes, 'utf-8')

# locate the three files that make up the 10x Genomics matrix directory
matrix_path = os.path.join(folder_path, "matrix.mtx.gz")
features_path = os.path.join(folder_path, "features.tsv.gz")
barcodes_path = os.path.join(folder_path, "barcodes.tsv.gz")

# fail early with a readable message if the directory is incomplete
missing_files = [
    path
    for path in (matrix_path, features_path, barcodes_path)
    if not os.path.isfile(path)
]
if missing_files:
    raise FileNotFoundError(
        "Missing 10x input file(s): " + ", ".join(missing_files)
    )

# load data using scanpy; sc.read_10x_mtx opens the gzipped files inside
# folder_path itself, so there is no need to decompress them into memory first
adata_donor_1 = sc.read_10x_mtx(folder_path, var_names='gene_symbols')

## Preprocessing

Adjust the data to make it suitable as input for the foundation models. This includes creating a column containing batch information and a layer containing the raw counts. PCA, UMAP, Geneformer and scGPT typically expect cells as rows and genes as columns, which is the orientation AnnData already uses. The AnnData object is then saved to the specified path.

In [23]:
# access barcodes from the obs index
barcodes = adata_donor_1.obs_names.tolist()

# create cell batches in obs; the scalar is broadcast to every row
adata_donor_1.obs["batch"] = "donor 1"

# create a layer named "counts" and store the raw counts in it; copy the
# matrix so that later in-place changes to X (e.g. normalisation or
# log-transformation) cannot alter the stored counts
adata_donor_1.layers["counts"] = adata_donor_1.X.copy()

In [25]:
# define the path where you want to save the AnnData object
save_path = "/home/cog/nlandzaat/rep_learning_sc_foundation/analysis/nlandzaat/2.NSCLC notebooks/adata_donor_1.h5ad"

# make sure the target directory exists before writing
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# save the AnnData object
adata_donor_1.write_h5ad(save_path)

... storing 'batch' as categorical
... storing 'feature_types' as categorical
