In [1]:
import numpy as np
import random
import itertools
from itertools import combinations
import anndata as ad
import os
import pandas as pd
import scanpy as sc
from scipy.sparse import issparse

## Load the geosketch-subsampled scRNA-seq data and the CCIdb gene table

In [2]:
# Directory prefix (with trailing slash) for the input/output .h5ad files.
path = 'data/'

# Expression data; the AnnData repr is displayed as the cell output below.
adata = sc.read(f'{path}CCA_Lung_geosketch.h5ad')

# Gene table; its 'TumorGene' and 'OtherGene' columns are used later in this
# notebook to select which genes go into each expression channel.
gene_list_df = pd.read_csv('DB/CCIdb.csv')

adata

AnnData object with n_obs × n_vars = 10804 × 36601
    obs: 'n_counts', 'n_genes', 'mito', 'Dataset', 'Organ_orig', 'Organ', 'Sample', 'Patient', 'Subtype', 'Tissue_site', 'Tissue', 'CancerType', 'DataOrgTis', 'PtOrgTis', 'SpOrgTis', 'leiden', 'cnv_status', 'cnv_leiden', 'cnv_score', 'cell_type_major', 'cell_type_subset'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'chromosome', 'start', 'end', 'gene_id', 'gene_name'
    uns: 'anno_2212_colors', 'anno_cat_colors', 'cell_type_subset_colors', 'cnv', 'cnv_leiden_colors', 'cnv_neighbors', 'cnv_status_colors', 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_cnv', 'X_cnv_pca', 'X_cnv_umap', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'cnv_neighbors_connectivities', 'cnv_neighbors_distances', 'connectivities', 'distances'

## Random sampling of tumor–other cell pairs

In [3]:
# Folder that receives the sampled cell-pair files and the .npz arrays.
output_folder = "data"
# exist_ok=True makes this idempotent and avoids the check-then-create race of
# a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

In [4]:
# Index the cells by their annotated cell type.
obs_c = 'cell_type_subset'  # obs column holding the cell-type label
all_d = {}  # cell ID -> cell type
ct_d = {}   # cell type -> list of cell IDs, in obs order

for cell_id, cell_type in adata.obs[obs_c].items():
    all_d[cell_id] = cell_type
    # setdefault creates the per-type list the first time a type is seen.
    ct_d.setdefault(cell_type, []).append(cell_id)

In [5]:
# Non-tumor cell types to pair with tumor cells. The position of a type in
# this list is the index used in the "combi-cells_c{idx}.txt" file names.
other_cells = [
    'Tip_Cells',
    'activated_capillary',
    'Immature_Phenotype',
    'capillary_I',
    'capillary_II',
    'Activated_EC',
    'TandNK',
    'Epithelial',
    'Myeloid',
    'Fibro_Peri',
    'B',
]

In [6]:
numb1 = 100  # Tumor cell sampling
numb2 = 100  # Other cell sampling
n_rounds = 500  # Sampling rounds per cell type
sampling_seed = 0  # Fixed seed so the pair files are identical on every run

# Seed inside the cell so that re-running just this cell reproduces the files.
random.seed(sampling_seed)

tumor_pool = ct_d['tumor']
if len(tumor_pool) < numb1:
    raise ValueError(
        f"Cannot sample {numb1} tumor cells: only {len(tumor_pool)} available"
    )

for idx, cell_type in enumerate(other_cells):
    cell_pool = ct_d[cell_type]
    # Check before opening the file so a too-small population does not leave
    # an empty or partial output file behind.
    if len(cell_pool) < numb2:
        raise ValueError(
            f"Cannot sample {numb2} '{cell_type}' cells: only {len(cell_pool)} available"
        )

    filename = os.path.join(output_folder, f"combi-cells_c{idx}.txt")
    with open(filename, "w") as f_out:
        for _ in range(n_rounds):
            # random.sample draws without replacement and already returns the
            # items in random selection order, so no extra shuffle is needed.
            tum_samples = random.sample(tumor_pool, numb1)
            cell_samples = random.sample(cell_pool, numb2)
            # One "tumor_id,other_id" line per pair; zip stops at the shorter
            # sample if numb1 and numb2 differ.
            for tum, cell in zip(tum_samples, cell_samples):
                f_out.write(f"{tum},{cell}\n")

## Preprocess `adata` (QC metrics, normalization, regression, scaling)

In [7]:
# Flag mitochondrial genes (names starting with "MT-") in a boolean "mt" column.
is_mito_gene = adata.var_names.str.startswith("MT-")
adata.var["mt"] = is_mito_gene

# Compute QC metrics in place, with "mt" as an extra QC variable.
# NOTE(review): the obs columns regressed out below are expected to be created
# by this call - confirm against the scanpy version in use.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=["mt"],
    percent_top=None,
    log1p=False,
    inplace=True,
)

# Per-cell covariates whose effect is regressed out of the expression values.
regress_keys = ["total_counts", "pct_counts_mt"]

# Order matters: normalize -> log1p -> regress out -> scale.
# All four steps modify adata in place, so this cell is not idempotent.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.regress_out(adata, regress_keys)
sc.pp.scale(adata, max_value=10)

In [8]:
# Persist the preprocessed AnnData next to the input file.
path = 'data/'
adata.write(f'{path}CCA_Lung_geosketch_prepro.h5ad')

In [9]:
# Build one .npz file of paired expression arrays per combi-cells_c{i}.txt file.
#
# Each saved array has shape (cells in group, n gene pairs, 2): channel 0 holds
# the tumor cell's expression of the 'TumorGene' column, channel 1 the paired
# other cell's expression of the 'OtherGene' column.

# Number of files to process: one per cell type other than tumor. This must
# match the number of combi-cells_c{i}.txt files present in output_folder.
cell_cnt = len(adata.obs.cell_type_subset.value_counts())-1

group_size = 100  # cell pairs per saved array


def _first_positions(names):
    """Map each name to the position of its first occurrence in `names`."""
    positions = {}
    for pos, name in enumerate(names):
        positions.setdefault(name, pos)
    return positions


def _cell_positions(cell_ids, obs_position, source_name):
    """Translate cell IDs into row positions; raise KeyError on unknown IDs."""
    unknown = [cell for cell in cell_ids if cell not in obs_position]
    if unknown:
        raise KeyError(
            f"{len(unknown)} cell IDs from {source_name} are not in adata.obs_names "
            f"(first: {unknown[0]!r})"
        )
    return np.array([obs_position[cell] for cell in cell_ids], dtype=np.intp)


def _dense_block(matrix, row_positions, col_positions):
    """Return matrix[rows][:, cols] as a dense 2-D array.

    Works for both scipy sparse and dense matrices; after regress_out/scale the
    expression matrix may no longer be sparse, so .toarray() cannot be assumed.
    """
    block = matrix[row_positions][:, col_positions]
    if issparse(block):
        block = block.toarray()
    return np.asarray(block)


# Name -> position lookups, built once instead of scanning the index per lookup.
var_position = _first_positions(adata.var_names)
obs_position = _first_positions(adata.obs_names)

# Keep a gene-table row only when BOTH of its genes are present, so the tumor
# and other channels stay aligned row by row and have the same width.
# Assumes each row of gene_list_df is a (TumorGene, OtherGene) pair - the
# channel stacking below requires equal lengths.
gene_pairs = [
    (var_position[tumor_gene], var_position[other_gene])
    for tumor_gene, other_gene in zip(gene_list_df['TumorGene'], gene_list_df['OtherGene'])
    if tumor_gene in var_position and other_gene in var_position
]
tumor_gene_indices = np.array([t for t, _ in gene_pairs], dtype=np.intp)
other_gene_indices = np.array([o for _, o in gene_pairs], dtype=np.intp)

expression_matrix = adata.X

for i in range(cell_cnt):
    # Load combi-cells_cX.txt file (column 0: tumor cell ID, column 1: other cell ID).
    # dtype=str keeps IDs as strings even if they look numeric.
    file_name = f'combi-cells_c{i}.txt'
    combi_cells_df = pd.read_csv(f'{output_folder}/{file_name}', header=None, dtype=str)

    # Resolve every cell ID to its row position once per file.
    # Distinct local names: `other_cells` is the cell-type list used by the
    # sampling cell and must not be overwritten here.
    tumor_rows = _cell_positions(combi_cells_df[0], obs_position, file_name)
    other_rows = _cell_positions(combi_cells_df[1], obs_position, file_name)

    print("Gene index extraction: ", i)
    # Initialize list to store results
    result_arrays = []

    # Split the pairs into groups of `group_size` and build one array per group
    for group_start in range(0, len(combi_cells_df), group_size):
        group = slice(group_start, group_start + group_size)

        # Create expression data arrays, shape (cells in group, n gene pairs)
        expression_data_tumor = _dense_block(expression_matrix, tumor_rows[group], tumor_gene_indices)
        expression_data_other = _dense_block(expression_matrix, other_rows[group], other_gene_indices)

        # Combine two channels and add to results
        combined_expression_data = np.stack((expression_data_tumor, expression_data_other), axis=-1)
        result_arrays.append(combined_expression_data.astype(np.float32))

    print("Data configuration complete: ", i)

    # Save results as .npz file (arrays are stored as arr_0, arr_1, ...)
    np.savez(f'{output_folder}/{file_name.replace(".txt", ".npz")}', *result_arrays)

    print(f"File {file_name} processed and saved as .npz")
