In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.io import mmread
from anndata import AnnData
import anndata as ad

In [None]:
def read_10x_cca(data_path,study):
    """Load one study's UMI counts and annotations into an AnnData object.

    Three files are read from ``data_path``:

    * ``Exp_data_UMIcounts.mtx`` -- Matrix Market count matrix, transposed
      after loading so that cells become rows and genes become columns.
    * ``Cells.csv`` -- per-cell metadata, used as ``.obs``.
    * ``Genes.txt`` -- headerless, single-column list of gene symbols, used
      as ``.var`` (the file is named ``genes.txt`` for study 'Bischoff2021').

    Parameters
    ----------
    data_path : str
        Directory containing the three files. A trailing path separator is
        optional.
    study : str
        Study label; written to the ``study`` column of ``.obs``.

    Returns
    -------
    AnnData
        Counts in ``.X`` (sparse, CSR), cell metadata plus ``study`` in
        ``.obs``, a ``gene_symbols`` column in ``.var``, and gene symbols
        (de-duplicated) as ``var_names``.
    """
    # os.path.join gives the same paths as plain concatenation when data_path
    # ends with a separator, and still works when it does not.
    mtx_file = os.path.join(data_path, 'Exp_data_UMIcounts.mtx')
    cells_file = os.path.join(data_path, "Cells.csv")
    # One study ships its gene list under a lower-case file name.
    genes_name = 'genes.txt' if study == 'Bischoff2021' else 'Genes.txt'
    genes_file = os.path.join(data_path, genes_name)
    print(mtx_file)

    # Transpose to make cells rows and genes columns, then compress by row.
    # Converting to CSR *before* transposing would leave the result in CSC.
    matrix = mmread(mtx_file).transpose().tocsr()

    # Read the cells file and tag every cell with its study of origin.
    cells = pd.read_csv(cells_file)
    cells['study'] = study

    # Read the genes file: no header row, exactly one column expected
    # (the column assignment below raises if the file has more).
    genes = pd.read_csv(genes_file, header=None, sep='\t')
    genes.columns = ['gene_symbols']

    # Create the AnnData object and attach the annotations.
    adata = AnnData(X=matrix)
    adata.obs = cells
    adata.var = genes
    adata.var_names = genes['gene_symbols']
    adata.var_names_make_unique()
    adata.obs_names_make_unique()

    return adata

In [None]:
def concatenate_cca_tissue(data_directories,studies):
    """Load every available study of a tissue and concatenate them.

    A study is loaded with ``read_10x_cca`` only if its directory contains
    ``Exp_data_UMIcounts.mtx``; otherwise it is reported and skipped.

    Parameters
    ----------
    data_directories : sequence of str
        One directory per study.
    studies : sequence of str
        Study labels, parallel to ``data_directories``.

    Returns
    -------
    AnnData
        Result of ``ad.concat`` over the loaded studies, called with its
        default arguments.
        NOTE(review): with anndata's defaults this presumably keeps only the
        genes and obs columns shared by all studies -- confirm against the
        installed anndata version.

    Raises
    ------
    ValueError
        If the two inputs differ in length, or if no study could be loaded
        (nothing to concatenate).
    """
    # The two lists are consumed pairwise; catch a mismatch before any
    # (potentially large) matrix is read.
    if len(data_directories) != len(studies):
        raise ValueError(
            "data_directories and studies must have the same length "
            "(got " + str(len(data_directories)) + " and " + str(len(studies)) + ")"
        )

    adata_list = []
    for data_directory, study in zip(data_directories, studies):
        print(study)
        if not os.path.exists(os.path.join(data_directory, 'Exp_data_UMIcounts.mtx')):
            print("Skipping " + study + " because the file does not exist")
            continue
        adata_list.append(read_10x_cca(data_directory, study))

    # Fail with a clear message instead of handing an empty list to ad.concat.
    if not adata_list:
        raise ValueError(
            "No study directory contains Exp_data_UMIcounts.mtx; nothing to concatenate"
        )

    return ad.concat(adata_list)

In [None]:
def wrangle_cca_data(tissue, data_root="/home/madhughes/cca_data"):
    """Build one concatenated AnnData for all studies of a tissue.

    Study directories are discovered under ``<data_root>/Data_<tissue>/``.
    The study label is the second underscore-separated token of each
    directory name. Plain files and directories whose name contains no
    underscore are skipped. Directories are processed in sorted order so the
    cell order of the result is the same on every run.

    Parameters
    ----------
    tissue : str
        Tissue name as it appears in the ``Data_<tissue>`` directory name.
    data_root : str, optional
        Directory holding the ``Data_<tissue>`` folders. Defaults to the
        location the notebook was originally written for.

    Returns
    -------
    AnnData
        Output of ``concatenate_cca_tissue`` with the ``sample``,
        ``cell_type`` and ``study`` columns of ``.obs`` cast to ``str`` and
        ``complexity`` cast to ``float``.
    """
    tissue_directory = os.path.join(data_root, "Data_" + tissue)

    study_directories = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for entry in sorted(os.listdir(tissue_directory)):
        if not os.path.isdir(os.path.join(tissue_directory, entry)):
            # Stray files (notes, archives, ...) are not study folders.
            continue
        if '_' not in entry:
            # No second token to use as the study label.
            print("Skipping " + entry + " because its name has no study label")
            continue
        study_directories.append(entry)

    studies = [entry.split('_')[1] for entry in study_directories]

    # The trailing "" makes os.path.join end each path with a separator.
    data_directories = [
        os.path.join(tissue_directory, entry, "") for entry in study_directories
    ]

    concat_adata = concatenate_cca_tissue(data_directories,studies)

    # Give the obs columns one consistent dtype each across all studies.
    obs_dtypes = {
        'sample': str,
        'cell_type': str,
        'complexity': float,
        'study': str,
    }
    for column, dtype in obs_dtypes.items():
        concat_adata.obs[column] = concat_adata.obs[column].astype(dtype)

    return concat_adata


In [None]:
# Aggregate all studies of each tissue and write one .h5ad file per tissue.
#
# Tissues from the earlier full list (currently disabled):
#   Prostate, Head and Neck, Kidney, Liver-Biliary, Sarcoma, Ovarian,
#   Neuroendocrine, Breast, Colorectal, Pancreas, Hematologic, Skin
# Lung and Brain were excluded from that list with the note "Fix bug later";
# they are the only tissues processed here.
tissues = ['Lung', 'Brain']

# Destination directory for the aggregated files (must end with '/').
output_data = '/home/madhughes/weizmann_cca/'

for tissue in tissues:
    print(tissue)
    adata = wrangle_cca_data(tissue)
    adata.obs['tissue'] = tissue
    print(adata)
    # Cast cell names to str before writing the file.
    adata.obs['cell_name'] = adata.obs['cell_name'].astype(str)
    adata.write_h5ad(f"{output_data}cca_{tissue}_aggregated.h5ad")
    print(adata)
    print(f"Done writing {tissue}")
