# Processing GSE205506: A data collection of CRC patietns treated with anti-PD-1 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import anndata as ad
import load_10X_matrices

In [4]:
data_dir = "/home/data/PanCanSC/CRC/GEO/GSE205506/"

In [2]:

import os as os
import sys as sys
import argparse
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad


def load_10X_matrices(matrix_dir):
    '''
    load_10X_matrices(matrix_dir)
    
    This function will load a fold of 10X matrix files into a single sparse matrix.

    Parameters:
        matrix_dir (str): The directory containing the 10X matrix files.
    
    Returns:
        A single AnnData object containing the concatenated matrices.

    '''

    #check if matrix_dir is a directory
    if not os.path.isdir(matrix_dir):
        print ("Input " + matrix_dir + " is not a directory")
        return None
    
    # Open the matrix directory and get the list of files
    matrix_files = os.listdir(matrix_dir)

    # Loop through the files and concatenate the matrices
    mtx_files = [x for x in matrix_files if '.mtx' in x]
    if len(mtx_files) == 0:
        print ("Input directory " + matrix_dir + " has no mtx files.\n")
        return None
        
    # find out whether files has prefix, use set to remove duplicates
    prefixes = set([x.split('matrix.mtx')[0] for x in mtx_files if 'matrix.mtx' in x]) 

    if len(prefixes) > 0:
        # create a list to hold adatas
        adata_list = []   

        # interate through prefixes and index mtx_files        
        for index, prefix in enumerate(prefixes): 
            print("Loading " + prefix)
            tmp = sc.read_10x_mtx(matrix_dir, prefix = prefix)
            tmp.obs['sample_ID'] = prefix 
            adata_list.append(tmp)

        # concatenate adata_list
        overall_adata = ad.concat(adata_list, join='outer')
        return overall_adata

In [7]:
GSE205506_adata = load_10X_matrices(data_dir)

Loading GSM6213974_Colon_MGI_251_v3_
Loading GSM6213959_Colon_MGI_122_v3_
Loading GSM6213963_Colon_171_v3_
Loading GSM6213966_Colon_182_v2_
Loading GSM6213986_Colon_MGI_301_v3_
Loading GSM6213958_Colon_MGI_121_v3_
Loading GSM6213964_Colon_172_v3_
Loading GSM6213995_Colon_MGI_333_v3_
Loading GSM6213978_Colon_MGI_271_v3_
Loading GSM6213992_Colon_MGI_321_v3_
Loading GSM6213968_Colon_MGI_192_v3_
Loading GSM6213969_Colon_MGI_211_v3_
Loading GSM6213973_Colon_MGI_243_v3_
Loading GSM6213962_Colon_MGI_152_v3_
Loading GSM6213960_Colon_MGI_141_v2_
Loading GSM6213965_Colon_181_v2_
Loading GSM6213983_Colon_MGI_283_v3_
Loading GSM6213957_Colon_MGI_112_v3_
Loading GSM6213976_Colon_MGI_253_v3_
Loading GSM6213971_Colon_MGI_233_v3_
Loading GSM6213982_Colon_MGI_282_v3_
Loading GSM6213961_Colon_MGI_151_v3_
Loading GSM6213990_Colon_MGI_312_v3_
Loading GSM6213994_Colon_MGI_323_v3_
Loading GSM6213981_Colon_MGI_281_v3_
Loading GSM6213975_Colon_MGI_252_v3_
Loading GSM6213987_Colon_MGI_302_v3_
Loading GSM621399

  utils.warn_names_duplicates("obs")


In [13]:
GSE205506_adata.obs


Unnamed: 0,sample_ID
AAACCCAAGAATTGTG-1,GSM6213974_Colon_MGI_251_v3_
AAACCCAAGGTTGGTG-1,GSM6213974_Colon_MGI_251_v3_
AAACCCACAATTTCTC-1,GSM6213974_Colon_MGI_251_v3_
AAACCCAGTCGTGATT-1,GSM6213974_Colon_MGI_251_v3_
AAACCCAGTGACCGAA-1,GSM6213974_Colon_MGI_251_v3_
...,...
TTTGGTTTCTGCAGCG-1,GSM6213991_Colon_MGI_313_v3_
TTTGTTGAGGTGCTTT-1,GSM6213991_Colon_MGI_313_v3_
TTTGTTGAGTAATACG-1,GSM6213991_Colon_MGI_313_v3_
TTTGTTGTCGACCACG-1,GSM6213991_Colon_MGI_313_v3_


### Parse the sample_ID to extract patient id and samples source 
Sample source:
    1 : Pre tumor?
    2 : post tumor?
    3 : post adj normal

In [14]:
sample_id = GSE205506_adata.obs['sample_ID']
sample_id_short = [x.split('_')[3] for x in sample_id]

In [19]:
GSE205506_adata.obs['Sample_source'] = [x[-1] for x in sample_id_short]
GSE205506_adata.obs['Patient_id'] = [x[:-1] for x in sample_id_short]
GSE205506_adata.obs



Unnamed: 0,sample_ID,Sample_source,Patient_id
AAACCCAAGAATTGTG-1,GSM6213974_Colon_MGI_251_v3_,1,25
AAACCCAAGGTTGGTG-1,GSM6213974_Colon_MGI_251_v3_,1,25
AAACCCACAATTTCTC-1,GSM6213974_Colon_MGI_251_v3_,1,25
AAACCCAGTCGTGATT-1,GSM6213974_Colon_MGI_251_v3_,1,25
AAACCCAGTGACCGAA-1,GSM6213974_Colon_MGI_251_v3_,1,25
...,...,...,...
TTTGGTTTCTGCAGCG-1,GSM6213991_Colon_MGI_313_v3_,3,31
TTTGTTGAGGTGCTTT-1,GSM6213991_Colon_MGI_313_v3_,3,31
TTTGTTGAGTAATACG-1,GSM6213991_Colon_MGI_313_v3_,3,31
TTTGTTGTCGACCACG-1,GSM6213991_Colon_MGI_313_v3_,3,31


In [20]:
# save GSE205506_adata to h5ad
GSE205506_adata.write_h5ad(data_dir + "GSE205506.h5ad")

### Chunhui describe what steps should be followed below
1. 

In [6]:
GSE205506_adata = sc.read_h5ad(data_dir + "GSE205506.h5ad") 
GSE205506_adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 324020 × 33538
    obs: 'sample_ID', 'Sample_source', 'Patient_id'