# GSE178318 Data Processing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import seaborn as sns
from load_10X_matrices import load_10X_matrices

## 0. Load Data

In [2]:
directory = '/home/data/PanCanSC/CRC/GEO/GSE178318/'
adata = load_10X_matrices(directory)
print(adata)

Loading GSE178318_
AnnData object with n_obs × n_vars = 140281 × 33694
    obs: 'sample_ID'


In [24]:
adata.obs_names_make_unique

<bound method AnnData.obs_names_make_unique of AnnData object with n_obs × n_vars = 129323 × 33694
    obs: 'sample_ID', 'sample_src', 'patient_ID', 'n_genes', 'n_counts'>

#### Extracting sample src & patient ID from sample ID ????

In [25]:
sampleID = adata.obs['sample_ID']
sampleKey = sampleID.keys()
print(sampleKey[0:5])

adata.obs['sample_src'] = [x.split('_')[2] for x in sampleKey]
adata.obs['patient_ID'] = [x.split('_')[1][-2:] for x in sampleKey]
print(adata.obs['patient_ID'])

Index(['AAACCTGAGAAACCTA_COL07_CRC', 'AAACCTGAGACAATAC_COL07_CRC',
       'AAACCTGAGACGCAAC_COL07_CRC', 'AAACCTGAGCAGATCG_COL07_CRC',
       'AAACCTGAGCTATGCT_COL07_CRC'],
      dtype='object')
AAACCTGAGAAACCTA_COL07_CRC     07
AAACCTGAGACAATAC_COL07_CRC     07
AAACCTGAGACGCAAC_COL07_CRC     07
AAACCTGAGCAGATCG_COL07_CRC     07
AAACCTGAGCTATGCT_COL07_CRC     07
                               ..
TTTGTCAGTTGGGACA_COL18_PBMC    18
TTTGTCATCATGTGGT_COL18_PBMC    18
TTTGTCATCCGCATCT_COL18_PBMC    18
TTTGTCATCGGAATCT_COL18_PBMC    18
TTTGTCATCGGTTAAC_COL18_PBMC    18
Name: patient_ID, Length: 129323, dtype: object


#### Write the GSE178318 AnnData Obj
Save time for future experiments. 

In [15]:
#write to file
adata.write('/home/data/PanCanSC/CRC/GEO/GSE178318/GSE178318.h5ad')

In [17]:
adata = sc.read('/home/data/PanCanSC/CRC/GEO/GSE178318/GSE178318.h5ad')

In [18]:
print ("Dimention of adata: " + str(adata.shape))
print ("columns for adata.obs: " + str(adata.obs.columns))

Dimention of adata: (140281, 33694)
columns for adata.obs: Index(['sample_ID', 'sample_src', 'patient_ID'], dtype='object')


## 1. Basic Filtering

In [19]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

scanpy==1.9.3 anndata==0.9.1 umap==0.5.3 numpy==1.23.5 scipy==1.10.0 pandas==1.5.3 scikit-learn==1.2.1 statsmodels==0.13.5 pynndescent==0.5.10


#### Removing cells expressing <500 || >5000 genes:

In [20]:
# removing cells expressing <500 || >5000 genes
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_cells(adata, max_genes=5000)

filtered out 8499 cells that have less than 500 genes expressed
filtered out 757 cells that have more than 5000 genes expressed


#### Removing cells containing <400 || >25000 UMIs:

In [23]:
# removing cells containing <400 || >25000 UMIs
sc.pp.filter_cells(adata, min_counts = 400)
sc.pp.filter_cells(adata, max_counts = 25000)

(129323, 33694)