# GSE180286 OBS Notebook

## Table of Contents
1. [Introduction](#introduction)
2. [Imports](#imports)
3. [Path for Assets](#path-for-assets)
4. [Data Loading](#data-loading)
5. [OBS File Generation](#obs-file-generation)

## Introduction

This notebook is the start of GSE180286 validation. It is designed to load and process single-cell RNA sequencing data from the GSE180286 dataset. The goal is to prepare the data for further analysis and validation.

## Imports

In [64]:
# Necessary imports for the notebook. Please ensure these libraries are installed in your Python environment, 
# if not then please install them using the requirements.txt file.
import glob
import pandas as pd
import anndata as ad
import scanpy as sc
from gtfparse import read_gtf
from scipy.sparse import issparse, csr_matrix
import gc
import numpy as np
from collections import defaultdict

## Path for Assets

In [65]:
# This is the relative path of the assets directory, if the assets are in another location please change the path accordingly.
assets = "../assets"

In [66]:
# Output of this cell is truncated to avoid displaying too many files.
# This will list all the files in the GSE180286 directory.
gse18_files = glob.glob(f"{assets}/GSE180286/*")
gsm_files = sorted([f for f in gse18_files if f.endswith(".txt")])
gsm_file_names = sorted(set(f.split('\\')[-1].split('_')[0] for f in gsm_files if f.endswith("_matrix.txt")))
# gsm_files, gsm_file_names

In [67]:
def load_adata(gsm_files=gsm_files, gsm_file_names=gsm_file_names):
    """    Load GSM files into a list of AnnData objects.
    Args:
        gsm_files (list): List of GSM file names to load.
        gsm_file_names (list): List of GSM file names without extensions or the path.
    Returns:
        list: List of AnnData objects loaded from the provided files.
    """    
    list_data = []
    for gsm_file in gsm_files:
        path = f"{gsm_file}"
        print(f"Loading: {path}")
        
        data = sc.read_csv(path, delimiter="\t").T
        
        # Convert to sparse
        if not issparse(data.X):
            data.X = csr_matrix(data.X)
        
        print(f"Successfully loaded {gsm_file} with shape {data.shape}")
        list_data.append(data)
    adata = ad.concat(list_data, join="outer", label="sample", keys=gsm_file_names)
    return adata

In [68]:
# This block takes a really long time to run, so please be patient.
# If you are running this locally on a machine please check if you have enough resources available.
# If not then please run this on a cloud platform like Google Colab or Kaggle.
adata = load_adata(gsm_files=gsm_files, gsm_file_names=gsm_file_names)

Loading: ../assets/GSE180286\GSM5457199_A2019-1.expression_matrix.txt
Successfully loaded ../assets/GSE180286\GSM5457199_A2019-1.expression_matrix.txt with shape (3267, 25540)
Loading: ../assets/GSE180286\GSM5457200_A2019-2.expression_matrix.txt
Successfully loaded ../assets/GSE180286\GSM5457200_A2019-2.expression_matrix.txt with shape (8607, 25501)
Loading: ../assets/GSE180286\GSM5457201_A2019-3.expression_matrix.txt
Successfully loaded ../assets/GSE180286\GSM5457201_A2019-3.expression_matrix.txt with shape (16661, 27060)
Loading: ../assets/GSE180286\GSM5457202_B2019-1.expression_matrix.txt
Successfully loaded ../assets/GSE180286\GSM5457202_B2019-1.expression_matrix.txt with shape (11467, 32148)
Loading: ../assets/GSE180286\GSM5457203_B2019-2.expression_matrix.txt
Successfully loaded ../assets/GSE180286\GSM5457203_B2019-2.expression_matrix.txt with shape (11356, 29607)
Loading: ../assets/GSE180286\GSM5457204_B2019-3.expression_matrix.txt
Successfully loaded ../assets/GSE180286\GSM5457

  utils.warn_names_duplicates("obs")


In [69]:
adata.write(f"{assets}/GSE180286/GSE180286_adata.h5ad")

In [6]:
adata.obs

Unnamed: 0,sample
GGATAAGGGTCA,GSM5457199
CCGTGCGTACTG,GSM5457199
AGGTAACCTACG,GSM5457199
CTGTATAACCTA,GSM5457199
AAACAGGTTTGA,GSM5457199
...,...
CTCCAACCAATG,GSM5457213
CTCCTGAAGCAC,GSM5457213
GTACCAGCGGCA,GSM5457213
TACCTCCTAAAG,GSM5457213


In [7]:
adata.obs['sample'].value_counts()

sample
GSM5457201    16661
GSM5457202    11467
GSM5457203    11356
GSM5457204    11323
GSM5457200     8607
GSM5457207     8517
GSM5457210     8228
GSM5457211     7521
GSM5457213     6398
GSM5457212     5673
GSM5457209     5298
GSM5457206     4940
GSM5457205     4161
GSM5457208     4064
GSM5457199     3267
Name: count, dtype: int64

In [8]:
# Common mitochondrial Ensembl gene ID prefixes: ENSG00000198888 → MT-ND1, etc.
mt_gene_names = [
    "MT-ND1", "MT-ND2", "MT-ND3", "MT-ND4", "MT-ND4L", "MT-ND5", "MT-ND6",
    "MT-CO1", "MT-CO2", "MT-CO3",
    "MT-ATP6", "MT-ATP8",
    "MT-CYB", "MT-RNR1", "MT-RNR2", "MT-TP", "MT-TL1", "MT-TL2"
]

# Check if these are present
mt_present = [gene for gene in mt_gene_names if gene in adata.var_names]
print("MT genes found:", mt_present)

MT genes found: ['MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5', 'MT-ND6', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-ATP6', 'MT-ATP8', 'MT-CYB', 'MT-RNR1', 'MT-RNR2', 'MT-TP', 'MT-TL1', 'MT-TL2']


In [9]:
# Load the GTF file
gtf = read_gtf(f"""{assets}/Gencode/gencode.v44.annotation.gtf""").to_pandas()

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'tag', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'hgnc_id', 'havana_gene', 'ont', 'protein_id', 'ccdsid', 'artif_dupl']


In [None]:
# Extract gene_id and gene_name
genes = gtf[gtf["feature"] == "gene"][["gene_id", "gene_name"]].drop_duplicates()

# Strip version from Ensembl IDs
genes["gene_id_clean"] = genes["gene_id"].str.replace(r"\..*", "", regex=True)

# Create mapping
ens_to_symbol = dict(zip(genes["gene_id_clean"], genes["gene_name"]))

In [11]:
# Map gene symbols in adata_all.var
adata.var["ensembl_id"] = adata.var_names.str.replace(r"\..*", "", regex=True)
adata.var["gene_name"] = adata.var["ensembl_id"].map(ens_to_symbol)

# Fill missing symbols with Ensembl ID, then set .var_names
# Fill NaNs and ensure all entries are strings
adata.var["gene_name_clean"] = (
    adata.var["gene_name"].fillna(adata.var["ensembl_id"])
).astype(str)

# Set .var_names safely
adata.var_names = adata.var["gene_name_clean"]
adata.var_names_make_unique()

In [12]:
# Check first few gene names
adata.var_names[:10].tolist()

['A1BG',
 'A1BG-AS1',
 'A1CF',
 'A2M',
 'A2M-AS1',
 'A2ML1',
 'A2ML1-AS1',
 'A2ML1-AS2',
 'A2MP1',
 'A3GALT2']

In [13]:
# Identify mitochondrial genes by prefix
adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')

# Recalculate QC metrics with updated mitochondrial flags
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)

# Assign percent.mito (to match GSE176078 column naming)
adata.obs['percent.mito'] = adata.obs['pct_counts_mt']

In [16]:
# Confirm if any MT- genes exist and their total expression
mt_genes = adata.var_names[adata.var_names.str.upper().str.startswith("MT-")]
print("🔍 MT-genes found:", mt_genes.tolist())

# Check summed expression of mitochondrial genes
mt_sum = adata[:, mt_genes].X.sum()
total_sum = adata.X.sum()
print(f"MT-total expression: {mt_sum}")
print(f"Total expression: {total_sum}")

🔍 MT-genes found: ['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB', 'MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5', 'MT-ND6', 'MT-RNR1', 'MT-RNR2', 'MT-TA', 'MT-TC', 'MT-TD', 'MT-TE', 'MT-TF', 'MT-TG', 'MT-TH', 'MT-TI', 'MT-TK', 'MT-TL1', 'MT-TL2', 'MT-TM', 'MT-TN', 'MT-TP', 'MT-TQ', 'MT-TR', 'MT-TS1', 'MT-TS2', 'MT-TT', 'MT-TV', 'MT-TW', 'MT-TY']
MT-total expression: 21227332.0
Total expression: 207242224.0


In [17]:
adata.obs['percent.mito']


GGATAAGGGTCA    20.738459
CCGTGCGTACTG    18.740520
AGGTAACCTACG    18.443533
CTGTATAACCTA    17.642475
AAACAGGTTTGA    20.140953
                  ...    
CTCCAACCAATG    12.580645
CTCCTGAAGCAC     5.263158
GTACCAGCGGCA     6.643356
TACCTCCTAAAG    30.769232
TGGTTTGTAGGG    18.345324
Name: percent.mito, Length: 117481, dtype: float32

In [19]:
# Parse GTF and extract gene features
genes = gtf[gtf["feature"] == "gene"]
gene_pos = genes[["gene_name", "seqname", "start", "end"]].drop_duplicates()
gene_pos.columns = ["gene", "chromosome", "start", "end"]

# Filter gene positions to only those in adata
adata.var["gene"] = adata.var_names
gene_pos_filtered = gene_pos[gene_pos["gene"].isin(adata.var["gene"])].drop_duplicates(subset="gene")
gene_pos_filtered = gene_pos_filtered.set_index("gene")

# Remove any existing coordinate columns that would conflict
adata.var = adata.var.drop(columns=["chromosome", "start", "end"], errors="ignore")

# Join and reindex
adata.var = adata.var.join(gene_pos_filtered, on="gene")
adata.var = adata.var.reindex(adata.var_names)

print("Gene coordinate columns added to adata.var:")
adata.var[["chromosome", "start", "end"]].head()

Gene coordinate columns added to adata.var:


Unnamed: 0_level_0,chromosome,start,end
gene_name_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A1BG,chr19,58345178.0,58353492.0
A1BG-AS1,chr19,58347718.0,58355455.0
A1CF,chr10,50799409.0,50885675.0
A2M,chr12,9067664.0,9116229.0
A2M-AS1,chr12,9065163.0,9068689.0


In [20]:
adata.obs["sample"].value_counts()
adata.obs["cnv_reference"] = "tumor"
adata.obs.loc[adata.obs["sample"] == "GSM5457199", "cnv_reference"] = "normal"

In [24]:
adata.obs["cnv_reference"].value_counts()

cnv_reference
tumor     114214
normal      3267
Name: count, dtype: int64

In [25]:
adata.obs["percent.mito"].value_counts()

percent.mito
9.090909     145
7.692308     142
5.882353     130
8.333334     128
5.555556     127
            ... 
8.398269       1
8.361774       1
3.549246       1
7.844905       1
18.345324      1
Name: count, Length: 65191, dtype: int64

In [26]:
# Clean chromosome names
adata.var["chromosome"] = (
    adata.var["chromosome"].astype(str)
    .str.replace("chr", "", regex=False)
    .str.upper()
)

# Show sample values
print("Sample chromosome values:", adata.var["chromosome"].unique().tolist()[:10])

# Keep canonical chromosomes only
valid_chroms = [str(i) for i in range(1, 23)] + ["X", "Y"]
mask = adata.var["chromosome"].isin(valid_chroms)
print(f"Genes with valid chromosome info: {mask.sum()} / {adata.shape[1]}")

# Subset the full object to valid chromosomes only
adata = adata[:, mask].copy()

Sample chromosome values: ['19', '10', '12', '1', '22', '3', '5', '4', '15', '2']
Genes with valid chromosome info: 28623 / 28623


  utils.warn_names_duplicates("obs")


In [28]:
# Clean chromosome info
adata.var["chromosome"] = (
    adata.var["chromosome"].astype(str)
    .str.replace("chr", "")
    .str.upper()
)

# Keep only genes with chromosome info
valid_mask = adata.var["chromosome"].notna()
valid_genes = adata.var[valid_mask]

# Convert valid_mask to a NumPy array and get indices
valid_idx = np.where(valid_mask.values)[0]

# Extract matrix and chromosome info
X_cnv = adata.X[:, valid_idx]  # Use .X, not "X_cnv"
chroms = valid_genes["chromosome"].values

# Group gene indices by chromosome
chrom_to_idx = defaultdict(list)
for idx, chrom in enumerate(chroms):
    chrom_to_idx[chrom].append(idx)

# Create dataframe for CNV signal
cnv_chr_df = pd.DataFrame(index=adata.obs_names)

# Compute average signal per chromosome
for chrom, indices in chrom_to_idx.items():
    X_chr = X_cnv[:, indices]
    mean_signal = np.asarray(X_chr.mean(axis=1)).ravel()
    cnv_chr_df[chrom] = mean_signal

print("Chromosome-wise CNV matrix shape:", cnv_chr_df.shape)

Chromosome-wise CNV matrix shape: (117481, 24)


In [29]:
# Compute CNV score as mean absolute deviation across chromosomes
cnv_score = cnv_chr_df.abs().mean(axis=1)

# Store in AnnData object
adata.obs["cnv_score"] = cnv_score

# Check result
print("Sample CNV scores:")
adata.obs[["cnv_score", "cnv_reference"]].head()

Sample CNV scores:


Unnamed: 0,cnv_score,cnv_reference
GGATAAGGGTCA,1.573313,normal
CCGTGCGTACTG,1.022685,normal
AGGTAACCTACG,0.775885,normal
CTGTATAACCTA,0.670172,normal
AAACAGGTTTGA,0.591936,normal


In [31]:
adata.obs['nCount_RNA'] = adata.X.sum(axis=1).A1 if hasattr(adata.X, 'A1') else adata.X.sum(axis=1)
mito_genes = adata.var_names.str.startswith('MT-')

In [43]:
ribo_genes = adata.var_names.str.upper().str.startswith(('RPS', 'RPL'))
# adata.obs['pct_counts_ribo'] = (adata[:, ribo_genes].X.sum(axis=1).A1 if hasattr(adata.X, 'A1') 
#                                 else adata[:, ribo_genes].X.sum(axis=1)) / adata.obs['nCount_RNA'] * 100

In [44]:
# Identify ribosomal genes
ribo_genes = adata.var_names.str.upper().str.startswith(('RPS', 'RPL'))

# Calculate ribosomal counts per cell
ribo_counts = adata[:, ribo_genes].X.sum(axis=1)
if hasattr(ribo_counts, 'A1'):
    ribo_counts = ribo_counts.A1
else:
    ribo_counts = np.array(ribo_counts).ravel()

# Calculate percent ribosomal counts per cell
adata.obs['pct_counts_ribo'] = ribo_counts / adata.obs['nCount_RNA'] * 100

In [None]:
# adata.obs['percent.mito'] = (adata[:, mito_genes].X.sum(axis=1).A1 if hasattr(adata.X, 'A1') 
#                               else adata[:, mito_genes].X.sum(axis=1)) / adata.obs['nCount_RNA'] * 100

In [54]:
# Calculate total counts per cell
if hasattr(adata.X, 'A1'):
    adata.obs['nCount_RNA'] = adata.X.sum(axis=1).A1
else:
    adata.obs['nCount_RNA'] = np.array(adata.X.sum(axis=1)).ravel()

# Identify mitochondrial genes
mito_genes = adata.var_names.str.upper().str.startswith('MT-')

# Calculate percent mitochondrial counts per cell
mito_counts = adata[:, mito_genes].X.sum(axis=1)
if hasattr(mito_counts, 'A1'):
    mito_counts = mito_counts.A1
else:
    mito_counts = np.array(mito_counts).ravel()

adata.obs['percent.mito'] = mito_counts / adata.obs['nCount_RNA'] * 100

In [55]:
adata.obs.head()

Unnamed: 0,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,total_counts_mt,...,cnv_reference,cnv_score,nCount_RNA,S_score,G2M_score,phase,oxphos_score,apoptosis_score,proto_oncogenescore,pct_counts_ribo
GGATAAGGGTCA,GSM5457199,7171,8.87794,62319.0,11.040038,34.774306,43.134518,51.916751,63.938767,12924.0,...,normal,1.573313,48061.0,-1.03876,-0.072789,G1,25.207519,-0.458551,25.207519,18.69083
CCGTGCGTACTG,GSM5457199,5783,8.662851,39556.0,10.585498,32.920417,41.404591,51.190717,64.6577,7413.0,...,normal,1.022685,31105.0,-0.118217,1.279655,G2M,19.120603,-0.774244,19.120603,17.897444
AGGTAACCTACG,GSM5457199,5261,8.568266,29734.0,10.30008,31.680904,39.241273,48.627833,62.558014,5484.0,...,normal,0.775885,23562.0,-0.062016,1.626887,G2M,11.705379,-0.725756,11.705379,14.043799
CTGTATAACCTA,GSM5457199,4917,8.500657,25671.0,10.153156,33.34502,41.681275,50.558997,63.9866,4529.0,...,normal,0.670172,20544.0,-0.310078,-0.115385,G1,7.692909,1.00391,7.692909,18.589369
AAACAGGTTTGA,GSM5457199,4784,8.473241,23554.0,10.067094,36.201919,44.293963,52.797826,65.513289,4744.0,...,normal,0.591936,18180.0,0.176357,0.726456,G2M,7.310999,0.679093,7.310999,15.891088


In [56]:
s_genes = [
    'MCM5', 'PCNA', 'TYMS', 'FEN1', 'MCM2', 'MCM4', 'RRM1', 'UNG', 'GINS2',
    'MCM6', 'CDCA7', 'DTL', 'PRIM1', 'UHRF1', 'MCM10', 'HELLS', 'RFC2', 'RPA2',
    'NASP', 'RAD51AP1', 'GMNN', 'WDR76', 'SLBP', 'CCNE2', 'UBR7', 'POLD3',
    'MSH2', 'ATAD2', 'RAD51', 'RRM2', 'CDC45', 'CDC6', 'EXO1', 'TIPIN', 'DSCC1',
    'BLM', 'CASP8AP2', 'USP1', 'CLSPN', 'POLA1', 'CHAF1B', 'BRIP1', 'E2F8'
]

g2m_genes = [
    'HMGB2', 'CDK1', 'NUSAP1', 'UBE2C', 'BIRC5', 'TPX2', 'TOP2A', 'NDC80',
    'CKS2', 'NUF2', 'CKS1B', 'MKI67', 'TMPO', 'CENPF', 'TACC3', 'FAM64A',
    'SMC4', 'CCNB2', 'CKAP2L', 'CKAP2', 'AURKB', 'BUB1', 'KIF11', 'ANP32E',
    'TUBB4B', 'GTSE1', 'KIF20B', 'HJURP', 'CDC20', 'TTK', 'CDC25C', 'KIF2C',
    'RANGAP1', 'NCAPD2', 'DLGAP5', 'CDCA3', 'HN1', 'CDC45', 'CDCA8', 'ECT2',
    'KIF23', 'HMMR', 'AURKA', 'PSRC1', 'ANLN', 'LBR', 'CKAP5', 'CENPE',
    'CTCF', 'NEK2', 'G2E3', 'GAS2L3', 'CBX5', 'CENPA'
]

oxphos_genes = [
    "ATP5F1A", "ATP5F1B", "ATP5MC1", "ATP5MC2", "ATP5ME", "ATP5MG",
    "COX4I1", "COX5A", "COX6A1", "COX6C", "NDUFA1", "NDUFA2", "NDUFA4",
    "NDUFAB1", "NDUFB2", "NDUFB3", "NDUFS1", "NDUFS2", "NDUFV1", "UQCRC1",
    "UQCRC2", "UQCRH", "SDHA", "SDHB", "SDHC", "SDHD", "CYCS"
]

apoptosis_genes = [
    "BAX", "BAK1", "BCL2", "BCL2L1", "BCL2L11", "CASP3", "CASP6", "CASP7",
    "CASP8", "CASP9", "TP53", "FAS", "FASLG", "TNFRSF10A", "TNFRSF10B",
    "TNFRSF1A", "TNF", "AIFM1", "APAF1", "BAD", "BID", "CFLAR", "DIABLO",
    "MCL1", "NFKB1", "NFKBIA", "TRADD", "XIAP"
]

proto_oncogenes = [
    'MYC', 'KRAS', 'EGFR', 'BRAF', 'AKT1', 'PIK3CA', 'CCND1', 'ERBB2', 'FGFR1', 'MDM2'
]

In [57]:
# Filter genes present in your dataset
s_genes_present = [g for g in s_genes if g in adata.var_names]
g2m_genes_present = [g for g in g2m_genes if g in adata.var_names]

print(f"S phase genes found: {len(s_genes_present)}")
print(f"G2M phase genes found: {len(g2m_genes_present)}")

# Score cell cycle
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes_present, g2m_genes=g2m_genes_present)

# Confirm it worked
adata.obs.head()

S phase genes found: 43
G2M phase genes found: 52


Unnamed: 0,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,total_counts_mt,...,cnv_reference,cnv_score,nCount_RNA,S_score,G2M_score,phase,oxphos_score,apoptosis_score,proto_oncogenescore,pct_counts_ribo
GGATAAGGGTCA,GSM5457199,7171,8.87794,62319.0,11.040038,34.774306,43.134518,51.916751,63.938767,12924.0,...,normal,1.573313,48061.0,-1.03876,-0.072789,G1,25.207519,-0.458551,25.207519,18.69083
CCGTGCGTACTG,GSM5457199,5783,8.662851,39556.0,10.585498,32.920417,41.404591,51.190717,64.6577,7413.0,...,normal,1.022685,31105.0,-0.118217,1.279655,G2M,19.120603,-0.774244,19.120603,17.897444
AGGTAACCTACG,GSM5457199,5261,8.568266,29734.0,10.30008,31.680904,39.241273,48.627833,62.558014,5484.0,...,normal,0.775885,23562.0,-0.062016,1.626887,G2M,11.705379,-0.725756,11.705379,14.043799
CTGTATAACCTA,GSM5457199,4917,8.500657,25671.0,10.153156,33.34502,41.681275,50.558997,63.9866,4529.0,...,normal,0.670172,20544.0,-0.310078,-0.115385,G1,7.692909,1.00391,7.692909,18.589369
AAACAGGTTTGA,GSM5457199,4784,8.473241,23554.0,10.067094,36.201919,44.293963,52.797826,65.513289,4744.0,...,normal,0.591936,18180.0,0.176357,0.726456,G2M,7.310999,0.679093,7.310999,15.891088


In [58]:
# Filter to genes present in dataset
oxphos_genes_present = [g for g in oxphos_genes if g in adata.var_names]

# Compute OxPhos score

sc.tl.score_genes(adata, gene_list=oxphos_genes_present, score_name="oxphos_score")

# Check output
print(f"OxPhos genes used: {len(oxphos_genes_present)}")
adata.obs["oxphos_score"].describe()

OxPhos genes used: 27


count    117481.000000
mean          0.257516
std           0.902100
min          -1.720826
25%          -0.015261
50%           0.050810
75%           0.199516
max          55.930207
Name: oxphos_score, dtype: float64

In [59]:
# Apoptosis genes
apoptosis_genes_present = [g for g in apoptosis_genes if g in adata.var_names]
print(f"Apoptosis genes found in dataset: {len(apoptosis_genes_present)}")
sc.tl.score_genes(adata, gene_list=apoptosis_genes_present, score_name="apoptosis_score")
adata.obs["apoptosis_score"]

Apoptosis genes found in dataset: 28


GGATAAGGGTCA   -0.458551
CCGTGCGTACTG   -0.774244
AGGTAACCTACG   -0.725756
CTGTATAACCTA    1.003910
AAACAGGTTTGA    0.679093
                  ...   
CTCCAACCAATG   -0.012774
CTCCTGAAGCAC   -0.000782
GTACCAGCGGCA   -0.020073
TACCTCCTAAAG   -0.007299
TGGTTTGTAGGG    0.022941
Name: apoptosis_score, Length: 117481, dtype: float64

In [60]:
# Score Proto-oncogene activity
# Filter to genes in adata
valid_protooncogenes = [g for g in proto_oncogenes if g in adata.var_names]
sc.tl.score_genes(adata, gene_list=oxphos_genes_present, score_name="proto_oncogenescore")
print(f"Found {len(valid_protooncogenes)} Proto-oncogenes in dataset: {valid_protooncogenes}")

✅ Found 10 Proto-oncogenes in dataset: ['MYC', 'KRAS', 'EGFR', 'BRAF', 'AKT1', 'PIK3CA', 'CCND1', 'ERBB2', 'FGFR1', 'MDM2']


In [61]:
adata.obs.head()

Unnamed: 0,sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,total_counts_mt,...,cnv_reference,cnv_score,nCount_RNA,S_score,G2M_score,phase,oxphos_score,apoptosis_score,proto_oncogenescore,pct_counts_ribo
GGATAAGGGTCA,GSM5457199,7171,8.87794,62319.0,11.040038,34.774306,43.134518,51.916751,63.938767,12924.0,...,normal,1.573313,48061.0,-1.03876,-0.072789,G1,25.207519,-0.458551,25.207519,18.69083
CCGTGCGTACTG,GSM5457199,5783,8.662851,39556.0,10.585498,32.920417,41.404591,51.190717,64.6577,7413.0,...,normal,1.022685,31105.0,-0.118217,1.279655,G2M,19.120603,-0.774244,19.120603,17.897444
AGGTAACCTACG,GSM5457199,5261,8.568266,29734.0,10.30008,31.680904,39.241273,48.627833,62.558014,5484.0,...,normal,0.775885,23562.0,-0.062016,1.626887,G2M,11.705379,-0.725756,11.705379,14.043799
CTGTATAACCTA,GSM5457199,4917,8.500657,25671.0,10.153156,33.34502,41.681275,50.558997,63.9866,4529.0,...,normal,0.670172,20544.0,-0.310078,-0.115385,G1,7.692909,1.00391,7.692909,18.589369
AAACAGGTTTGA,GSM5457199,4784,8.473241,23554.0,10.067094,36.201919,44.293963,52.797826,65.513289,4744.0,...,normal,0.591936,18180.0,0.176357,0.726456,G2M,7.310999,0.679093,7.310999,15.891088


In [62]:
adata.obs.columns.tolist()

['sample',
 'n_genes_by_counts',
 'log1p_n_genes_by_counts',
 'total_counts',
 'log1p_total_counts',
 'pct_counts_in_top_50_genes',
 'pct_counts_in_top_100_genes',
 'pct_counts_in_top_200_genes',
 'pct_counts_in_top_500_genes',
 'total_counts_mt',
 'log1p_total_counts_mt',
 'pct_counts_mt',
 'percent.mito',
 'cnv_reference',
 'cnv_score',
 'nCount_RNA',
 'S_score',
 'G2M_score',
 'phase',
 'oxphos_score',
 'apoptosis_score',
 'proto_oncogenescore',
 'pct_counts_ribo']

## OBS File Generation

In [63]:
# Save the .obs DataFrame to CSV
adata.obs.to_csv(f"{assets}/GSE180286/GSE180286_obs.csv")