In [None]:
import os
from scipy.io import mmread
import pandas as pd
import scanpy as sc
import numpy as np
import urllib.request
from gtfparse import read_gtf
import infercnvpy as cnv

In [None]:
## This is the beginning of SE176078 data from a single-cell and spatially 
### resolved atlas of human breast cancers.

extract_dir = os.path.expanduser("~/Desktop/Capstone/Raw Data/GSE176078/")

# 1. Load matrix and transpose
X = mmread(os.path.join(extract_dir, "count_matrix_sparse.mtx")).tocsr().T

# 2. Load genes
genes = pd.read_csv(os.path.join(extract_dir, "count_matrix_genes.tsv"), sep="\t", header=None)
genes.columns = ["gene_id"]

# 3. Load barcodes
barcodes = pd.read_csv(os.path.join(extract_dir, "count_matrix_barcodes.tsv"), sep="\t", header=None)
barcodes.columns = ["cell_id"]

# 4. Load metadata
metadata = pd.read_csv(os.path.join(extract_dir, "metadata.csv"))

# 5. Build AnnData object
adata = sc.AnnData(X)

# Assign gene names (columns)
adata.var_names = genes["gene_id"].astype(str).values
adata.var_names_make_unique()

# Assign cell barcodes (rows)
adata.obs_names = barcodes["cell_id"].astype(str).values

# Assign metadata
metadata.index = adata.obs_names
adata.obs = metadata

print(adata)

AnnData object with n_obs × n_vars = 100064 × 29733
    obs: 'Unnamed: 0', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'subtype', 'celltype_subset', 'celltype_minor', 'celltype_major'


In [2]:
extract_dir = "/Users/adi/Desktop/Capstone/Raw data/GSE176078"

# Load count matrix
X = mmread(os.path.join(extract_dir, "count_matrix_sparse.mtx")).T.tocsr()  # Transpose needed

# Load barcodes (cells)
barcodes = pd.read_csv(os.path.join(extract_dir, "count_matrix_barcodes.tsv"), header=None)[0].tolist()

# Load genes
#genes = pd.read_csv(os.path.join(extract_dir, "count_matrix_genes.tsv"), header=None)
#gene_symbols = genes[1].tolist()

genes = pd.read_csv(os.path.join(extract_dir, "count_matrix_genes.tsv"), header=None, sep="\t")
gene_symbols = genes[0].tolist()

# Create AnnData object
adata = sc.AnnData(X=X)
adata.obs_names = barcodes
adata.var_names = gene_symbols

# Load metadata
metadata = pd.read_csv(os.path.join(extract_dir, "metadata.csv"))
adata.obs = metadata.set_index(adata.obs_names)

# Final check
print(adata)
print("🧬 Tumor subtypes:", adata.obs['subtype'].unique())

AnnData object with n_obs × n_vars = 100064 × 29733
    obs: 'Unnamed: 0', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'subtype', 'celltype_subset', 'celltype_minor', 'celltype_major'
🧬 Tumor subtypes: ['HER2+' 'TNBC' 'ER+']


In [None]:
mtx_path = os.path.join(extract_dir, "count_matrix_sparse.mtx")
X = mmread(os.path.join(extract_dir, "count_matrix_sparse.mtx")).tocsr().T

# Load genes
genes = pd.read_csv(os.path.join(extract_dir, "count_matrix_genes.tsv"), sep="\t", header=None)
genes.columns = ["gene_id"]

# Load barcodes
barcodes = pd.read_csv(os.path.join(extract_dir, "count_matrix_barcodes.tsv"), sep="\t", header=None)
barcodes.columns = ["cell_id"]

# Load metadata
metadata = pd.read_csv(os.path.join(extract_dir, "metadata.csv"))

# Create AnnData
adata = sc.AnnData(X)

# Set .var_names using gene IDs
adata.var_names = genes["gene_id"].astype(str).values
adata.var_names_make_unique()

# Set .obs using barcodes and align metadata
adata.obs_names = barcodes["cell_id"].astype(str).values
metadata.index = adata.obs_names
adata.obs = metadata
print(adata.obs.head(2))

                                        Unnamed: 0 orig.ident  nCount_RNA  \
CID3586_AAGACCTCAGCATGAG  CID3586_AAGACCTCAGCATGAG    CID3586        4581   
CID3586_AAGGTTCGTAGTACCT  CID3586_AAGGTTCGTAGTACCT    CID3586        1726   

                          nFeature_RNA  percent.mito subtype  \
CID3586_AAGACCTCAGCATGAG          1689      1.506221   HER2+   
CID3586_AAGGTTCGTAGTACCT           779      5.793743   HER2+   

                            celltype_subset     celltype_minor celltype_major  
CID3586_AAGACCTCAGCATGAG  Endothelial ACKR1  Endothelial ACKR1    Endothelial  
CID3586_AAGGTTCGTAGTACCT  Endothelial ACKR1  Endothelial ACKR1    Endothelial  


In [4]:
print("📋 Metadata columns in adata.obs:")
print(adata.obs.columns.tolist())

# Show first few rows to inspect the actual metadata
print("\n🔍 Sample of adata.obs:")
print(adata.obs.head())

📋 Metadata columns in adata.obs:
['Unnamed: 0', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'subtype', 'celltype_subset', 'celltype_minor', 'celltype_major']

🔍 Sample of adata.obs:
                                        Unnamed: 0 orig.ident  nCount_RNA  \
CID3586_AAGACCTCAGCATGAG  CID3586_AAGACCTCAGCATGAG    CID3586        4581   
CID3586_AAGGTTCGTAGTACCT  CID3586_AAGGTTCGTAGTACCT    CID3586        1726   
CID3586_ACCAGTAGTTGTGGCC  CID3586_ACCAGTAGTTGTGGCC    CID3586        1229   
CID3586_ACCCACTAGATGTCGG  CID3586_ACCCACTAGATGTCGG    CID3586        1352   
CID3586_ACTGATGGTCAACTGT  CID3586_ACTGATGGTCAACTGT    CID3586        1711   

                          nFeature_RNA  percent.mito subtype  \
CID3586_AAGACCTCAGCATGAG          1689      1.506221   HER2+   
CID3586_AAGGTTCGTAGTACCT           779      5.793743   HER2+   
CID3586_ACCAGTAGTTGTGGCC           514      1.383238   HER2+   
CID3586_ACCCACTAGATGTCGG           609      1.923077   HER2+   
CID3586_ACTGATGGTCA

In [None]:
print(adata.obs['celltype_major'].value_counts())
print(adata.obs['celltype_minor'].value_counts())

celltype_major
T-cells              35214
Cancer Epithelial    24489
Myeloid               9675
Endothelial           7605
CAFs                  6573
PVL                   5423
Normal Epithelial     4355
Plasmablasts          3524
B-cells               3206
Name: count, dtype: int64
celltype_minor
T cells CD4+                   19231
T cells CD8+                   11487
Cancer LumA SC                  7742
Macrophage                      5929
Cancer Cycling                  5359
Endothelial ACKR1               4611
Cancer Basal SC                 4312
Cancer Her2 SC                  3708
Plasmablasts                    3524
PVL Differentiated              3487
CAFs myCAF-like                 3420
Cancer LumB SC                  3368
CAFs MSC iCAF-like              3153
B cells Memory                  2581
Monocyte                        2328
Luminal Progenitors             1992
PVL Immature                    1886
NK cells                        1846
Endothelial CXCL12              164

In [7]:
print(adata.obs['subtype'].value_counts())

subtype
TNBC     42512
ER+      38241
HER2+    19311
Name: count, dtype: int64


In [None]:
## Cell cycle genes feature in our datset
# Seurat v3 cell cycle gene sets (subset)
s_genes = [
    'MCM5', 'PCNA', 'TYMS', 'FEN1', 'MCM2', 'MCM4', 'RRM1', 'UNG', 'GINS2',
    'MCM6', 'CDCA7', 'DTL', 'PRIM1', 'UHRF1', 'MCM10', 'HELLS', 'RFC2', 'RPA2',
    'NASP', 'RAD51AP1', 'GMNN', 'WDR76', 'SLBP', 'CCNE2', 'UBR7', 'POLD3',
    'MSH2', 'ATAD2', 'RAD51', 'RRM2', 'CDC45', 'CDC6', 'EXO1', 'TIPIN', 'DSCC1',
    'BLM', 'CASP8AP2', 'USP1', 'CLSPN', 'POLA1', 'CHAF1B', 'BRIP1', 'E2F8'
]

g2m_genes = [
    'HMGB2', 'CDK1', 'NUSAP1', 'UBE2C', 'BIRC5', 'TPX2', 'TOP2A', 'NDC80',
    'CKS2', 'NUF2', 'CKS1B', 'MKI67', 'TMPO', 'CENPF', 'TACC3', 'FAM64A',
    'SMC4', 'CCNB2', 'CKAP2L', 'CKAP2', 'AURKB', 'BUB1', 'KIF11', 'ANP32E',
    'TUBB4B', 'GTSE1', 'KIF20B', 'HJURP', 'CDC20', 'TTK', 'CDC25C', 'KIF2C',
    'RANGAP1', 'NCAPD2', 'DLGAP5', 'CDCA3', 'HN1', 'CDC45', 'CDCA8', 'ECT2',
    'KIF23', 'HMMR', 'AURKA', 'PSRC1', 'ANLN', 'LBR', 'CKAP5', 'CENPE',
    'CTCF', 'NEK2', 'G2E3', 'GAS2L3', 'CBX5', 'CENPA'
]
print(len(s_genes))
print(len(g2m_genes))
# Filter for genes present in your dataset
s_genes = [g for g in s_genes if g in adata.var_names]
g2m_genes = [g for g in g2m_genes if g in adata.var_names]

# Compute cell cycle scores
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

# Output: adata.obs now includes:
# 'S_score', 'G2M_score', 'phase'

43
54


In [9]:
## Apoptosis score feature
# Apoptosis gene list (example set — you can refine this)
apoptosis_genes = [
    'BAX', 'BAK1', 'CASP3', 'CASP8', 'BCL2L11', 'FAS', 'TP53', 'BBC3', 'CYCS'
]
apoptosis_genes = [gene for gene in apoptosis_genes if gene in adata.var_names]

# Compute score
sc.tl.score_genes(adata, gene_list=apoptosis_genes, score_name='apoptosis_score')

In [None]:
## % Ribosomal as a feature

# Identify ribosomal genes (common prefixes: RPS, RPL)
ribo_genes = [gene for gene in adata.var_names if gene.startswith('RPS') or gene.startswith('RPL')]

# Total counts per cell
total_counts = adata.X.sum(axis=1).A1 if isinstance(adata.X, np.matrix) or hasattr(adata.X, 'A1') else adata.X.sum(axis=1)

# Ribosomal counts per cell
ribo_counts = adata[:, ribo_genes].X.sum(axis=1).A1 if hasattr(adata[:, ribo_genes].X, 'A1') else adata[:, ribo_genes].X.sum(axis=1)

# Percent ribosomal
adata.obs['pct_counts_ribo'] = (ribo_counts / total_counts) * 100

In [11]:
## Oxphos genes
oxphos_genes = [
    "ATP5F1A", "ATP5F1B", "ATP5MC1", "ATP5MC2", "ATP5ME", "ATP5MG",
    "COX4I1", "COX5A", "COX6A1", "COX6C", "NDUFA1", "NDUFA2", "NDUFA4",
    "NDUFAB1", "NDUFB2", "NDUFB3", "NDUFS1", "NDUFS2", "NDUFV1", "UQCRC1",
    "UQCRC2", "UQCRH", "SDHA", "SDHB", "SDHC", "SDHD", "CYCS"
]
# Filter genes present in the dataset
oxphos_genes_present = [g for g in oxphos_genes if g in adata.var_names]
sc.tl.score_genes(adata, gene_list=oxphos_genes_present, score_name="oxphos_score")

In [12]:
## Protooncogene score
proto_oncogenes = ['MYC', 'KRAS', 'EGFR', 'BRAF', 'AKT1', 'PIK3CA', 'CCND1', 'ERBB2', 'FGFR1', 'MDM2']

# ---------------------------------------
# 🧪 Step 2: Score Proto-oncogene activity
# Filter to genes in adata
valid_protooncogenes = [g for g in proto_oncogenes if g in adata.var_names]
sc.tl.score_genes(adata, gene_list=oxphos_genes_present, score_name="proto_oncogenescore")
print(f"✅ Found {len(valid_protooncogenes)} Proto-oncogenes in dataset: {valid_protooncogenes}")

✅ Found 10 Proto-oncogenes in dataset: ['MYC', 'KRAS', 'EGFR', 'BRAF', 'AKT1', 'PIK3CA', 'CCND1', 'ERBB2', 'FGFR1', 'MDM2']


In [None]:
adata.obs.sample(5)
adata.obs.columns

Index(['Unnamed: 0', 'orig.ident', 'nCount_RNA', 'nFeature_RNA',
       'percent.mito', 'subtype', 'celltype_subset', 'celltype_minor',
       'celltype_major', 'S_score', 'G2M_score', 'phase', 'apoptosis_score',
       'pct_counts_ribo', 'oxphos_score', 'proto_oncogenescore'],
      dtype='object')

In [None]:
gtf_url = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz"
gtf_file = "gencode.v44.annotation.gtf.gz"

if not os.path.exists(gtf_file):
    print("📥 Downloading GTF file...")
    urllib.request.urlretrieve(gtf_url, gtf_file)
else:
    print("✅ GTF file already exists.")

# Load and filter for gene info
gtf = read_gtf(gtf_file).to_pandas()
genes = gtf[gtf["feature"] == "gene"]
gene_pos = genes[["gene_name", "seqname", "start", "end"]].drop_duplicates()
gene_pos.columns = ["gene", "chromosome", "start", "end"]

✅ GTF file already exists.


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'tag', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'hgnc_id', 'havana_gene', 'ont', 'protein_id', 'ccdsid', 'artif_dupl']


In [15]:
##STEP 2: Merge gene location with adata.var
# Match gene names
adata.var["gene"] = adata.var_names
gene_pos_filtered = gene_pos[gene_pos["gene"].isin(adata.var["gene"])].drop_duplicates(subset="gene")
gene_pos_filtered = gene_pos_filtered.set_index("gene")

# Merge and reindex
merged_var = adata.var.join(gene_pos_filtered, on="gene")
merged_var = merged_var.reindex(adata.var_names)
adata.var = merged_var

print("✅ Chromosomal annotations added to adata.var")

✅ Chromosomal annotations added to adata.var


In [None]:
##STEP 3: Define reference group using Normal Epithelial cells
# Create reference labels
adata.obs["cnv_reference"] = "tumor"
adata.obs.loc[adata.obs["celltype_major"] == "Normal Epithelial", "cnv_reference"] = "normal"

print("🔍 CNV reference group counts:\n", adata.obs["cnv_reference"].value_counts())

## STEP 4: Run inferCNV with infercnvpy
cnv.tl.infercnv(
    adata,
    reference_key="cnv_reference",
    reference_cat="normal",
    window_size=100,
    step=10,
    n_jobs=4  # Adjust depending on your CPU
)

print("✅ CNV inference completed.")

🔍 CNV reference group counts:
 cnv_reference
tumor     95709
normal     4355
Name: count, dtype: int64


  0%|          | 0/21 [00:00<?, ?it/s]

✅ CNV inference completed.


In [17]:
adata.obs.head(2)

Unnamed: 0.1,Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mito,subtype,celltype_subset,celltype_minor,celltype_major,S_score,G2M_score,phase,apoptosis_score,pct_counts_ribo,oxphos_score,proto_oncogenescore,cnv_reference
CID3586_AAGACCTCAGCATGAG,CID3586_AAGACCTCAGCATGAG,CID3586,4581,1689,1.506221,HER2+,Endothelial ACKR1,Endothelial ACKR1,Endothelial,-0.038055,-0.312865,G1,-0.632049,25.889544,0.495952,0.495952,tumor
CID3586_AAGGTTCGTAGTACCT,CID3586_AAGGTTCGTAGTACCT,CID3586,1726,779,5.793743,HER2+,Endothelial ACKR1,Endothelial ACKR1,Endothelial,-0.048626,-0.187135,G1,-0.286432,26.303592,0.075952,0.075952,tumor


In [17]:
## Run this block first 
# Step 1: Clean and filter chromosome annotations
# adata.var["chromosome"] = adata.var["chromosome"].astype(str).str.replace("chr", "").str.upper()
# valid_chroms = [str(i) for i in range(1, 23)] + ["X", "Y"]
# filtered_var = adata.var[adata.var["chromosome"].isin(valid_chroms)]

# # Step 2: Subset the AnnData object to these genes
# adata_chr = adata[:, filtered_var.index].copy()  # creates new adata_chr with only valid chromosomes

# # Step 3: Proceed with visualization on this cleaned object
# sc.pp.scale(adata_chr, zero_center=True, max_value=10)

# # Step 4: Calculate per-chromosome CNV signal
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

# chrom_scores = {}
# for chrom in sorted(adata_chr.var["chromosome"].unique(), key=lambda x: (x not in "XY", x)):
#     chrom_genes = adata_chr.var[adata_chr.var["chromosome"] == chrom].index
#     if len(chrom_genes) < 10:
#         continue
#     expr = adata_chr[:, chrom_genes].X
#     chrom_score = np.abs(expr).mean(axis=1)
#     chrom_scores[chrom] = chrom_score

# # Step 5: Melt and plot
# cnv_df = pd.DataFrame({chrom: chrom_scores[chrom].A1 if hasattr(chrom_scores[chrom], 'A1') else chrom_scores[chrom]
#                        for chrom in chrom_scores})
# cnv_df = cnv_df.melt(var_name="Chromosome", value_name="Mean Abs Z-score")

# plt.figure(figsize=(12, 6))
# sns.boxplot(data=cnv_df, x="Chromosome", y="Mean Abs Z-score", order=sorted(cnv_df["Chromosome"].unique(), key=lambda x: (x not in "XY", x)))
# plt.xticks(rotation=45)
# plt.title("📊 CNV Signal Intensity by Chromosome")
# plt.tight_layout()
# plt.show()

In [18]:
# Calculate the mean (or median, which is more robust to outliers) of Mean Abs Z-score for each chromosome
# chromosome_cnv_summary = cnv_df.groupby("Chromosome")["Mean Abs Z-score"].mean().sort_values(ascending=False)
# # You could also use .median() for a more robust measure:
# # chromosome_cnv_summary = cnv_df.groupby("Chromosome")["Mean Abs Z-score"].median().sort_values(ascending=False)

# print("📈 Chromosomes Ranked by Mean Abs Z-score:\n")
# print(chromosome_cnv_summary)

# print("\n--- Top 10 High Burden Chromosomes ---")
# print(chromosome_cnv_summary.head(10))

In [None]:
## Step-by-step CNV Scoring on Selected Chromosomes
# Step 1: Ensure chromosome formatting is consistent
adata.var["chromosome"] = adata.var["chromosome"].astype(str).str.replace("chr", "").str.upper()

# Step 2: Define high-CNV burden chromosomes based on literature
#selected_chroms = ["1", "6", "7", "10", "14", "18"]
#selected_chroms = ["1", "6", "7", "8", "10", "14", "17", "18"]
selected_chroms = [
    "16",
    "12",
    "1",
    "19",
    "17",
    "6",
    "2",
    "11",
    "15",
    "5"
]

# Step 3: Subset genes from these chromosomes
cnv_genes = adata.var[adata.var["chromosome"].isin(selected_chroms)]

# Step 4: Subset the AnnData object to CNV-related genes
adata_cnv = adata[:, cnv_genes.index].copy()

# Step 5: Normalize expression per gene (Z-score across all cells)
# This helps detect CNV-like expression shifts (amplifications/deletions)
sc.pp.scale(adata_cnv, zero_center=True, max_value=10)  # Clip extreme values

# Step 6: Calculate CNV score as mean absolute Z-score per cell
adata.obs['cnv_score'] = np.abs(adata_cnv.X).mean(axis=1)

In [19]:
adata.obs.head(2)

Unnamed: 0.1,Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mito,subtype,celltype_subset,celltype_minor,celltype_major,S_score,G2M_score,phase,apoptosis_score,pct_counts_ribo,oxphos_score,proto_oncogenescore,cnv_reference,cnv_score
CID3586_AAGACCTCAGCATGAG,CID3586_AAGACCTCAGCATGAG,CID3586,4581,1689,1.506221,HER2+,Endothelial ACKR1,Endothelial ACKR1,Endothelial,-0.038055,-0.312865,G1,-0.632049,25.889544,0.495952,0.495952,tumor,0.258242
CID3586_AAGGTTCGTAGTACCT,CID3586_AAGGTTCGTAGTACCT,CID3586,1726,779,5.793743,HER2+,Endothelial ACKR1,Endothelial ACKR1,Endothelial,-0.048626,-0.187135,G1,-0.286432,26.303592,0.075952,0.075952,tumor,0.208406


In [21]:
df_old = pd.read_csv("adata_obs_full.csv")
print(df_old.shape)


(100064, 19)


In [22]:
old_columns = df_old.columns.tolist()
old_index = df_old.index.tolist()
df_new = adata.obs.copy()


In [23]:
#df_new_aligned = df_new[old_columns]  # Reorder columns

# Read old file
df_old = pd.read_csv("adata_obs_full.csv")

# Clean up the column list (remove auto-generated index columns)
old_columns = [col for col in df_old.columns if not col.startswith("Unnamed")]

# Create new DataFrame from current AnnData object
df_new = adata.obs.copy()

# Check what's extra/missing
missing_in_new = set(old_columns) - set(df_new.columns)
print("Missing columns in new df:", missing_in_new)

# Align columns (skip Unnamed ones)
df_new_aligned = df_new[old_columns]



Missing columns in new df: set()


In [24]:
df_new.to_csv("adata_obs_full_17July.csv", index=False)


In [29]:
df_old = pd.read_csv("adata_obs_full.csv")
df_new = pd.read_csv("adata_obs_full_17July.csv")

# Compare head
print("🔍 Preview of old vs new:")
print(df_old.head(2))
print(df_new.head(2))

# Optional: Spot check values
#assert df_old.shape == df_new.shape
#assert all(df_old.columns == df_new.columns)

# Compare a few columns
(df_old[["S_score","G2M_score", "proto_oncogenescore", "cnv_score","proto_oncogenescore"]]
 .compare(df_new[["S_score","G2M_score", "proto_oncogenescore", "cnv_score", "proto_oncogenescore"]],))


🔍 Preview of old vs new:
               Unnamed: 0.1                Unnamed: 0 orig.ident  nCount_RNA  \
0  CID3586_AAGACCTCAGCATGAG  CID3586_AAGACCTCAGCATGAG    CID3586        4581   
1  CID3586_AAGGTTCGTAGTACCT  CID3586_AAGGTTCGTAGTACCT    CID3586        1726   

   nFeature_RNA  percent.mito subtype    celltype_subset     celltype_minor  \
0          1689      1.506221   HER2+  Endothelial ACKR1  Endothelial ACKR1   
1           779      5.793743   HER2+  Endothelial ACKR1  Endothelial ACKR1   

  celltype_major   S_score  G2M_score phase  apoptosis_score  pct_counts_ribo  \
0    Endothelial -0.038055  -0.312865    G1        -0.632049        25.889544   
1    Endothelial -0.048626  -0.187135    G1        -0.286432        26.303592   

   oxphos_score  proto_oncogenescore cnv_reference  cnv_score  
0      0.495952             0.495952         tumor   0.258242  
1      0.075952             0.075952         tumor   0.208406  
                 Unnamed: 0 orig.ident  nCount_RNA  nFeature

In [26]:
# Step 1: Identify unwanted index-like columns
cols_to_drop_old = [col for col in df_old.columns if col.startswith("Unnamed")]
cols_to_drop_new = [col for col in df_new.columns if col.startswith("Unnamed")]

# Step 2: Drop those columns from both DataFrames
df_old_clean = df_old.drop(columns=cols_to_drop_old)
df_new_clean = df_new.drop(columns=cols_to_drop_new)

# Step 3: Now compare
assert df_old_clean.shape == df_new_clean.shape, "Shape mismatch"
assert all(df_old_clean.columns == df_new_clean.columns), "Column order mismatch"

print("✅ adata_obs_full_17July.csv is consistent with the original version (except for index columns).")



✅ adata_obs_full_17July.csv is consistent with the original version (except for index columns).


In [27]:
df_new.to_csv("GSE176078_adata_obs_full_17July.csv", index=False)
