In [1]:
!pip install --quiet scanpy pandas numpy anndata igraph leidenalg squidpy scvi-tools

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.8/87.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.3/172.3 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m124.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m104.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.3/183.3 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import scanpy as sc
import numpy as np
import pandas as pd
import os
import leidenalg
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import anndata as ad
import scipy.sparse as sp
import gzip

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
sc.settings.verbosity = 3 #for debugging

In [5]:
sc.settings.set_figure_params(figsize=(5,5)) #standardize figure sizes

In [6]:
checks = True #variable that determines whether we print checks used during coding or not

In [7]:
import pandas as pd
import gzip

#paths to raw files downoaded from the dataset
raw_path  = "/content/drive/MyDrive/data_science_final_project/raw_data/expression_raw.csv.gz"
meta_path = "/content/drive/MyDrive/data_science_final_project/raw_data/meta_dataset2.csv"

#load metadata
meta = pd.read_csv(meta_path, index_col=0)
print("Original metadata rows:", meta.shape[0])

#load header from CSV (cell barcodes)
with gzip.open(raw_path, "rt") as f:
    header = f.readline().strip().split(",")

cells_data = header[1:]  #skip first empty field
print("Cells in expression file:", len(cells_data))

Original metadata rows: 64650
Cells in expression file: 64649


In [8]:
#Clean Metadata:
#keeep only rows whose index exists in the expression data
valid_metadata = meta.index.intersection(cells_data)

#drop invalid metadata rows such as "TYPE"
if len(valid_metadata) < meta.shape[0]:
    dropped = set(meta.index) - set(valid_metadata)
    print("Dropping metadata rows not found in expression file:", list(dropped))
    meta = meta.loc[valid_metadata]

#update list of metadata cells
cells_meta = meta.index.tolist()
print("Cleaned metadata rows:", len(cells_meta))


if(checks == True):
  #check that all metadata cells exist in expression matrix
  missing = set(cells_meta) - set(cells_data)
  if missing:
      print("ERROR: These metadata cells are still missing in expression matrix:", list(missing)[:10])
  else:
      print("Metadata cells all found in expression matrix.")

Dropping metadata rows not found in expression file: ['TYPE']
Cleaned metadata rows: 64649
Metadata cells all found in expression matrix.


In [9]:
if(checks == True):
  #print types before final consistency check
  print("Type of cells_meta:", type(meta))
  print("Type of cells_data:", type(cells_data))

  #if they're lists/arrays/Series, print element type examples:
  try:
      print("Example element from cells_meta:", next(iter(meta)))
  except Exception:
      print("Could not preview cells_meta")

  try:
      print("Example element from cells_data:", next(iter(cells_data)))
  except Exception:
      print("Could not preview cells_data")

  #original check
  missing = set(meta.index) - set(cells_data)
  if missing:
      print(" ERROR: These metadata cells are still missing in expression matrix:", list(missing)[:10])
  else:
      print("Metadata cells all found in expression matrix")

  print("Original metadata rows:", len(meta))
  print("Cells in expression file:", len(cells_data))

Type of cells_meta: <class 'pandas.core.frame.DataFrame'>
Type of cells_data: <class 'list'>
Example element from cells_meta: biosample_id
Example element from cells_data: AAACCCACAACAGCTT
Metadata cells all found in expression matrix
Original metadata rows: 64649
Cells in expression file: 64649


In [10]:
#count lines in file to determine number of genes
TOTAL_GENES = sum(1 for _ in gzip.open(raw_path, "rt")) - 1  # minus the header
print("Total genes in matrix:", TOTAL_GENES)

N_CHUNKS = 10
chunk_size = TOTAL_GENES // N_CHUNKS + 1

print(f"Will generate {N_CHUNKS} chunks, each of ~{chunk_size} genes.")

Total genes in matrix: 36601
Will generate 10 chunks, each of ~3661 genes.


In [None]:
import os
import gzip
import pandas as pd
import scanpy as sc
import scipy.sparse as sp

chunk_idx = 0
chunk_files = []

#chunk folder
drive_chunk_dir = "/content/drive/My Drive/data_science_final_project/data_processed/chunks"
os.makedirs(drive_chunk_dir, exist_ok=True)

#map cell position in CSV to index
cell_index_data = {cell: i for i, cell in enumerate(cells_data)}

with gzip.open(raw_path, "rt") as f:
    next(f)  #skip header

    gene_names = []
    data = []
    rows = []
    cols = []
    gene_counter = 0

    print("Starting chunked streaming (10 chunks)...")

    for line in f:
        parts = line.strip().split(",")
        gene = parts[0]
        expr = parts[1:]

        gene_idx = len(gene_names)
        gene_names.append(gene)

        #collect sparse non-zero entries
        for j, val in enumerate(expr):
            if val not in ("0", "0.0", ""):
                rows.append(j)
                cols.append(gene_idx)
                data.append(float(val))

        gene_counter += 1

        #save chunk when full
        if gene_counter % chunk_size == 0:
            print(f"Saving chunk {chunk_idx} with {len(gene_names)} genes...")

            #build sparse matrix (cells × genes_chunk)
            X = sp.csr_matrix(
                (data, (rows, cols)),
                shape=(len(cells_data), len(gene_names))
            )

            #AnnData
            ad = sc.AnnData(
                X,
                obs=pd.DataFrame(index=cells_meta),
                var=pd.DataFrame(index=gene_names)
            )

            #save directly to Google Drive
            fname = os.path.join(drive_chunk_dir, f"chunk_{chunk_idx}.h5ad")
            print(f"   Writing to {fname} ...")
            ad.write(fname)
            print(f"Saved chunk {chunk_idx} to Google Drive")

            chunk_files.append(fname)

            #reset buffers
            gene_names = []
            data = []
            rows = []
            cols = []
            chunk_idx += 1

    #save final partial chunk
    if len(gene_names) > 0:
        print(f"Saving final chunk {chunk_idx} with {len(gene_names)} genes...")

        X = sp.csr_matrix(
            (data, (rows, cols)),
            shape=(len(cells_data), len(gene_names))
        )

        ad = sc.AnnData(
            X,
            obs=pd.DataFrame(index=cells_meta),
            var=pd.DataFrame(index=gene_names)
        )

        fname = os.path.join(drive_chunk_dir, f"chunk_{chunk_idx}.h5ad")
        print(f"Writing to {fname} ...")
        ad.write(fname)
        print(f"Saved final chunk {chunk_idx} to Google Drive")

        chunk_files.append(fname)

print("Finished creating 10 gene-chunks.")
print("Chunks saved to:", drive_chunk_dir)

Starting chunked streaming (10 chunks)...
Saving chunk 0 with 3661 genes...
   Writing to /content/drive/My Drive/data_science_final_project/data_processed/chunks/chunk_0.h5ad ...
Saved chunk 0 to Google Drive


In [None]:
#Concatinate chunks together

import scanpy as sc
import os
import pandas as pd

#Path where chunks were saved
drive_chunk_dir = "/content/drive/My Drive/DataScienceFinalProject/data/chunks/"

#List and sort chunk files
chunk_files = sorted([
    os.path.join(drive_chunk_dir, f)
    for f in os.listdir(drive_chunk_dir)
    if f.startswith("chunk_") and f.endswith(".h5ad")
])

print(f"Found {len(chunk_files)} chunk files.")
print("Example files:", chunk_files[:3])


#Load chunks
print("Loading chunks from Google Drive...")
adatas = []
for idx, f in enumerate(chunk_files):
    print(f"   Loading chunk {idx} ---{os.path.basename(f)}")
    ad = sc.read_h5ad(f)
    adatas.append(ad)
print("All chunks loaded.")


#Concatenate horizontally (genes axis)
print("\nConcatenating chunks horizontally (axis=1)...")
adata_full = sc.concat(adatas, axis=1, join="outer")
print("Shape after concat:", adata_full.shape)


#Reorder rows to match metadata order
print("Reordering rows to match metadata...")
adata_full = adata_full[cells_meta, :]
print("   Shape after reordering:", adata_full.shape)


#attach metadata
print("Attaching metadata...")
adata_full.obs = meta.copy()
print("Metadata attached.")


#Add counts layer
print("Adding counts layer...")
adata_full.layers["counts"] = adata_full.X.copy()
print("Counts layer added.")


#Save final dataset to Drive
output_path = "/content/drive/My Drive/DataScienceFinalProject/data/dataset2_full.h5ad"
print(f"\nSaving final AnnData to: {output_path}")
adata_full.write(output_path)

print("Final shape (cells × genes):", adata_full.shape)