In [4]:
import pandas as pd
import requests
import gzip


In [6]:
# Download the gzipped UMI table for GEO accession GSE54006.
url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE54006&format=file&file=GSE54006%5Fumitab%2Etxt%2Egz"
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as a ".gz" file.
response.raise_for_status()
with open("GSE54006_umitab.txt.gz", "wb") as file:
    file.write(response.content)

# Unzip the file in fixed-size chunks so the (much larger) decompressed table
# is never held in memory all at once.
with gzip.open("GSE54006_umitab.txt.gz", "rb") as f_in:
    with open("GSE54006_umitab.txt", "wb") as f_out:
        for chunk in iter(lambda: f_in.read(1 << 20), b""):
            f_out.write(chunk)


In [None]:
# Read the tab-separated experimental design table.
# skiprows=5 discards the first five lines of the file; header=1 then takes the
# column names from the second of the remaining lines, so the line in between
# is dropped as well.
# NOTE(review): presumably these leading lines are the file's comment/preamble
# block - confirm against the raw file.
meta = pd.read_csv("GSE54006_experimental_design.txt", header=1, skiprows=5, sep="\t")

# Preview the first ten rows.
meta.head(n=10)

Unnamed: 0,index,sequencing_batch,amplification_batch,mouse_ID,pool_barcode,sample_barcode,plate_id,well_id,number_of_cells,sorting_markers,RMT_length,group_name,ERCC_dilution,ERCC_volume_ul,Column_name_in_processed_data_file
0,1,1,0,,NNNNNN,CTACCA,1.0,A1,1,CD11c+,4,CD11c+,2e-05,0.01,0_1
1,2,1,0,,NNNNNN,CATGCT,1.0,B1,1,CD11c+,4,CD11c+,2e-05,0.01,0_2
2,3,1,0,,NNNNNN,GCACAT,1.0,C1,1,CD11c+,4,CD11c+,2e-05,0.01,0_3
3,4,1,0,,NNNNNN,TGCTCG,1.0,D1,1,CD11c+,4,CD11c+,2e-05,0.01,0_4
4,5,1,0,,NNNNNN,AGCAAT,1.0,E1,1,CD11c+,4,CD11c+,2e-05,0.01,0_5
5,6,1,0,,NNNNNN,AGTTGC,1.0,F1,1,CD11c+,4,CD11c+,2e-05,0.01,0_6
6,7,1,0,,NNNNNN,CCAGTT,1.0,G1,1,CD11c+,4,CD11c+,2e-05,0.01,0_7
7,8,1,0,,NNNNNN,TTGAGC,1.0,H1,0,CD11c+,4,CD11c+,2e-05,0.01,0_8
8,9,1,0,,NNNNNN,ACCAAC,1.0,A2,1,CD11c+,4,CD11c+,2e-05,0.01,0_9
9,10,1,0,,NNNNNN,GGTCCA,1.0,B2,1,CD11c+,4,CD11c+,2e-05,0.01,0_10


In [54]:

# Build a mapping: UMI-table column label (e.g. "0_1") -> 6-mer sample barcode
# (e.g. "CTACCA"), taken from the experimental design table.
mapping = dict(zip(
    meta["Column_name_in_processed_data_file"],
    meta["sample_barcode"]
))

# Load the UMI table (genes x cells; columns are "0_1", "0_2", ...).
data = pd.read_csv(
    "GSE54006_umitab.txt",
    sep="\t",
    header=0,      # first line is: gene_name  0_1  0_2  …
    index_col=0    # first column is gene_name
)

# rename() leaves labels it cannot find in the mapping untouched, so report
# any such columns instead of letting them pass silently.
unmapped = [col for col in data.columns if col not in mapping]
if unmapped:
    print(f"WARNING: {len(unmapped)} columns have no barcode mapping, examples:", unmapped[:5])

# Rename the columns to the 6-mer barcodes (reassigned rather than inplace, so
# the step reads as an explicit transformation).
data = data.rename(columns=mapping)

# There are only 4**6 = 4096 distinct 6-mers, so a table with more columns than
# that necessarily ends up with repeated labels. Label-based selection such as
# data["CTACCA"] then returns several columns; report it so it is not a surprise.
n_duplicated = int(data.columns.duplicated().sum())
if n_duplicated:
    print(
        f"WARNING: {n_duplicated} columns share a barcode with an earlier column; "
        "barcode labels are not unique cell identifiers."
    )

# Quick sanity check
print(data.head())
data.shape


                    CTACCA  CATGCT  GCACAT  TGCTCG  AGCAAT  AGTTGC  CCAGTT  \
gene_name                                                                    
0610007C21Rik_Apr3       0       0       0       1       0       0       0   
0610007L01Rik            0       1       0       0       0       0       0   
0610007P08Rik            0       0       0       0       0       0       0   
0610007P14Rik            0       1       0       0       0       0       0   
0610007P22Rik            0       0       0       0       0       0       0   

                    TTGAGC  ACCAAC  GGTCCA  ...  CTCAGA  AGCGCT  GTCAAG  \
gene_name                                   ...                           
0610007C21Rik_Apr3       0       0       0  ...       0       0       1   
0610007L01Rik            0       1       0  ...       0       0       0   
0610007P08Rik            0       0       0  ...       0       0       0   
0610007P14Rik            0       0       0  ...       0       0       0   
061

(20190, 4590)

In [None]:
# `data` is the genes x cells DataFrame (columns are 6-mer barcodes).
# Collect the spike-in rows by their "ERCC-" prefix; the isinstance guard skips
# any non-string index label (e.g. NaN) instead of raising AttributeError.
ercc_ids = [g for g in data.index if isinstance(g, str) and g.startswith("ERCC-")]
print(f"Found {len(ercc_ids)} ERCC spike‐in rows, examples:", ercc_ids[:5])

# Example format; replace with your actual file path.
# read_csv's `squeeze=True` argument was removed in pandas 2.0; calling
# .squeeze("columns") on the result is the documented replacement and returns
# a Series when the file has exactly one data column.
ercc_conc = pd.read_csv("ercc_mix1_concentrations.csv", index_col=0).squeeze("columns")
# Assuming a single value column, ercc_conc is a Series indexed by ERCC ID,
# e.g. ERCC-00002 -> 0.59, ERCC-00004 -> 4.42, ... (values illustrative; TODO confirm).



Found 83 ERCC spike‐in rows, examples: ['ERCC-00003', 'ERCC-00009', 'ERCC-00014', 'ERCC-00022', 'ERCC-00025']
