In [1]:
import os
import gzip
import requests
import pandas as pd
from pathlib import Path
import dask.dataframe as dd
import pyarrow.csv as pv

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
os.chdir("/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_SRRM3_Tasic")

In [3]:
# Create directories for data organization
def create_directories():
    Path("raw_data").mkdir(exist_ok=True)
    Path("processed_data").mkdir(exist_ok=True)

In [4]:
# Download function with progress tracking
def download_file(url, filename):
    print(f"Downloading {filename}...")
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    print(f"Downloaded {filename}")

In [5]:
# URLs for the GSE115746 dataset
base_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE115nnn/GSE115746/suppl/"
files_to_download = {
    "metadata": "GSE115746_complete_metadata_28706-cells.csv.gz",
    "exon_counts": "GSE115746_cells_exon_counts.csv.gz",
    "intron_counts": "GSE115746_cells_intron_counts.csv.gz",
    "accession_table": "GSE115746_accession_table.csv.gz"
}

In [6]:
# Create directories
create_directories()

In [7]:
# Download files
for key, filename in files_to_download.items():
    url = base_url + filename
    output_path = os.path.join("raw_data", filename)
    if not os.path.exists(output_path):
        download_file(url, output_path)

In [8]:
# Process the data
# Load metadata
metadata = pd.read_csv("raw_data/GSE115746_complete_metadata_28706-cells.csv")

In [9]:
# Load exon counts and set gene names correctly
# exon_counts = pd.read_csv("raw_data/GSE115746_cells_exon_counts.csv")

In [10]:
# exon_counts = dd.read_csv("raw_data/GSE115746_cells_exon_counts.csv", 
#                          sample=3_000_000_000)  # 3GB sample

In [11]:
# exon_counts = exon_counts.compute()

In [21]:
# Read using pyarrow
exon_counts = pd.read_csv("raw_data/GSE115746_cells_exon_counts.csv", 
                         index_col='Unnamed: 0')

In [22]:
exon_counts.shape

(45768, 23178)

In [23]:
exon_counts.head(3)

Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,F2S4_150427_001_B01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
0610005C13Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,25,0,0,0,0,0,0
0610006L08Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,95,48,101,51,36,128,126,102,64,75,...,105,139,103,205,88,172,27,108,23,168


In [24]:
# Filter for Srrm3 and Srrm4
exons = exon_counts.filter(regex='^(Srrm3|Srrm4)', axis=0)
all_exons = exon_counts[exon_counts.index.str.contains('Srrm3|Srrm4', case=True, na=False)]

In [26]:
exons

Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,F2S4_150427_001_B01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
Srrm3,18,25,25,16,66,44,19,38,36,72,...,0,31,30,8,5,37,8,40,4,21
Srrm4,30,17,57,0,16,7,24,43,30,57,...,0,55,12,37,3,20,14,0,54,32
Srrm4os,0,0,0,0,5,0,0,0,0,0,...,0,0,4,0,0,0,4,0,0,3


In [27]:
all_exons

Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,F2S4_150427_001_B01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
Srrm3,18,25,25,16,66,44,19,38,36,72,...,0,31,30,8,5,37,8,40,4,21
Srrm4,30,17,57,0,16,7,24,43,30,57,...,0,55,12,37,3,20,14,0,54,32
Srrm4os,0,0,0,0,5,0,0,0,0,0,...,0,0,4,0,0,0,4,0,0,3


In [29]:
print(all_exons.index.tolist())


['Srrm3', 'Srrm4', 'Srrm4os']


In [28]:
# Save processed data
all_exons.to_csv("processed_data/srrm_counts.csv", index=False)

In [30]:
intron_counts = pd.read_csv("raw_data/GSE115746_cells_intron_counts.csv", index_col='Unnamed: 0')

In [37]:
intron_counts.shape

(45768, 23179)

In [36]:
intron_counts.head()

Unnamed: 0.1,Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
0,0610005C13Rik,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0610006L08Rik,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0610007P14Rik,3,0,48,27,0,12,0,0,8,...,0,0,0,0,10,0,0,1,0,0
3,0610009B22Rik,0,0,0,0,1,0,0,0,0,...,0,0,2,0,1,0,0,0,0,0
4,0610009E02Rik,0,0,16,0,0,0,0,0,7,...,0,0,0,0,0,0,0,0,0,0


In [33]:
introns = intron_counts.filter(regex='^(Srrm3|Srrm4)', axis=0)
all_introns = intron_counts[intron_counts.index.astype(str).str.contains('Srrm3|Srrm4', case=True, na=False)]

In [34]:
introns

Unnamed: 0.1,Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01


In [35]:
all_introns

Unnamed: 0.1,Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
