# Environment

In [27]:
# Standard library imports
import os
import shutil
from concurrent.futures import ProcessPoolExecutor, as_completed

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam

# rpy2 imports
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri, r, Formula
from rpy2.robjects.packages import importr
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.vectors import StrVector, DataFrame

import pandas as pd
import glob
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing

In [28]:
data_dir = "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords/Create_counts/output"
working_dir = "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords"
os.chdir(working_dir)

In [29]:
# Enable automatic conversion between pandas and R dataframes
pandas2ri.activate()

In [30]:
# Import necessary R packages
dexseq = importr('DEXSeq')
deseq2 = importr('DESeq2')

In [31]:
def prepare_dexseq_annotation():
    """
    Create DEXSeq-formatted GFF file from the original annotation
    """
    input_gff = "DATA/gencode.v31.basic.annotation.gff"
    output_gff = "DATA/gencode.v31.basic.annotation.DEXSeq.gff"
    
    # Path to the DEXSeq preparation script
    dexseq_script = "/home/kubacki.michal/.conda/envs/jupyter_nb/lib/R/library/DEXSeq/python_scripts/dexseq_prepare_annotation.py"
    
    # Run the preparation script
    cmd = f"python {dexseq_script} {input_gff} {output_gff}"
    print(f"Running command: {cmd}")
    return_code = os.system(cmd)
    
    if return_code == 0:
        print("Successfully created DEXSeq annotation file")
        return output_gff
    else:
        print("Failed to create DEXSeq annotation file")
        return None

# Then modify your create_dexseq_dataset function to use the DEXSeq-formatted GFF
def create_dexseq_dataset(sample_info, processed_files, dexseq):
    """
    Create DEXSeqDataSet with proper formatting
    """
    try:
        # Ensure we have the DEXSeq-formatted GFF file
        dexseq_gff = "DATA/gencode.v31.basic.annotation.DEXSeq.gff"
        if not os.path.exists(dexseq_gff):
            dexseq_gff = prepare_dexseq_annotation()
            if not dexseq_gff:
                raise ValueError("Failed to create DEXSeq annotation file")
        
        # Prepare sample data
        sample_data = pd.DataFrame({
            'sample': sample_info['sample'],
            'condition': sample_info['condition']
        })
        
        print("\nSample data:")
        print(sample_data)
        
        # Convert to R objects
        with localconverter(ro.default_converter + pandas2ri.converter):
            sample_data_r = ro.conversion.py2rpy(sample_data)
        
        # Create DEXSeqDataSet using the DEXSeq-formatted GFF
        dxd = dexseq.DEXSeqDataSetFromHTSeq(
            countfiles=ro.StrVector(processed_files),
            sampleData=sample_data_r,
            design=Formula('~ sample + exon + condition:exon'),
            flattenedfile=dexseq_gff  # Use the DEXSeq-formatted GFF
        )
        
        return dxd
    
    except Exception as e:
        print(f"Error creating DEXSeq dataset: {str(e)}")
        print("\nChecking processed files:")
        for f in processed_files:
            print(f"\nFile: {os.path.basename(f)}")
            with open(f, 'r') as file:
                print(file.read(500))
        raise

In [32]:
def get_dexseq_gff_features():
    """
    Extract features from DEXSeq-formatted GFF file
    """
    gff_features = set()
    dexseq_gff = "DATA/gencode.v31.basic.annotation.DEXSeq.gff"
    
    # Ensure DEXSeq GFF exists
    if not os.path.exists(dexseq_gff):
        print("Creating DEXSeq annotation file...")
        dexseq_gff = prepare_dexseq_annotation()
        if not dexseq_gff:
            raise ValueError("Failed to create DEXSeq annotation file")
    
    # Read features from DEXSeq-formatted GFF
    with open(dexseq_gff, 'r') as f:
        for line in f:
            if 'exonic_part' in line:
                fields = line.strip().split('\t')
                attrs = dict(item.strip().split(' ', 1) for item in fields[8].strip().split(';'))
                gene_id = attrs['gene_id'].strip('"')
                exon_num = attrs['exonic_part_number'].strip('"')
                feature_id = f"{gene_id}:E{exon_num}"
                gff_features.add(feature_id)
    return gff_features

In [33]:
def process_count_file(file_path, gff_features):
    """
    Process DEXSeq count file to match the GFF feature format
    """
    output_dir = os.path.dirname(file_path)
    basename = os.path.basename(file_path)
    output_path = os.path.join(output_dir, f"processed_{basename}")
    
    try:
        # Create ordered dictionary of gff features
        gff_feature_dict = {feature: True for feature in sorted(gff_features)}
        
        # Read and process file
        count_dict = {}
        with open(file_path, 'r') as f:
            for line in f:
                if not line.startswith('_'):  # Skip special entries
                    parts = line.strip().split('\t')
                    if len(parts) == 2:
                        feature_id, count = parts
                        # Remove quotes and split feature ID
                        feature_id = feature_id.strip('"')
                        if '":"' in feature_id:
                            gene_id, exon_num = feature_id.split('":"')
                            exon_num = exon_num.strip('"')
                            # Format to match GFF style
                            feature_id = f"{gene_id}:E{exon_num}"
                            
                            # Only keep features that exist in GFF
                            if feature_id in gff_feature_dict:
                                count_dict[feature_id] = int(count)
        
        # Create output with all GFF features (using 0 for missing counts)
        with open(output_path, 'w') as f:
            for feature in gff_feature_dict:
                count = count_dict.get(feature, 0)
                f.write(f"{feature}\t{count}\n")
        
        print(f"\nProcessed {basename}")
        print(f"Features written: {len(gff_feature_dict)}")
        
        # Verify file contents
        with open(output_path, 'r') as f:
            first_lines = [next(f) for _ in range(5)]
        print("First few lines:")
        for line in first_lines:
            print(line.strip())
        
        return output_path
    
    except Exception as e:
        print(f"Error processing {basename}: {str(e)}")
        return None

def verify_processed_file(file_path, gff_features):
    """
    Verify that a processed file matches GFF features exactly
    """
    try:
        file_features = set()
        with open(file_path, 'r') as f:
            for line in f:
                feature_id = line.strip().split('\t')[0]
                file_features.add(feature_id)
        
        missing_features = gff_features - file_features
        extra_features = file_features - gff_features
        
        print(f"\nVerification of {os.path.basename(file_path)}:")
        print(f"Total features in file: {len(file_features)}")
        print(f"Missing features: {len(missing_features)}")
        print(f"Extra features: {len(extra_features)}")
        
        return len(missing_features) == 0 and len(extra_features) == 0
    
    except Exception as e:
        print(f"Error verifying {file_path}: {str(e)}")
        return False

def main():
    # Set up paths
    data_dir = "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords/Create_counts/output"
    working_dir = "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords"
    os.chdir(working_dir)
    
    # First, ensure we have DEXSeq-formatted GFF
    dexseq_gff = "DATA/gencode.v31.basic.annotation.DEXSeq.gff"
    if not os.path.exists(dexseq_gff):
        prepare_dexseq_annotation()
    
    # Get features from DEXSeq-formatted GFF
    print("Loading DEXSeq GFF features...")
    gff_features = get_dexseq_gff_features()
    print(f"Found {len(gff_features)} features in DEXSeq GFF")
    
    # Get count files (note the extension should match your files)
    count_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) 
                  if f.endswith('.dexeq_counts')]
    print(f"\nFound {len(count_files)} count files")
    
    # Print sample of count file content
    if count_files:
        print("\nSample count file content:")
        with open(count_files[0], 'r') as f:
            print(f.read(500))
    
    # Process files
    processed_files = []
    for file in count_files:
        processed_file = process_count_file(file, gff_features)
        if processed_file and verify_processed_file(processed_file, gff_features):
            processed_files.append(processed_file)
    
    if not processed_files:
        print("No files were processed successfully")
        return
    
    # Create sample info (EDO and ND1 only)
    edo_nd1_samples = pd.DataFrame([
        {'sample': os.path.basename(f).replace('.dexeq_counts', '').replace('processed_', ''),
         'condition': 'EDO' if 'EDO' in f else 'ND1'}
        for f in processed_files
        if 'EDO' in f or 'ND1' in f
    ])
    
    # Sort samples to ensure consistent order
    edo_nd1_samples = edo_nd1_samples.sort_values('sample').reset_index(drop=True)
    
    print("\nFinal sample information:")
    print(edo_nd1_samples)
    
    # Create DEXSeq dataset
    try:
        filtered_files = [f for f in processed_files 
                         if any(s in f for s in edo_nd1_samples['sample'])]
        
        print("\nProcessed files for DEXSeq:")
        for f in filtered_files:
            print(f"- {os.path.basename(f)}")
        
        dxd = create_dexseq_dataset(edo_nd1_samples, filtered_files, dexseq)
        print("Successfully created DEXSeq dataset!")
        return dxd
    except Exception as e:
        print(f"\nFailed to create DEXSeq dataset: {str(e)}")
        return None

if __name__ == "__main__":
    # Enable automatic conversion between pandas and R dataframes
    pandas2ri.activate()
    
    # Run the analysis
    dxd = main()

Loading DEXSeq GFF features...
Found 405499 features in DEXSeq GFF

Found 9 count files

Sample count file content:
"ENSG00000000003.14":"001"	98
"ENSG00000000003.14":"002"	1033
"ENSG00000000003.14":"003"	342
"ENSG00000000003.14":"004"	4
"ENSG00000000003.14":"005"	303
"ENSG00000000003.14":"006"	288
"ENSG00000000003.14":"007"	283
"ENSG00000000003.14":"008"	220
"ENSG00000000003.14":"009"	384
"ENSG00000000003.14":"010"	256
"ENSG00000000003.14":"011"	9
"ENSG00000000003.14":"012"	8
"ENSG00000000005.6":"001"	8
"ENSG00000000005.6":"002"	10
"ENSG00000000005.6":"003"	12
"ENSG00000000005.6":"004"	8
"ENSG00000000005.6":

Processed ND1_2.dexeq_counts
Features written: 405499
First few lines:
ENSG00000000003.14:E001	98
ENSG00000000003.14:E002	1033
ENSG00000000003.14:E003	342
ENSG00000000003.14:E004	4
ENSG00000000003.14:E005	303

Verification of processed_ND1_2.dexeq_counts:
Total features in file: 405499
Missing features: 0
Extra features: 0

Processed PW1_3.dexeq_counts
Features written: 405499
Fi

R[write to console]: Error in (function (countfiles, sampleData, design = ~sample + exon +  : 
  Count files do not correspond to the flattened annotation file



Error creating DEXSeq dataset: Error in (function (countfiles, sampleData, design = ~sample + exon +  : 
  Count files do not correspond to the flattened annotation file


Checking processed files:

File: processed_ND1_2.dexeq_counts
ENSG00000000003.14:E001	98
ENSG00000000003.14:E002	1033
ENSG00000000003.14:E003	342
ENSG00000000003.14:E004	4
ENSG00000000003.14:E005	303
ENSG00000000003.14:E006	288
ENSG00000000003.14:E007	283
ENSG00000000003.14:E008	220
ENSG00000000003.14:E009	384
ENSG00000000003.14:E010	256
ENSG00000000003.14:E011	9
ENSG00000000003.14:E012	8
ENSG00000000005.6:E001	8
ENSG00000000005.6:E002	10
ENSG00000000005.6:E003	12
ENSG00000000005.6:E004	8
ENSG00000000005.6:E005	10
ENSG00000000005.6:E006	7
ENSG00000000005.6

File: processed_EDO_1.dexeq_counts
ENSG00000000003.14:E001	107
ENSG00000000003.14:E002	900
ENSG00000000003.14:E003	340
ENSG00000000003.14:E004	1
ENSG00000000003.14:E005	278
ENSG00000000003.14:E006	246
ENSG00000000003.14:E007	262
ENSG00000000003.14:E008	196
ENSG000