In [31]:
import pandas as pd
import numpy as np
from typing import List, Dict, Set
import os


In [32]:
working_dir = "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords"
os.chdir(working_dir)

In [2]:
def load_dexseq_results(results_file: str) -> pd.DataFrame:
    """Load DEXSeq results and perform initial filtering."""
    results = pd.read_csv(results_file)
    # Convert p-values to numeric, replacing any invalid values with 1.0
    results['padj'] = pd.to_numeric(results['padj'], errors='coerce').fillna(1.0)
    return results

In [21]:
def extract_gene_id(feature_id: str) -> str:
    """Extract gene ID from the feature identifier."""
    return feature_id.split(':')[0].split('.')[0]

In [22]:
def analyze_target_genes(results_df: pd.DataFrame,
                        target_genes: List[str],
                        padj_threshold: float = 0.1,
                        log2fc_threshold: float = 0.5) -> Dict[str, pd.DataFrame]:
    """
    Analyze differential splicing for target genes.
    
    Args:
        results_df: DEXSeq results DataFrame
        target_genes: List of target gene names
        padj_threshold: Adjusted p-value threshold for significance
        log2fc_threshold: Log2 fold change threshold for significance
    
    Returns:
        Dictionary with gene names as keys and filtered results as values
    """
    # Extract gene IDs from groupID column
    results_df['gene_id'] = results_df['groupID'].apply(extract_gene_id)
    
    print("Extracted gene IDs:")
    print(results_df["gene_id"][:10])

    # Create a mapping of gene names to their IDs
    gene_name_to_id = {
        'SETD5': 'ENSG00000168137',
        'NSD2': 'ENSG00000109685',
        'POLE': 'ENSG00000177084',
        'HTT': 'ENSG00000197386',
        'PER3': 'ENSG00000049246',
        'MED14': 'ENSG00000180182'
    }
    
    gene_results = {}
    
    for gene_name in target_genes:
        if gene_name in gene_name_to_id:
            gene_id = gene_name_to_id[gene_name]
            
            # Filter results for this gene
            gene_df = results_df[results_df['gene_id'] == gene_id].copy()
            
            if not gene_df.empty:
                # Add significance flags
                gene_df['is_significant'] = (gene_df['padj'] < padj_threshold) & \
                                          (abs(gene_df['log2fold_treated_untreated']) > log2fc_threshold)
                
                # Sort by adjusted p-value
                gene_df = gene_df.sort_values('padj')
                
                # Calculate additional statistics
                stats = {
                    'total_exons': len(gene_df),
                    'significant_exons': gene_df['is_significant'].sum(),
                    'min_padj': gene_df['padj'].min(),
                    'max_abs_log2fc': abs(gene_df['log2fold_treated_untreated']).max()
                }
                
                gene_results[gene_name] = {
                    'results': gene_df,
                    'stats': stats
                }
    
    return gene_results

In [23]:
def summarize_results(gene_results: Dict[str, Dict]) -> pd.DataFrame:
    """Create a summary DataFrame of the analysis results."""
    summary_data = []
    
    for gene_name, data in gene_results.items():
        stats = data['stats']
        summary_data.append({
            'Gene': gene_name,
            'Total_Exons': stats['total_exons'],
            'Significant_Exons': stats['significant_exons'],
            'Min_Padj': stats['min_padj'],
            'Max_Abs_Log2FC': stats['max_abs_log2fc'],
            'Percent_Significant': (stats['significant_exons'] / stats['total_exons'] * 100)
        })
    
    return pd.DataFrame(summary_data)

In [24]:
# File paths
results_file = '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords/output/dexseq_results_PW1_vs_EDO.csv'

In [25]:
# Target genes
target_genes = ['SETD5', 'NSD2', 'POLE', 'HTT', 'PER3', 'MED14']

In [26]:
# Load results
print("Loading DEXSeq results...")
results_df = load_dexseq_results(results_file)

Loading DEXSeq results...


In [27]:
results_df.head()

Unnamed: 0.1,Unnamed: 0,groupID,featureID,exonBaseMean,dispersion,stat,pvalue,padj,untreated,treated,...,genomicData.start,genomicData.end,genomicData.width,genomicData.strand,countData.PW1_2,countData.EDO_2,countData.PW1_1,countData.PW1_3,countData.EDO_3,countData.EDO_1
0,ENSG00000000003.14:E001,ENSG00000000003.14,E001,134.011319,0.001096,11692.250158,0.0,0.0,18.525079,23.223678,...,100627109,100628669,1561,-,182,139,119,189,86,123
1,ENSG00000000003.14:E002,ENSG00000000003.14,E002,1413.870707,0.000176,2999.977756,0.0,0.0,60.481014,61.663207,...,100628670,100629986,1317,-,2630,1237,1419,2220,687,1007
2,ENSG00000000003.14:E003,ENSG00000000003.14,E003,501.415434,0.000195,14129.283387,0.0,0.0,38.040654,39.47326,...,100630759,100630866,108,-,926,434,471,802,254,375
3,ENSG00000000003.14:E004,ENSG00000000003.14,E004,2.232228,0.019562,2399.891284,0.0,0.0,2.799125,2.527304,...,100632063,100632068,6,-,5,2,2,4,1,1
4,ENSG00000000003.14:E005,ENSG00000000003.14,E005,399.355186,0.000202,16136.218547,0.0,0.0,34.351957,35.226392,...,100632485,100632568,84,-,719,347,393,654,196,289


In [28]:
# Analyze target genes
print("\nAnalyzing target genes...")
gene_results = analyze_target_genes(results_df, target_genes)


Analyzing target genes...
Extracted gene IDs:
0    ENSG00000000003
1    ENSG00000000003
2    ENSG00000000003
3    ENSG00000000003
4    ENSG00000000003
5    ENSG00000000003
6    ENSG00000000003
7    ENSG00000000003
8    ENSG00000000003
9    ENSG00000000003
Name: gene_id, dtype: object


In [42]:
# Print first few elements of the gene_results dictionary
for i, (gene_name, data) in enumerate(gene_results.items()):
    if i < 2:  # Only show first 2 genes
        print(f"\n{gene_name}:")
        print("Stats:", data['stats'])
        print("\nFirst few rows of results:")
        # print(data['results'].head())


SETD5:
Stats: {'total_exons': 32, 'significant_exons': 0, 'min_padj': 0.0, 'max_abs_log2fc': 0.4425705149425831}

First few rows of results:

NSD2:
Stats: {'total_exons': 40, 'significant_exons': 1, 'min_padj': 0.0, 'max_abs_log2fc': 0.6627029360342807}

First few rows of results:


In [29]:
# Create summary
print("\nGenerating summary...")
summary_df = summarize_results(gene_results)


Generating summary...


In [38]:
summary_df.head()

Unnamed: 0,Gene,Total_Exons,Significant_Exons,Min_Padj,Max_Abs_Log2FC,Percent_Significant
0,SETD5,32,0,0.0,0.442571,0.0
1,NSD2,40,1,0.0,0.662703,2.5
2,POLE,51,1,0.0,10.43471,1.960784
3,HTT,67,0,0.0,0.462,0.0
4,PER3,29,2,0.0,2.541446,6.896552


In [33]:
# Save results
print("\nSaving results...")
summary_df.to_csv('snord116_target_genes_summary.csv', index=False)


Saving results...


In [34]:
# Save detailed results for each gene
for gene_name, data in gene_results.items():
    output_file = f'snord116_target_gene_{gene_name}_detailed.csv'
    data['results'].to_csv(output_file, index=False)
    print(f"Saved detailed results for {gene_name} to {output_file}")

Saved detailed results for SETD5 to snord116_target_gene_SETD5_detailed.csv
Saved detailed results for NSD2 to snord116_target_gene_NSD2_detailed.csv
Saved detailed results for POLE to snord116_target_gene_POLE_detailed.csv
Saved detailed results for HTT to snord116_target_gene_HTT_detailed.csv
Saved detailed results for PER3 to snord116_target_gene_PER3_detailed.csv
Saved detailed results for MED14 to snord116_target_gene_MED14_detailed.csv


In [35]:
# Print summary
print("\nAnalysis Summary:")
print(summary_df.to_string())


Analysis Summary:
    Gene  Total_Exons  Significant_Exons  Min_Padj  Max_Abs_Log2FC  Percent_Significant
0  SETD5           32                  0       0.0        0.442571             0.000000
1   NSD2           40                  1       0.0        0.662703             2.500000
2   POLE           51                  1       0.0       10.434710             1.960784
3    HTT           67                  0       0.0        0.462000             0.000000
4   PER3           29                  2       0.0        2.541446             6.896552
5  MED14           31                  0       0.0        0.386772             0.000000


In [36]:
# Print detailed findings for significant results
print("\nDetailed findings for significant differential splicing events:")
for gene_name, data in gene_results.items():
    sig_events = data['results'][data['results']['is_significant']]
    if not sig_events.empty:
        print(f"\n{gene_name}:")
        for _, event in sig_events.iterrows():
            print(f"  Exon: {event['featureID']}")
            print(f"  Adjusted p-value: {event['padj']:.2e}")
            print(f"  Log2 fold change: {event['log2fold_treated_untreated']:.2f}")


Detailed findings for significant differential splicing events:

NSD2:
  Exon: E005
  Adjusted p-value: 0.00e+00
  Log2 fold change: -0.66

POLE:
  Exon: E051
  Adjusted p-value: 3.71e-147
  Log2 fold change: -10.43

PER3:
  Exon: E014
  Adjusted p-value: 0.00e+00
  Log2 fold change: 0.54
  Exon: E002
  Adjusted p-value: 7.69e-96
  Log2 fold change: -2.54
