In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

In [2]:
# Set the working directory
working_dir = "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords"
os.chdir(working_dir)
print(f"Current working directory: {os.getcwd()}")

Current working directory: /beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords


In [7]:
def check_dexseq_results(results_file: str, output_dir: str = "qc_plots"):
    """
    Perform quality checks on DEXSeq results file.
    
    Args:
        results_file: Path to DEXSeq results CSV file
        output_dir: Directory to save QC plots
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Load results
    print("Loading results file...")
    df = pd.read_csv(results_file)
    
    # Basic statistics
    print("\n=== Basic Statistics ===")
    print(f"Total number of tests: {len(df)}")
    print(f"Number of NA p-values: {df['pvalue'].isna().sum()}")
    print(f"Number of NA adjusted p-values: {df['padj'].isna().sum()}")
    print(f"Number of significant results (padj < 0.05): {(df['padj'] < 0.05).sum()}")
    print(f"Number of significant results (padj < 0.1): {(df['padj'] < 0.1).sum()}")
    
    # Check for extreme fold changes
    fc_stats = df['log2fold_treated_control'].describe()
    print("\n=== Fold Change Statistics ===")
    print(fc_stats)
    
    # Identify potential problematic results
    print("\n=== Potential Issues ===")
    problematic = df[
        (df['log2fold_treated_control'].abs() > 5) |  # Extreme fold changes
        (df['dispersion'] > 10) |                     # High dispersion
        (df['pvalue'].isna()) |                      # Missing p-values
        (df['stat'].abs() > 10000)                   # Extreme test statistics
    ]
    print(f"Number of potentially problematic results: {len(problematic)}")
    if len(problematic) > 0:
        print("\nSample of problematic results:")
        print(problematic[['groupID', 'featureID', 'log2fold_treated_control', 'dispersion', 'pvalue', 'stat']].head())
    
    # Create plots
    plt.style.use('default')
    
    # 1. P-value distribution
    plt.figure(figsize=(10, 6))
    plt.hist(df['pvalue'].dropna(), bins=50, edgecolor='black')
    plt.title('P-value Distribution')
    plt.xlabel('P-value')
    plt.ylabel('Frequency')
    plt.savefig(f"{output_dir}/pvalue_distribution.png")
    plt.close()
    
    # 2. Volcano plot
    plt.figure(figsize=(10, 6))
    plt.scatter(df['log2fold_treated_control'], 
               -np.log10(df['pvalue']),
               alpha=0.5)
    plt.title('Volcano Plot')
    plt.xlabel('Log2 Fold Change')
    plt.ylabel('-log10(p-value)')
    plt.savefig(f"{output_dir}/volcano_plot.png")
    plt.close()
    
    # 3. MA plot
    plt.figure(figsize=(10, 6))
    plt.scatter(df['exonBaseMean'],
               df['log2fold_treated_control'],
               alpha=0.5)
    plt.xscale('log')
    plt.title('MA Plot')
    plt.xlabel('Mean Expression')
    plt.ylabel('Log2 Fold Change')
    plt.savefig(f"{output_dir}/ma_plot.png")
    plt.close()
    
    # 4. Dispersion plot
    plt.figure(figsize=(10, 6))
    plt.scatter(df['exonBaseMean'],
               df['dispersion'],
               alpha=0.5)
    plt.xscale('log')
    plt.yscale('log')
    plt.title('Dispersion Plot')
    plt.xlabel('Mean Expression')
    plt.ylabel('Dispersion')
    plt.savefig(f"{output_dir}/dispersion_plot.png")
    plt.close()
    
    # Check for the problematic gene/exon mentioned in the error
    problem_genes = ['ENSG00000285404.1', 'ENSG00000100150.19', 
                    'ENSG00000128245.15', 'ENSG00000252909.1']
    
    print("\n=== Checking Problematic Genes ===")
    for gene in problem_genes:
        gene_results = df[df['groupID'].str.contains(gene, na=False)]
        if len(gene_results) > 0:
            print(f"\nResults for {gene}:")
            print(gene_results[['featureID', 'log2fold_treated_control', 
                              'pvalue', 'padj', 'dispersion']].head())
    
    # Save problematic results to file
    if len(problematic) > 0:
        problematic.to_csv(f"{output_dir}/problematic_results.csv")
        print(f"\nProblematic results saved to {output_dir}/problematic_results.csv")
    
    # Return summary statistics
    return {
        'total_tests': len(df),
        'significant_005': (df['padj'] < 0.05).sum(),
        'significant_01': (df['padj'] < 0.1).sum(),
        'na_pvalues': df['pvalue'].isna().sum(),
        'problematic_count': len(problematic),
        'median_dispersion': df['dispersion'].median(),
        'median_fold_change': df['log2fold_treated_control'].median()
    }

In [4]:
# Example usage
results_file = "output/dexseq_results_PW1_vs_combined_controls.csv"
df = pd.read_csv(results_file)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,groupID,featureID,exonBaseMean,dispersion,stat,pvalue,padj,control,treated,...,genomicData.strand,countData.EDO_1,countData.EDO_2,countData.EDO_3,countData.ND1_1,countData.ND1_2,countData.ND1_3,countData.PW1_1,countData.PW1_2,countData.PW1_3
0,ENSG00000000003.14:E001,ENSG00000000003.14,E001,120.179059,0.011335,2984.629607,0.0,0.0,16.638651,14.784409,...,-,123,139,86,83,106,98,119,182,189
1,ENSG00000000003.14:E002,ENSG00000000003.14,E002,1270.531715,0.006461,396.26092,8.975001e-87,1.2130459999999999e-86,30.813401,30.602511,...,-,1007,1237,687,906,1184,974,1419,2630,2220
2,ENSG00000000003.14:E003,ENSG00000000003.14,E003,444.716283,0.008585,1806.024855,0.0,0.0,23.933712,23.706313,...,-,375,434,254,277,397,359,471,926,802
3,ENSG00000000003.14:E004,ENSG00000000003.14,E004,1.981576,0.022925,3109.695893,0.0,0.0,2.492781,2.656301,...,-,1,2,1,1,4,0,2,5,4
4,ENSG00000000003.14:E005,ENSG00000000003.14,E005,358.92921,0.007125,2538.819848,0.0,0.0,22.574929,22.337963,...,-,289,347,196,233,334,297,393,719,654


In [8]:
summary_stats = check_dexseq_results(results_file)

Loading results file...

=== Basic Statistics ===
Total number of tests: 290520
Number of NA p-values: 0
Number of NA adjusted p-values: 0
Number of significant results (padj < 0.05): 287297
Number of significant results (padj < 0.1): 287832

=== Fold Change Statistics ===
count    290432.000000
mean         -0.168433
std           1.957209
min         -31.908590
25%          -0.154926
50%          -0.004989
75%           0.147049
max          28.618847
Name: log2fold_treated_control, dtype: float64

=== Potential Issues ===
Number of potentially problematic results: 20155

Sample of problematic results:
                groupID featureID  log2fold_treated_control  dispersion  \
45   ENSG00000000460.17      E001                 -8.061519    0.135117   
213  ENSG00000001617.12      E023                -11.586685    0.030460   
214  ENSG00000001617.12      E024                -11.586685    0.030460   
285  ENSG00000002330.13      E010                -10.580658    0.086691   
286  ENSG0000

  result = getattr(ufunc, method)(*inputs, **kwargs)



=== Checking Problematic Genes ===

Results for ENSG00000285404.1:
       featureID  log2fold_treated_control         pvalue           padj  \
286350      E001                       NaN   0.000000e+00   0.000000e+00   
286351      E002                       NaN   0.000000e+00   0.000000e+00   
286352      E003                       NaN   0.000000e+00   0.000000e+00   
286353      E004                       NaN  2.568590e-306  5.283282e-306   
286354      E005                       NaN  8.544953e-182  1.406783e-181   

        dispersion  
286350    0.062429  
286351    0.071076  
286352    0.071076  
286353    0.081908  
286354    0.153038  

Results for ENSG00000100150.19:
       featureID  log2fold_treated_control         pvalue           padj  \
286350      E001                       NaN   0.000000e+00   0.000000e+00   
286351      E002                       NaN   0.000000e+00   0.000000e+00   
286352      E003                       NaN   0.000000e+00   0.000000e+00   
286353      

In [9]:
# Print summary
print("\n=== Analysis Summary ===")
for key, value in summary_stats.items():
    print(f"{key}: {value}")


=== Analysis Summary ===
total_tests: 290520
significant_005: 287297
significant_01: 287832
na_pvalues: 0
problematic_count: 20155
median_dispersion: 0.02005845463844835
median_fold_change: -0.0049886364337101


In [19]:
# List of problematic genes from the error message
problematic_genes = ['ENSG00000285404.1', 'ENSG00000100150.19', 
                    'ENSG00000128245.15', 'ENSG00000252909.1']

# Create masks for each condition
extreme_fc_mask = (df['log2fold_treated_control'].abs() > 5)
high_disp_mask = (df['dispersion'] > 10)
missing_vals_mask = (
    df['log2fold_treated_control'].isna() |
    df['pvalue'].isna() |
    df['padj'].isna()
)
extreme_stat_mask = (df['stat'].abs() > 10000)
problematic_genes_mask = df['groupID'].str.contains('|'.join(problematic_genes), regex=True)

# Print counts for each condition
print(f"Records with extreme fold changes (>5): {extreme_fc_mask.sum()}")
print(f"Records with high dispersion (>10): {high_disp_mask.sum()}")
print(f"Records with missing values: {missing_vals_mask.sum()}")
print(f"Records with extreme test statistics (>10000): {extreme_stat_mask.sum()}")
print(f"Records from problematic genes: {problematic_genes_mask.sum()}")

Records with extreme fold changes (>5): 7704
Records with high dispersion (>10): 18
Records with missing values: 88
Records with extreme test statistics (>10000): 12509
Records from problematic genes: 88


In [21]:
# Examine records with extreme test statistics
extreme_stat_records = df[extreme_stat_mask][['groupID', 'featureID', 'stat', 'pvalue', 'padj', 'log2fold_treated_control', 'dispersion']]
print("\nSample of records with extreme test statistics:")
print(extreme_stat_records.head())

# Check if these extreme statistics correspond to very small p-values
print("\nP-value distribution for extreme test statistics:")
print(extreme_stat_records['pvalue'].describe())


Sample of records with extreme test statistics:
                groupID featureID          stat  pvalue  padj  \
453  ENSG00000003393.15      E003  14198.396557     0.0   0.0   
454  ENSG00000003393.15      E004  14819.648036     0.0   0.0   
455  ENSG00000003393.15      E005  13040.469603     0.0   0.0   
456  ENSG00000003393.15      E006  10972.388415     0.0   0.0   
457  ENSG00000003393.15      E007  14404.573893     0.0   0.0   

     log2fold_treated_control  dispersion  
453                 -0.048457    0.001974  
454                  0.042537    0.001905  
455                 -0.007857    0.002467  
456                 -0.023938    0.002879  
457                 -0.017551    0.002282  

P-value distribution for extreme test statistics:
count    12509.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: pvalue, dtype: float64


In [13]:
# List of problematic genes from the error message
problematic_genes = ['ENSG00000285404.1', 'ENSG00000100150.19', 
                    'ENSG00000128245.15', 'ENSG00000252909.1']

# Create a clean DataFrame by removing:
# 1. Records with extreme fold changes
# 2. Records with high dispersion
# 3. Records with missing values
# 4. Records with extreme test statistics
# 5. The specific problematic genes
clean_df = df[
    # Remove extreme fold changes (keeping values between -5 and 5)
    # (df['log2fold_treated_control'].abs() <= 5) &
    # Remove high dispersion
    # (df['dispersion'] <= 10) &
    # Remove missing values
    (df['log2fold_treated_control'].notna()) &
    (df['pvalue'].notna()) &
    (df['padj'].notna()) &
    # Remove extreme test statistics
    # (df['stat'].abs() <= 10000) &
    # Remove problematic genes
    (~df['groupID'].str.contains('|'.join(problematic_genes), regex=True))
]

# Print summary of filtering
print(f"Original number of records: {len(df)}")
print(f"Number of records after cleaning: {len(clean_df)}")
print(f"Number of records removed: {len(df) - len(clean_df)}")

# Save cleaned results to new CSV file
output_file = "output/dexseq_results_PW1_vs_combined_controls_cleaned_permisive.csv"
clean_df.to_csv(output_file, index=False)
print(f"\nCleaned results saved to: {output_file}")

# Print summary statistics of cleaned data
print("\nSummary of cleaned results:")
print(f"Number of significant results (padj < 0.05): {(clean_df['padj'] < 0.05).sum()}")
print(f"Number of significant results (padj < 0.1): {(clean_df['padj'] < 0.1).sum()}")
print("\nFold change statistics of cleaned data:")
print(clean_df['log2fold_treated_control'].describe())

Original number of records: 290520
Number of records after cleaning: 290432
Number of records removed: 88

Cleaned results saved to: output/dexseq_results_PW1_vs_combined_controls_cleaned_permisive.csv

Summary of cleaned results:
Number of significant results (padj < 0.05): 287209
Number of significant results (padj < 0.1): 287744

Fold change statistics of cleaned data:
count    290432.000000
mean         -0.168433
std           1.957209
min         -31.908590
25%          -0.154926
50%          -0.004989
75%           0.147049
max          28.618847
Name: log2fold_treated_control, dtype: float64


In [17]:
extreme_stat_records['groupID'].value_counts()

groupID
ENSG00000224078.15+ENSG00000277785.1+ENSG00000207093.1+ENSG00000128739.22+ENSG00000274640.1+ENSG00000207137.1+ENSG00000251815.1+ENSG00000278123.1+ENSG00000279050.1+ENSG00000278715.1+ENSG00000276314.1+ENSG00000275524.1+ENSG00000207014.1+ENSG00000273173.5+ENSG00000207460.1+ENSG00000207279.1+ENSG00000276844.1+ENSG00000275529.1+ENSG00000260780.1+ENSG00000251896.1+ENSG00000252277.1+ENSG00000275127.1+ENSG00000207263.1+ENSG00000286110.1+ENSG00000207442.1+ENSG00000207001.1+ENSG00000214265.11+ENSG00000207063.1+ENSG00000273835.1+ENSG00000261069.3+ENSG00000259905.7    245
ENSG00000151914.20                                                                                                                                                                                                                                                                                                                                                                                                                      

In [10]:
# List of problematic genes from the error message
problematic_genes = ['ENSG00000285404.1', 'ENSG00000100150.19', 
                    'ENSG00000128245.15', 'ENSG00000252909.1']

# Create a clean DataFrame by removing:
# 1. Records with extreme fold changes
# 2. Records with high dispersion
# 3. Records with missing values
# 4. Records with extreme test statistics
# 5. The specific problematic genes
clean_df = df[
    # Remove extreme fold changes (keeping values between -5 and 5)
    (df['log2fold_treated_control'].abs() <= 5) &
    # Remove high dispersion
    (df['dispersion'] <= 10) &
    # Remove missing values
    (df['log2fold_treated_control'].notna()) &
    (df['pvalue'].notna()) &
    (df['padj'].notna()) &
    # Remove extreme test statistics
    (df['stat'].abs() <= 10000) &
    # Remove problematic genes
    (~df['groupID'].str.contains('|'.join(problematic_genes), regex=True))
]

# Print summary of filtering
print(f"Original number of records: {len(df)}")
print(f"Number of records after cleaning: {len(clean_df)}")
print(f"Number of records removed: {len(df) - len(clean_df)}")

# Save cleaned results to new CSV file
output_file = "output/dexseq_results_PW1_vs_combined_controls_cleaned.csv"
clean_df.to_csv(output_file, index=False)
print(f"\nCleaned results saved to: {output_file}")

# Print summary statistics of cleaned data
print("\nSummary of cleaned results:")
print(f"Number of significant results (padj < 0.05): {(clean_df['padj'] < 0.05).sum()}")
print(f"Number of significant results (padj < 0.1): {(clean_df['padj'] < 0.1).sum()}")
print("\nFold change statistics of cleaned data:")
print(clean_df['log2fold_treated_control'].describe())

Original number of records: 290520
Number of records after cleaning: 270277
Number of records removed: 20243

Cleaned results saved to: output/dexseq_results_PW1_vs_combined_controls_cleaned.csv

Summary of cleaned results:
Number of significant results (padj < 0.05): 267114
Number of significant results (padj < 0.1): 267637

Fold change statistics of cleaned data:
count    270277.000000
mean         -0.001579
std           0.493325
min          -4.988818
25%          -0.145721
50%          -0.001390
75%           0.151192
max           4.984485
Name: log2fold_treated_control, dtype: float64
