In [1]:
import os
import pandas as pd
import re
from typing import Dict, List, Tuple

# Matches one attribute chunk: a run of characters up to the next ';' that is
# NOT inside a double-quoted string, so values such as "a;b" stay intact.
_GTF_ATTRIBUTE_CHUNK = re.compile(r'(?:[^;"]|"[^"]*")+')


def parse_gtf_attributes(attribute_string: str) -> Dict[str, str]:
    """Parse a GTF attribute string (column 9) into a dictionary.

    The string is a ';'-separated list of ``key value`` pairs where the value
    is usually wrapped in double quotes, e.g.
    ``gene_id "ENSG00000223972.5"; level 2;``.

    Args:
        attribute_string: Raw content of the GTF attribute column.

    Returns:
        Mapping of attribute key to value with surrounding whitespace and
        double quotes removed. Chunks that have no value are skipped. When a
        key occurs more than once, the last occurrence wins.
    """
    attributes: Dict[str, str] = {}
    for chunk in _GTF_ATTRIBUTE_CHUNK.findall(attribute_string):
        # Split on the first run of whitespace so that extra spaces between
        # key and value do not end up inside the value.
        parts = chunk.split(None, 1)
        if len(parts) != 2:
            # Blank chunk (e.g. after the trailing ';') or a key with no value.
            continue
        key, value = parts
        attributes[key] = value.strip().strip('"')
    return attributes

def create_tx2gene_from_gtf(gtf_file: str, output_file: str) -> pd.DataFrame:
    """
    Create transcript to gene mapping from GTF file.

    Only ``transcript`` feature lines are used. Everything from the first
    '.' onwards is removed from transcript and gene IDs (e.g. the version in
    ``ENST00000456328.2``). If the same transcript ID is seen more than once,
    the last line wins; a warning is printed when such duplicates point to
    different genes.

    Two files are written:
      * ``output_file``: header-less, tab-separated transcript_id / gene_id.
      * ``<output_file stem>_detailed<ext>``: all four columns with a header.

    Args:
        gtf_file: Path to input GTF file
        output_file: Path to output TSV file

    Returns:
        DataFrame with transcript to gene mapping (columns transcript_id,
        gene_id, gene_name, gene_type). It has these columns even when the
        GTF contains no usable transcript lines.
    """
    print(f"Processing GTF file: {gtf_file}")

    columns = ['transcript_id', 'gene_id', 'gene_name', 'gene_type']

    # transcript_id -> (gene_id, gene_name, gene_type)
    tx2gene_dict: Dict[str, Tuple[str, str, str]] = {}
    # Transcript IDs that were seen with more than one distinct gene ID.
    conflicting_transcripts = set()

    with open(gtf_file, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue

            fields = line.strip().split('\t')
            # Only transcript records contribute to the mapping, so skip
            # everything else before paying for attribute parsing.
            if len(fields) < 9 or fields[2] != 'transcript':
                continue

            attributes = parse_gtf_attributes(fields[8])
            transcript_id = attributes.get('transcript_id', '').split('.')[0]
            gene_id = attributes.get('gene_id', '').split('.')[0]
            if not (transcript_id and gene_id):
                continue

            gene_name = attributes.get('gene_name', '')
            # GENCODE uses "gene_type"; Ensembl-style GTFs call the same
            # attribute "gene_biotype", so fall back to it when needed.
            gene_type = (attributes.get('gene_type')
                         or attributes.get('gene_biotype', ''))

            previous = tx2gene_dict.get(transcript_id)
            if previous is not None and previous[0] != gene_id:
                conflicting_transcripts.add(transcript_id)

            tx2gene_dict[transcript_id] = (gene_id, gene_name, gene_type)

    # Passing explicit columns keeps the frame well-formed when no transcript
    # was found (otherwise the column selection below raises KeyError).
    tx2gene_df = pd.DataFrame(
        [(transcript_id, *info) for transcript_id, info in tx2gene_dict.items()],
        columns=columns,
    )

    # Save basic version (transcript_id, gene_id only)
    basic_df = tx2gene_df[['transcript_id', 'gene_id']]
    basic_df.to_csv(output_file, sep='\t', index=False, header=False)

    # Save detailed version. Derive the name from the file extension only, so
    # directory names are never rewritten and a path without ".tsv" does not
    # resolve to output_file itself (which would overwrite the basic file).
    root, ext = os.path.splitext(output_file)
    detailed_output = f"{root}_detailed{ext}"
    tx2gene_df.to_csv(detailed_output, sep='\t', index=False)

    print(f"\nBasic tx2gene file saved to: {output_file}")
    print(f"Detailed tx2gene file saved to: {detailed_output}")
    print("\nSummary:")
    print(f"Total transcripts: {len(tx2gene_df)}")
    print(f"Total genes: {tx2gene_df['gene_id'].nunique()}")
    print("\nGene type distribution:")
    print(tx2gene_df['gene_type'].value_counts().head())

    if conflicting_transcripts:
        print(f"\nWarning: {len(conflicting_transcripts)} transcript IDs were "
              "listed with more than one gene ID; the last occurrence was kept")

    return tx2gene_df

def validate_tx2gene(tx2gene_df: pd.DataFrame) -> None:
    """Validate the tx2gene mapping and print a report.

    Three checks are performed, each printing a warning only when it fails:
      1. Missing values (NaN/None or empty strings) per column.
      2. Transcript IDs that map to more than one distinct gene ID.
      3. IDs that do not match ``ENST<digits>`` / ``ENSG<digits>``.

    Args:
        tx2gene_df: DataFrame with at least the columns ``transcript_id``
            and ``gene_id``.

    Returns:
        None. The report is written to stdout.
    """
    print("\nValidation Report:")

    # Check for missing values. Empty strings are counted as missing as well,
    # because create_tx2gene_from_gtf stores '' for absent attributes.
    missing = (tx2gene_df.isnull() | tx2gene_df.eq('')).sum()
    if missing.sum() > 0:
        print("\nWarning: Missing values found:")
        print(missing[missing > 0])

    # Check for transcripts mapping to multiple genes
    transcript_counts = tx2gene_df.groupby('transcript_id')['gene_id'].nunique()
    multi_mapped = transcript_counts[transcript_counts > 1]
    if not multi_mapped.empty:
        print(f"\nWarning: {len(multi_mapped)} transcripts map to multiple genes")
        print("\nExample multi-mapped transcripts:")
        print(multi_mapped.head())

    # Check transcript/gene ID format. na=False makes missing IDs count as
    # "no match"; without it str.match yields NaN for them and the `~`
    # negation raises TypeError.
    invalid_transcript = ~tx2gene_df['transcript_id'].str.match(r'^ENST\d+$', na=False)
    invalid_gene = ~tx2gene_df['gene_id'].str.match(r'^ENSG\d+$', na=False)

    if invalid_transcript.any():
        print(f"\nWarning: {invalid_transcript.sum()} transcript IDs have unexpected format")
        print("\nExample invalid transcript IDs:")
        print(tx2gene_df[invalid_transcript]['transcript_id'].head())

    if invalid_gene.any():
        print(f"\nWarning: {invalid_gene.sum()} gene IDs have unexpected format")
        print("\nExample invalid gene IDs:")
        print(tx2gene_df[invalid_gene]['gene_id'].head())

# Example usage: build the tx2gene tables for the GENCODE v38 annotation,
# validate them, and report transcripts that are linked to several genes.
if __name__ == "__main__":
    # Define paths
    gtf_file = "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords/DATA/v38/gencode.v38.annotation.gtf"
    output_dir = "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords/DATA/v38"
    output_file = os.path.join(output_dir, "tx2gene.tsv")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Create tx2gene mapping
    tx2gene_df = create_tx2gene_from_gtf(gtf_file, output_file)

    # Validate the mapping
    validate_tx2gene(tx2gene_df)

    # Optional: analyze overlapping genes
    print("\nAnalyzing potential overlapping genes...")
    # Count DISTINCT genes per transcript. A plain row count would also flag
    # a transcript that merely appears twice with the same gene.
    transcript_gene_counts = tx2gene_df.groupby('transcript_id')['gene_id'].nunique()
    overlapping = transcript_gene_counts[transcript_gene_counts > 1]

    if not overlapping.empty:
        print(f"\nFound {len(overlapping)} transcripts associated with multiple genes")
        print("\nExample cases:")
        for transcript_id in overlapping.head().index:
            genes = tx2gene_df[tx2gene_df['transcript_id'] == transcript_id]
            print(f"\nTranscript: {transcript_id}")
            print(genes[['gene_id', 'gene_name', 'gene_type']])

Processing GTF file: /beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords/DATA/v38/gencode.v38.annotation.gtf

Basic tx2gene file saved to: /beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords/DATA/v38/tx2gene.tsv
Detailed tx2gene file saved to: /beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Snords/DATA/v38/tx2gene_detailed.tsv

Summary:
Total transcripts: 236853
Total genes: 60605

Gene type distribution:
gene_type
protein_coding                        161899
lncRNA                                 47657
processed_pseudogene                   10158
transcribed_unprocessed_pseudogene      3098
unprocessed_pseudogene                  2618
Name: count, dtype: int64

Validation Report:

Analyzing potential overlapping genes...
