# Environment

In [5]:
import pandas as pd
import numpy as np
import gzip
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
import gc

In [None]:
# Set the current working directory to the project root so the relative
# paths used by later cells (e.g. ./DATA/GTEx/...) resolve.
# NOTE(review): this absolute path is machine-specific; adjust it when running elsewhere.
os.chdir('/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_SRRM3')

# Print the current working directory to confirm the change
print(f"Current working directory: {os.getcwd()}")


# Local Functions

In [7]:
def load_gtex_data(file_path):
    """
    Load a GTEx .gct.gz file into pandas DataFrames.

    The file is read as: line 1 = format version, line 2 = tab-separated
    dimensions, line 3 = column header, followed by one row per feature.
    The first two columns are treated as identifiers/descriptions and all
    remaining columns as per-sample values.

    Parameters:
    file_path (str): Path to the GTEx gct.gz file

    Returns:
    tuple: (expression_df, description_df, metadata_dict)
        expression_df: sample columns only, indexed by the first column
        description_df: the first two columns, under their original names
        metadata_dict: keys 'version', 'n_genes' and 'n_samples'

    Raises:
    ValueError: If the dimension line has fewer than two fields, or the
        first two fields are not integers
    """
    # Read the two metadata lines that precede the table header
    with gzip.open(file_path, 'rt') as f:
        version = f.readline().strip()  # First line: version
        dims = f.readline().strip().split('\t')  # Second line: dimensions

    # Only the first two fields are the row/sample counts. Some GCT variants
    # (e.g. version 1.3) append further fields, so unpacking the whole line
    # into exactly two names would fail on them.
    if len(dims) < 2:
        raise ValueError(
            f"Expected at least two dimension fields in {file_path}, got {dims!r}"
        )
    n_genes, n_samples = (int(value) for value in dims[:2])

    # Read the actual data, skipping the first two metadata lines
    data_df = pd.read_csv(file_path,
                          compression='gzip',
                          sep='\t',
                          skiprows=2)

    # Separate the two leading annotation columns from the expression data.
    # The small annotation frame is copied so it is independent of data_df.
    description_df = data_df.iloc[:, :2].copy()
    expression_df = data_df.iloc[:, 2:]

    # Use the first column as the row index of the expression matrix
    expression_df.index = data_df.iloc[:, 0]

    metadata = {
        'version': version,
        'n_genes': n_genes,
        'n_samples': n_samples
    }

    return expression_df, description_df, metadata

def basic_analysis(expression_df):
    """
    Summarise an expression matrix (rows = features, columns = samples).

    Parameters:
    expression_df (pd.DataFrame): Expression data matrix

    Returns:
    dict: Summary with the keys 'total_transcripts', 'total_samples',
        'mean_expression' (mean of the per-column means),
        'median_expression' (median of the per-column medians),
        'non_zero_fraction' (percentage of values > 0, averaged per column),
        'genes_detected_per_sample' (per-column count of values > 0) and
        'sample_stats' (the frame's describe() table)
    """
    row_count = len(expression_df)
    column_count = len(expression_df.columns)

    # Boolean mask of detected (strictly positive) values
    detected = expression_df > 0

    column_means = expression_df.mean()
    column_medians = expression_df.median()

    return {
        'total_transcripts': row_count,
        'total_samples': column_count,
        'mean_expression': column_means.mean(),
        'median_expression': column_medians.median(),
        'non_zero_fraction': detected.mean().mean() * 100,
        'genes_detected_per_sample': detected.sum(),
        'sample_stats': expression_df.describe(),
    }

def filter_low_expression(expression_df, min_tpm=1, min_samples=10):
    """
    Keep only the rows that are expressed in enough samples.

    Parameters:
    expression_df (pd.DataFrame): Expression data matrix
    min_tpm (float): Expression threshold; a sample counts when its value
        is greater than or equal to this
    min_samples (int): Minimum number of counting samples a row needs

    Returns:
    pd.DataFrame: Rows of expression_df that pass the filter
    """
    # Per row: how many samples reach the threshold
    passing_counts = expression_df.ge(min_tpm).sum(axis=1)

    # Row-wise keep/drop decision
    keep_rows = passing_counts.ge(min_samples)

    return expression_df.loc[keep_rows]

def plot_expression_distribution(expression_df, output_file=None):
    """
    Draw a histogram of log2(mean expression + 1) across rows.

    Parameters:
    expression_df (pd.DataFrame): Expression data matrix
    output_file (str): Optional path to save the plot
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # One mean value per row, then log2 with a pseudocount of 1
    row_means = expression_df.mean(axis=1)
    log_row_means = np.log2(row_means + 1)

    sns.histplot(log_row_means, bins=50, ax=ax)
    ax.set_xlabel('Log2(TPM + 1)')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Mean Expression Values')

    # Only write to disk when a path was supplied
    if output_file:
        fig.savefig(output_file)

    # Close the figure so it is neither displayed nor kept open
    plt.close(fig)

In [8]:
# %% Function to load brain region data
def load_brain_data(file_list):
    """
    Load GTEx brain region-specific data files

    The region name is the text between 'gene_tpm_2017-06-05_v8_brain_' and
    '.gct.gz' in the path. Paths that do not contain that prefix fall back to
    the file's base name (without '.gct.gz') instead of aborting the whole
    load with an IndexError.

    Files that fail to load are reported on stdout and skipped.

    Parameters:
    file_list (list): List of file paths for brain region data

    Returns:
    dict: Maps region name to a dict with the keys 'expression',
        'description' and 'metadata' as returned by load_gtex_data
    """
    prefix = 'gene_tpm_2017-06-05_v8_brain_'
    suffix = '.gct.gz'
    brain_data = {}

    for file_path in file_list:
        # Extract region name from file path
        pieces = file_path.split(prefix)
        if len(pieces) > 1:
            region = pieces[1].split(suffix)[0]
        else:
            # Unexpected naming scheme: derive a usable key from the file name
            region = os.path.basename(file_path).split(suffix)[0]
        print(f"Loading data for {region}...")

        try:
            expr_df, desc_df, meta = load_gtex_data(file_path)
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next file
            print(f"Error loading {region}: {str(e)}")
        else:
            brain_data[region] = {
                'expression': expr_df,
                'description': desc_df,
                'metadata': meta
            }

    return brain_data

# %% Modified function to load GCT files
def load_gtex_data(file_path):
    """
    Load a GTEx .gct.gz file into pandas DataFrames.

    Parameters:
    file_path (str): Path to the GTEx gct.gz file

    Returns:
    tuple: (expression_df, description_df, metadata_dict)
        expression_df: every column after the first two, indexed by the
            values of the first column
        description_df: copy of the first two columns, renamed to
            'gene_id' and 'gene_name' whatever their names in the file
        metadata_dict: keys 'version', 'n_genes' and 'n_samples'
    """
    # Read the two metadata lines that precede the table header
    with gzip.open(file_path, 'rt') as f:
        version = f.readline().strip()  # First line: version
        dims_line = f.readline().strip().split('\t')  # Second line: dimensions

    # Parse dimensions - take the first two numbers and ignore any extra fields
    n_genes = int(dims_line[0])
    n_samples = int(dims_line[1])

    # Read the actual data, skipping the two metadata lines
    data_df = pd.read_csv(file_path,
                          compression='gzip',
                          sep='\t',
                          skiprows=2)

    # The first column supplies the row index of the expression matrix
    first_col = data_df.columns[0]

    # Separate the two leading annotation columns and name them consistently
    description_df = data_df.iloc[:, :2].copy()
    description_df.columns = ['gene_id', 'gene_name']

    # Everything after the two annotation columns is kept as expression data
    expression_df = data_df.iloc[:, 2:]
    expression_df.index = data_df[first_col]

    metadata = {
        'version': version,
        'n_genes': n_genes,
        'n_samples': n_samples
    }

    return expression_df, description_df, metadata

# Load - full GTEx data

[The GTEx Analysis V8 release](https://gtexportal.org/home/downloads/adult-gtex/bulk_tissue_expression)

In [9]:
%%script false --no-raise-error
# This cell is skipped: it is handed to the `false` program instead of being
# executed by Python. Remove the magic above to re-read the original file.
file_path = "./DATA/GTEx/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz"

# Load data
expression_df, description_df, metadata = load_gtex_data(file_path)

In [10]:
# Location of the pickled (expression_df, description_df, metadata) tuple
pickle_file = "./DATA/GTEx/gtex_data.pkl"

In [11]:
%%script false --no-raise-error
# Skipped cell: writes the freshly loaded tuple to pickle_file.
with open(pickle_file, 'wb') as f:
    pickle.dump((expression_df, description_df, metadata), f)
print("Data loaded from original file and saved to pickle file.")

In [None]:
# Restore the three objects from the pickle. If the file is missing only a
# message is printed, so later cells will not find these names defined.
if os.path.exists(pickle_file):
    # Load data from pickle file if it exists
    with open(pickle_file, 'rb') as f:
        expression_df, description_df, metadata = pickle.load(f)
    print("Data loaded from pickle file.")
else:
    print("Pickle file does not exist. Please run the script to create it.")

In [None]:
# Print basic information taken from the file header metadata
print(f"Dataset version: {metadata['version']}")
print(f"Number of transcripts: {metadata['n_genes']}")
print(f"Number of samples: {metadata['n_samples']}")

In [None]:
# Show the container type and the first rows of the expression matrix
print(type(expression_df))
expression_df.head()


Genes metadata

In [None]:
# Show the container type and the first rows of the annotation columns
print(type(description_df))
description_df.head()


In [None]:
# Only the last expression of a cell is displayed, so the type() call on the
# next line produces no output; the metadata dict itself is shown.
type(metadata)
metadata

In [17]:
%%script false --no-raise-error
# Skipped cell (handed to the `false` program rather than run by Python).
# Perform basic analysis
analysis_results = basic_analysis(expression_df)
print("\nBasic Analysis Results:")
# Print every entry except DataFrame values (the describe() table)
for key, value in analysis_results.items():
    if not isinstance(value, pd.DataFrame):
        print(f"{key}: {value}")

In [18]:
%%script false --no-raise-error
# Skipped cell (handed to the `false` program rather than run by Python).
# Filter low expression
filtered_df = filter_low_expression(expression_df, min_tpm=1, min_samples=10)
print(f"\nTranscripts after filtering: {len(filtered_df)}")

# Plot expression distribution
plot_expression_distribution(filtered_df, "expression_distribution.png")

# Look for SRRM3 - full GTEx data

# Examined ACCESSION numbers:

## From NCBI:

- [NM_001110199.3](https://www.ncbi.nlm.nih.gov/nuccore/NM_001110199.3/) (Ensembl match: ENST00000611745.2/ENSP00000480851.1)
- [NM_001291831.2](https://www.ncbi.nlm.nih.gov/nuccore/NM_001291831.2/) (None)

## From Ensembl:

- [ENSG00000177679](https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000177679;r=7:76201896-76287288)


In [19]:
# %% Define the canonical and noncanonical NCBI IDs
# NOTE(review): these two names are not referenced by any cell visible in this notebook.
canonical_refseq = "NM_001110199.3"
noncanonical_refseq = "NM_001291831.2"

In [None]:
# %% Look for SRRM3 - Ensembl ID
# Substring match on the 'gene_id' column; rows with a missing gene_id are excluded (na=False)
srrm3_entries = description_df[description_df['gene_id'].str.contains('ENSG00000177679', na=False)]
print("SRRM3 transcripts found:")
print(srrm3_entries)

In [None]:
# %% Get expression data for these transcripts
# Rows are selected by label, using the 'transcript_id' values of the matches
srrm3_transcripts = expression_df.loc[srrm3_entries['transcript_id']]
print("\nExpression data shape:", srrm3_transcripts.shape)

# %% Get basic statistics for each transcript
# axis=1 aggregates across the columns, giving one value per transcript row
stats = pd.DataFrame({
    'mean': srrm3_transcripts.mean(axis=1),
    'median': srrm3_transcripts.median(axis=1),
    'std': srrm3_transcripts.std(axis=1),
    'non_zero_samples': (srrm3_transcripts > 0).sum(axis=1)
})
print("\nTranscript statistics:")
print(stats)

# %% Look for our transcript of interest (MANE Select)
mane_select = "ENST00000611745"  # This corresponds to NM_001110199.3 (variant without exon 15)
# Prefix match, so an ID with a trailing version suffix is still found
mane_transcripts = [t for t in srrm3_transcripts.index if t.startswith(mane_select)]
if mane_transcripts:
    print(f"\nFound MANE Select transcript: {mane_transcripts[0]}")
    print(f"Mean expression: {srrm3_transcripts.loc[mane_transcripts[0]].mean():.2f} TPM")
else:
    print(f"\nMANE Select transcript {mane_select} not found")

In [None]:
# %% Plot expression distribution
plt.figure(figsize=(12, 6))
# Transpose so that each transcript becomes a column, i.e. one box per transcript
sns.boxplot(data=srrm3_transcripts.T)
plt.xticks(rotation=45, ha='right')
plt.title('Expression Distribution of SRRM3 Transcripts')
plt.ylabel('TPM')
plt.tight_layout()
plt.show()

# From the literature:
- [Pancreatic microexons regulate islet function and glucose homeostasis](https://www.nature.com/articles/s42255-022-00734-2)
    - **ENST00000611745:** This isoform contains the **enhancer of microexons domain (eMIC)**.
    - **ENST00000612155:** This isoform has a **truncated eMIC domain**.

In [None]:
# %% Find eMIC transcripts
# Substring match on 'transcript_id', so any version suffix is accepted
srrm3_eMIC = description_df[description_df['transcript_id'].str.contains('ENST00000611745', na=False)]
print("SRRM3 transcripts found:")
print(srrm3_eMIC)

In [None]:
# %% Find truncated eMIC transcripts
# Same substring match for the second transcript of interest
srrm3_truncated = description_df[description_df['transcript_id'].str.contains('ENST00000612155', na=False)]
print("SRRM3 transcripts found:")
print(srrm3_truncated)

# Add tissue metadata

In [25]:
# %% Load the sample attributes file
# Tab-separated table; the cells below use its SAMPID, SMTS and SMTSD columns
sample_attributes = pd.read_csv('./DATA/GTEx/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', sep='\t')

In [None]:
# Compare the size of the annotation table with the expression matrix
print(sample_attributes.shape)  
print(expression_df.shape)

In [None]:
sample_attributes.head()

In [None]:
# Number of annotated samples per SMTSD value
sample_attributes.SMTSD.value_counts()

In [None]:
# Same count, restricted to rows whose SMTS value is "Brain"
sample_attributes[sample_attributes.SMTS == "Brain"].SMTSD.value_counts()

In [30]:
# %% Create a mapping dictionary from sample ID to tissue type
tissue_mapping = pd.Series(sample_attributes.SMTSD.values, 
                          index=sample_attributes.SAMPID).to_dict()

In [None]:
# Preview the mapped tissue for the first 10 expression columns;
# column names missing from the mapping come back as NaN
pd.Series(expression_df.columns).map(tissue_mapping)[:10]

In [32]:
# %% Create MultiIndex dataframe with tissue information
def get_tissue_types(expression_df, sample_attributes):
    """
    Return a copy of expression_df whose columns carry tissue labels.

    The new columns are a two-level MultiIndex named
    ('tissue_type', 'sample_id'): the tissue is looked up from
    sample_attributes (SAMPID -> SMTSD) and the sample ID is the original
    column name. Samples without an entry get a missing tissue label.
    The input frame is left untouched.
    """
    # SAMPID -> SMTSD lookup; for repeated sample IDs the last row wins
    sample_to_tissue = dict(zip(sample_attributes.SAMPID, sample_attributes.SMTSD))

    sample_ids = expression_df.columns
    tissue_labels = pd.Series(sample_ids).map(sample_to_tissue)

    # Work on a copy so the caller's frame keeps its flat columns
    annotated = expression_df.copy()
    annotated.columns = pd.MultiIndex.from_arrays(
        [tissue_labels, sample_ids],
        names=['tissue_type', 'sample_id'],
    )
    return annotated

In [None]:
# Run a garbage-collection pass before the large copy made below
gc.collect()

In [None]:
# %% Apply the transformation
# Produces a copy of expression_df with (tissue_type, sample_id) columns
expression_with_tissue = get_tissue_types(expression_df, sample_attributes)

In [None]:
expression_with_tissue.head()

In [None]:
expression_with_tissue.index

In [None]:
# Keep the part of each index label before the first '.' (drops a version suffix).
# This adds a non-sample column to a frame whose other columns are sample values.
# NOTE(review): other cells look this index up with ENST... transcript IDs, so
# this 'gene_id' column may actually hold transcript IDs — confirm.
expression_with_tissue["gene_id"] = expression_with_tissue.index.str.split('.').str[0]


In [None]:
import mygene

# Use the version-stripped IDs stored in the "gene_id" column
sel_genes_no_version = expression_with_tissue["gene_id"]

# Initialize mygene client and query
# Only the unique IDs are sent, so `results` is not aligned with the frame's rows.
# NOTE(review): scopes='ensembl.gene' expects Ensembl gene IDs; if "gene_id"
# holds transcript IDs the lookups will not match — confirm.
mg = mygene.MyGeneInfo()
results = mg.querymany(sel_genes_no_version.unique(), 
                      scopes='ensembl.gene', 
                      fields='symbol', 
                      species='human')

# Create mapping dictionary from gene ID to symbol
# Entries without a 'symbol' field are left out of the mapping
gene_to_symbol = {item['query']: item.get('symbol', '') 
                 for item in results if 'symbol' in item}

# Map symbols to expression data
# IDs absent from the mapping become NaN
expression_with_tissue["gene_name"] = expression_with_tissue["gene_id"].map(gene_to_symbol)

In [None]:
# Inspect the annotated frame.
# The "gene_name" column is already filled by mapping "gene_id" through
# gene_to_symbol. It must not be overwritten positionally from `results`:
# that list holds one entry per *unique* queried ID (and possibly several hits
# per ID), so it is not aligned with the rows of expression_with_tissue and
# the assignment would either misalign symbols or fail on a length mismatch.
expression_with_tissue.head()

# Expression across tissues
(`ENST00000611745.1`, `ENST00000612155.1`)

In [30]:
# Define transcripts of interest
# Full, versioned IDs: they must match the expression index labels exactly
transcripts_of_interest = ['ENST00000611745.1', 'ENST00000612155.1']

In [None]:
%%capture
# %% Calculate mean expression by tissue
# Groups the columns by their 'tissue_type' level; %%capture hides this cell's output.
# NOTE(review): tissue_means is not used by any later cell visible here, and
# groupby(..., axis=1) is deprecated in recent pandas releases — confirm this cell is still needed.
tissue_means = expression_with_tissue.groupby(level='tissue_type', axis=1).mean()

In [38]:
def analyze_transcript_expression(expression_with_tissue, transcripts):
    """
    Summarise the expression of specific transcripts within each tissue.

    Only numeric columns with a non-missing 'tissue_type' label are treated
    as samples. Annotation columns added to the frame (for example string
    'gene_id' / 'gene_name' columns) and samples without a tissue label are
    ignored rather than being processed as if they were tissues.

    Parameters:
    expression_with_tissue (pd.DataFrame): Expression matrix whose columns
        are a MultiIndex with a 'tissue_type' level; rows are indexed by
        transcript ID
    transcripts (list): Index labels of the transcripts to summarise

    Returns:
    pd.DataFrame: One row per (tissue, transcript) with the columns
        'tissue_type', 'transcript_id', 'mean', 'std' and 'count'. The
        columns are present even when there is nothing to summarise.

    Raises:
    KeyError: If a requested transcript is not in the index
    """
    # Get expression data for specified transcripts
    transcript_data = expression_with_tissue.loc[transcripts]

    # Decide which columns are real sample columns
    tissue_labels = transcript_data.columns.get_level_values('tissue_type')
    is_numeric = transcript_data.dtypes.map(
        pd.api.types.is_numeric_dtype
    ).to_numpy(dtype=bool)
    keep = tissue_labels.notna() & is_numeric

    sample_data = transcript_data.loc[:, keep]
    sample_tissues = tissue_labels[keep]

    # Calculate statistics for each tissue, in order of first appearance
    results = []
    for tissue in sample_tissues.unique():
        tissue_data = sample_data.loc[:, sample_tissues == tissue]

        for transcript in transcripts:
            transcript_values = tissue_data.loc[transcript]
            results.append({
                'tissue_type': tissue,
                'transcript_id': transcript,
                'mean': transcript_values.mean(),
                'std': transcript_values.std(),
                'count': len(transcript_values)
            })

    # Convert to DataFrame with a fixed column layout
    return pd.DataFrame(
        results,
        columns=['tissue_type', 'transcript_id', 'mean', 'std', 'count']
    )

In [None]:
# Run a garbage-collection pass before the next computation
gc.collect()

In [39]:
# Get tissue means
# Result has one row per (tissue, transcript) with mean, std and count
transcript_expression = analyze_transcript_expression(expression_with_tissue, transcripts_of_interest)

In [None]:
# %% Plot expression across tissues
# NOTE(review): transcript_expression is already aggregated to a single row per
# bar, so errorbar='sd' has only one value per bar to work with, and the
# precomputed 'std' column is not used here — confirm whether error bars are expected.
plt.figure(figsize=(15, 8))
sns.barplot(data=transcript_expression, 
           x='tissue_type', 
           y='mean',
           hue='transcript_id',
           errorbar='sd',
           capsize=0.1,
           saturation=0.8)

plt.xticks(rotation=90)
plt.xlabel('Tissue')
plt.ylabel('Mean Expression (TPM)')
plt.title('SRRM3 Transcript Expression Across Tissues')
# Place the legend outside the axes, to the right of the plot
plt.legend(title='Transcript ID', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Print summary statistics
print("\nSummary statistics for each transcript:")
for transcript in transcripts_of_interest:
    # Rows of the per-tissue summary that belong to this transcript
    transcript_data = transcript_expression[transcript_expression['transcript_id'] == transcript]
    print(f"\n{transcript}:")
    # idxmax returns the row label of the largest per-tissue mean
    print(f"Highest expression in: {transcript_data.loc[transcript_data['mean'].idxmax(), 'tissue_type']}")
    # This is the unweighted average of the per-tissue means
    print(f"Mean expression across all tissues: {transcript_data['mean'].mean():.2f}")
    print("\nTop 5 tissues by expression:")
    print(transcript_data.nlargest(5, 'mean')[['tissue_type', 'mean', 'std']])

In [None]:
# %% Create heatmap
plt.figure(figsize=(15, 8))
# Reshape to one row per transcript and one column per tissue, holding the mean;
# pivot_data is reused by the brain-specific cell that follows
pivot_data = transcript_expression.pivot(
    index='transcript_id', 
    columns='tissue_type', 
    values='mean'
)
sns.heatmap(pivot_data, 
            cmap='YlOrRd',
            annot=False,
            cbar_kws={'label': 'Mean Expression (TPM)'})
plt.title('SRRM3 Transcript Expression Heatmap')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# %% Add brain-specific analysis
# Keep only the tissue columns whose label mentions 'Brain'
brain_tissues = [col for col in pivot_data.columns if 'Brain' in col]
if brain_tissues:
    brain_data = pivot_data[brain_tissues]

    # Visualize brain-specific expression.
    # DataFrame.plot opens its own figure unless it is given an `ax`, so the
    # size is passed to it directly; a separate plt.figure(figsize=...) call
    # beforehand would only leave an empty extra figure and have no effect
    # on the bar chart.
    ax = brain_data.T.plot(kind='bar', figsize=(10, 6))
    ax.set_title('SRRM3 Transcript Expression in Brain Tissues')
    ax.set_xlabel('Brain Region')
    ax.set_ylabel('Mean Expression (TPM)')
    # Place the legend outside the axes, to the right of the plot
    ax.legend(title='Transcript ID', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Load - different brain regions GTEx data

[Gene TPMs by tissue](https://gtexportal.org/home/downloads/adult-gtex/bulk_tissue_expression)

## Load data

In [53]:
%%script false --no-raise-error
# Skipped cell (handed to the `false` program rather than run by Python).
# %% Load brain region data
# Bare file names: they are resolved relative to the current working directory
brain_files = [
    "gene_tpm_2017-06-05_v8_brain_amygdala.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_nucleus_accumbens_basal_ganglia.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_anterior_cingulate_cortex_ba24.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_putamen_basal_ganglia.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_caudate_basal_ganglia.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_spinal_cord_cervical_c-1.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_cerebellar_hemisphere.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_substantia_nigra.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_cerebellum.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_cortex.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_frontal_cortex_ba9.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_hippocampus.gct.gz",
    "gene_tpm_2017-06-05_v8_brain_hypothalamus.gct.gz"
]

# Maps region name -> {'expression', 'description', 'metadata'}
brain_data = load_brain_data(brain_files)

In [54]:
# Pickle file for the per-region dictionary (relative to the working directory)
brain_pickle = "gtex_brain_data.pkl"

In [55]:
%%script false --no-raise-error
# Skipped cell (handed to the `false` program rather than run by Python).
# %% Save to pickle
print("Saving brain data to pickle file...")
with open(brain_pickle, 'wb') as f:
    pickle.dump(brain_data, f)
print("Data saved.")

In [None]:
# %% Load brain data from pickle
# Unlike the full-data cell there is no existence check here: a missing file raises
print("Loading brain data from pickle file...")
with open(brain_pickle, 'rb') as f:
    brain_data = pickle.load(f)
print("Data loaded.")

In [None]:
# %% Verify the data format
# Inspect the first region stored in the dictionary
region = list(brain_data.keys())[0]
print(f"\nRegion: {region}")
print("\nDescription data shape:")
print(brain_data[region]['description'].shape)
print("\nExpression data shape:")
print(brain_data[region]['expression'].shape)
print("\n---")

In [58]:
%%script false --no-raise-error
# Skipped cell: same check as above, printing the first rows instead of the shapes.
# %% Verify the data format
region = list(brain_data.keys())[0]
print(f"\nRegion: {region}")
print("\nDescription data format:")
print(brain_data[region]['description'].head())
print("\nExpression data format:")
print(brain_data[region]['expression'].head())
print("\n---")


In [None]:
# Keys stored for one region
brain_data["amygdala"].keys()

In [None]:
brain_data["amygdala"]["metadata"]

In [None]:
brain_data["nucleus_accumbens_basal_ganglia"]["description"].head()

In [None]:
brain_data["nucleus_accumbens_basal_ganglia"]["expression"].head()

In [63]:
# Add a column to associate gene_id with its gene_name from the description for all brain regions
# NOTE(review): this loop rebinds the notebook-level names `description_df` and
# `expression_df`, replacing whatever earlier cells stored under them.
for region in brain_data:
    description_df = brain_data[region]["description"]
    # Create mapping using index (id) to gene_name
    id_to_name = dict(zip(description_df["gene_id"], description_df["gene_name"]))

    # Create a new column 'gene_name' in expression_df for each region
    expression_df = brain_data[region]["expression"]
    
    # Add gene_name as a new column using the DataFrame index (which is now id)
    expression_df["gene_name"] = expression_df.index.map(id_to_name)

    # Move gene_name and Description columns to the beginning
    # Get all columns except gene_name and Description
    data_cols = [col for col in expression_df.columns if col not in ["gene_name", "Description"]]
    # Reorder columns with gene_name and Description first
    # NOTE(review): assumes every expression frame already has a 'Description'
    # column; otherwise the selection below raises KeyError — confirm against the input files.
    new_cols = ["gene_name", "Description"] + data_cols
    expression_df = expression_df[new_cols]

    # Update the expression data in brain_data
    brain_data[region]["expression"] = expression_df

In [None]:
brain_data["amygdala"]["expression"].head()

# Look for SRRM3 - different brain regions GTEx data
**Note: only one SRRM3 isoform is present across these datasets.**

In [None]:
# Look for records whose 'Description' column contains "SRRM3" (case-insensitive)
srrm3_records = brain_data["amygdala"]["expression"][brain_data["amygdala"]["expression"]["Description"].str.contains("SRRM3", case=False, na=False)]

print("\nRecords containing SRRM3 in gene_name:")
srrm3_records.head()

In [None]:
# Look for ENSG00000177679 in the gene_name column
srrm3_ensembl = "ENSG00000177679"
srrm3_info = brain_data["amygdala"]["expression"][brain_data["amygdala"]["expression"]["gene_name"].str.contains(srrm3_ensembl, case=False, na=False)]

print("\nInformation for SRRM3 (ENSG00000177679):")
srrm3_info.head()


In [None]:
# For each brain region, find the rows whose 'Description' column contains
# "SRRM3" and print their 'gene_name' values
for region in brain_data:
    srrm3_records = brain_data[region]["expression"][brain_data[region]["expression"]["Description"].str.contains("SRRM3", case=False, na=False)]
    
    print(f"\nSRRM3 gene_name for {region}:")
    print(srrm3_records["gene_name"])