In [6]:
import pandas as pd
import numpy as np
import gzip
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
import os

# Project directory that the notebook runs from. It defaults to the original
# location; set the SRRM3_PROJECT_DIR environment variable to run the notebook
# on another machine without editing this cell.
PROJECT_DIR = os.environ.get('SRRM3_PROJECT_DIR', '/home/michal/Raffaele/SRRM3')

# Relative paths used later in the notebook (the input .gct.gz file and the
# saved PNG) are resolved against this directory.
os.chdir(PROJECT_DIR)

# Print the current working directory to confirm the change
print(f"Current working directory: {os.getcwd()}")


In [8]:
def load_gtex_data(file_path):
    """
    Read a gzip-compressed GCT file and split it into values, annotations and header info.

    The file is expected to start with two header lines (a version tag, then
    two tab-separated integers) followed by a tab-separated table whose first
    row holds the column names.

    Parameters:
    file_path (str): Path to the GTEx gct.gz file

    Returns:
    tuple: (expression_df, description_df, metadata)
        expression_df: every column from the third onward, indexed by the
            values of the first column
        description_df: the first two columns of the table, with the default
            integer index
        metadata: dict with keys 'version', 'n_genes' and 'n_samples' taken
            from the two header lines

    Raises:
    ValueError: if the second header line does not hold exactly two integers
    """
    # Grab only the two header lines; the table itself is parsed by pandas below
    with gzip.open(file_path, 'rt') as handle:
        header_lines = [handle.readline().strip() for _ in range(2)]
    version, dims_line = header_lines

    # Second header line: "<row count>\t<column count>"
    n_genes, n_samples = (int(field) for field in dims_line.split('\t'))

    # Parse the table, stepping over the two header lines consumed above
    table = pd.read_csv(file_path, sep='\t', skiprows=2, compression='gzip')

    # Leading two columns are annotations; everything after them is expression values
    description_df = table.iloc[:, :2]
    expression_df = table.iloc[:, 2:]

    # Label the expression rows with the identifiers from the first column
    expression_df.index = table.iloc[:, 0]

    metadata = dict(version=version, n_genes=n_genes, n_samples=n_samples)

    return expression_df, description_df, metadata

def basic_analysis(expression_df):
    """
    Summarise an expression matrix with a handful of descriptive statistics.

    Parameters:
    expression_df (pd.DataFrame): Expression data matrix (rows are counted as
        transcripts, columns as samples)

    Returns:
    dict: Dictionary containing analysis results, with keys
        'total_transcripts': number of rows
        'total_samples': number of columns
        'mean_expression': mean of the per-column means
        'median_expression': median of the per-column medians
        'non_zero_fraction': mean over columns of the share of values > 0,
            expressed as a percentage (0-100)
        'genes_detected_per_sample': per-column count of values > 0 (Series)
        'sample_stats': per-column describe() table (DataFrame)
    """
    n_rows, n_cols = len(expression_df), len(expression_df.columns)

    # Column-wise aggregates first, then collapsed across columns
    overall_mean = expression_df.mean().mean()
    overall_median = expression_df.median().median()

    # Boolean mask of strictly positive values, shared by the two detection metrics
    is_detected = expression_df > 0
    pct_non_zero = is_detected.mean().mean() * 100
    detected_counts = is_detected.sum()

    analysis = {}
    analysis['total_transcripts'] = n_rows
    analysis['total_samples'] = n_cols
    analysis['mean_expression'] = overall_mean
    analysis['median_expression'] = overall_median
    analysis['non_zero_fraction'] = pct_non_zero
    analysis['genes_detected_per_sample'] = detected_counts
    analysis['sample_stats'] = expression_df.describe()
    return analysis

def filter_low_expression(expression_df, min_tpm=1, min_samples=10):
    """
    Keep only the rows that are expressed in enough samples.

    Parameters:
    expression_df (pd.DataFrame): Expression data matrix
    min_tpm (float): Expression threshold; a value counts when it is at or
        above this number
    min_samples (int): Minimum number of columns in a row that must reach
        min_tpm for the row to be kept

    Returns:
    pd.DataFrame: Rows of expression_df that satisfy the criterion
    """
    # Per row: how many columns reach the threshold
    n_passing = expression_df.ge(min_tpm).sum(axis=1)

    # Rows with enough passing columns survive
    keep_row = n_passing.ge(min_samples)
    return expression_df.loc[keep_row]

def plot_expression_distribution(expression_df, output_file=None):
    """
    Plot a histogram of log2-transformed mean expression per row.

    The mean is taken across columns for each row, transformed as
    log2(mean + 1), and drawn as a 50-bin histogram.

    Parameters:
    expression_df (pd.DataFrame): Expression data matrix
    output_file (str): Optional path to save the plot. When omitted (or
        empty), the figure is shown instead of being silently discarded.

    Returns:
    None
    """
    # Explicit figure/axes handles avoid depending on pyplot's "current figure"
    fig, ax = plt.subplots(figsize=(10, 6))

    try:
        # Calculate mean expression per transcript
        mean_expression = expression_df.mean(axis=1)

        # Plot distribution of log2 transformed mean expression
        sns.histplot(np.log2(mean_expression + 1), bins=50, ax=ax)
        ax.set_xlabel('Log2(TPM + 1)')
        ax.set_ylabel('Count')
        ax.set_title('Distribution of Mean Expression Values')

        if output_file:
            fig.savefig(output_file)
        else:
            # Without a destination, closing straight away would throw the
            # plot away unseen, so display it first.
            plt.show()
    finally:
        # Release the figure even if plotting or saving failed
        plt.close(fig)

In [None]:
# GTEx v8 transcript-level TPM matrix (RSEM v1.3.0), resolved against the working directory
file_path = "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz"

# Parse the archive once: the loader hands back the expression matrix,
# the two leading annotation columns, and the header metadata.
(expression_df, description_df, metadata) = load_gtex_data(file_path=file_path)

In [None]:
# Report the header information captured while loading, one item per line
print(
    f"Dataset version: {metadata['version']}",
    f"Number of transcripts: {metadata['n_genes']}",
    f"Number of samples: {metadata['n_samples']}",
    sep="\n",
)

In [None]:
# Compute the summary statistics for the full expression matrix
analysis_results = basic_analysis(expression_df)

print("\nBasic Analysis Results:")
for key, value in analysis_results.items():
    # DataFrame-valued entries are kept in analysis_results but not printed here
    if isinstance(value, pd.DataFrame):
        continue
    print(f"{key}: {value}")

In [None]:
# Keep rows that reach at least 1 TPM in 10 or more samples
filtered_df = filter_low_expression(
    expression_df,
    min_tpm=1,
    min_samples=10,
)
print(f"\nTranscripts after filtering: {len(filtered_df)}")

# Histogram of the retained rows, written to a PNG in the working directory
plot_expression_distribution(filtered_df, output_file="expression_distribution.png")