# OMGeEP

In [132]:
%git config --global user.email "oramar1256@gmail.com"
%git config --global user.name "oramar1256"

UsageError: Line magic function `%git` not found.


# Download dependencies

In [130]:
# Install the packages imported in the imports cell (plus joblib) into the kernel's environment.
%pip install lightgbm scikit-learn pandas numpy joblib
# Provides the `iterstrat` package that the imports cell tries to import (optional).
%pip install iterative-stratification
# SHAP is not imported in the visible part of the notebook — presumably used in later cells; confirm.
%pip install shap

Note: you may need to restart the kernel to use updated packages.
Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9
Note: you may need to restart the kernel to use updated packages.
Collecting shap
  Downloading shap-0.48.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.48.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.9 MB/s[0m  [33m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Import libraries

In [129]:
import ast
import re
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, hamming_loss, roc_auc_score
from lightgbm import LGBMClassifier

# Optional: iterative stratification
try:
    from iterstrat.ml_stratifiers import iterative_train_test_split
    ITERATIVE_AVAILABLE = True
except ImportError:
    ITERATIVE_AVAILABLE = False

import warnings
warnings.filterwarnings('ignore')

# Global variables

In [123]:
# Relative path of the output directory (presumably where results are written — confirm against later cells).
OUTPUT_DIR = 'ifbdata/atlanteco_hack/OMGeEP/output_files'

# Read genomic data

## Helper functions

In [119]:
def _divide_or_zero(numerator: np.ndarray, denominator: np.ndarray) -> np.ndarray:
    """Divide by a per-column denominator, returning 0 for columns whose denominator is 0."""
    denominator = np.asarray(denominator, dtype=float)
    is_zero = denominator == 0
    # Substitute 1 for zero denominators so the division itself never produces inf/NaN.
    quotient = numerator / np.where(is_zero, 1.0, denominator)
    return np.where(is_zero, 0.0, quotient)


def normalize_gene_abundances(df:pd.DataFrame, method:str='tss', id_col:str=None) -> pd.DataFrame:
    """
    Normalize gene abundance data for comparison between samples.

    Every statistic is computed per sample, i.e. per column (axis=0 of the
    abundance matrix). Degenerate columns (zero total, zero standard
    deviation, or zero range) are mapped to 0 instead of NaN/inf.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame with one ID column and samples as the remaining columns
    method : str, default 'tss'
        Normalization method:
        - 'tss': Total Sum Scaling (relative abundance, sums to 1)
        - 'tss_percent': Total Sum Scaling as percentages (sums to 100)
        - 'z_score': Z-score normalization (mean=0, population std=1)
        - 'min_max': Min-max scaling (0 to 1 range)
        - 'log_tss': Log-transformed TSS (log10(TSS + 1e-10))
        - 'clr': Centered Log Ratio transformation (natural log, pseudocount 1e-10)
    id_col : str, optional
        Name of ID column. If None, assumes first column is ID

    Returns:
    --------
    pandas.DataFrame
        Normalized copy of the input with the same structure; the input is
        not modified.

    Raises:
    -------
    ValueError
        If `method` is not one of the supported names.
    """

    # Work on a copy to avoid modifying the caller's frame
    df_norm = df.copy()

    # Identify ID column
    if id_col is None:
        id_col = df.columns[0]

    # Sample columns are all columns except the ID column
    sample_cols = [col for col in df.columns if col != id_col]

    # Float matrix of shape (n_genes, n_samples)
    abundance_matrix = df_norm[sample_cols].to_numpy(dtype=float)

    if method in ('tss', 'tss_percent', 'log_tss'):
        # Total Sum Scaling: each sample column divided by its own total
        tss_matrix = _divide_or_zero(abundance_matrix, abundance_matrix.sum(axis=0))
        if method == 'tss':
            normalized_matrix = tss_matrix
        elif method == 'tss_percent':
            normalized_matrix = tss_matrix * 100
        else:
            # Small pseudocount avoids log(0)
            pseudocount = 1e-10
            normalized_matrix = np.log10(tss_matrix + pseudocount)

    elif method == 'z_score':
        # Standardization; constant columns (std == 0) become all zeros
        centered = abundance_matrix - abundance_matrix.mean(axis=0)
        normalized_matrix = _divide_or_zero(centered, abundance_matrix.std(axis=0))

    elif method == 'min_max':
        # Scale to [0, 1]; constant columns (max == min) become all zeros
        min_vals = abundance_matrix.min(axis=0)
        max_vals = abundance_matrix.max(axis=0)
        normalized_matrix = _divide_or_zero(abundance_matrix - min_vals, max_vals - min_vals)

    elif method == 'clr':
        # Centered Log Ratio: log values minus the per-sample mean of the logs
        # (the mean of logs is the log of the geometric mean)
        pseudocount = 1e-10
        log_matrix = np.log(abundance_matrix + pseudocount)
        normalized_matrix = log_matrix - log_matrix.mean(axis=0)

    else:
        raise ValueError(f"Unknown normalization method: {method}")

    # Replace the sample columns with normalized values
    df_norm[sample_cols] = normalized_matrix

    return df_norm
    

In [120]:
def read_genomic_data(genomic_data_path:str, rows_to_skip:int=7) -> pd.DataFrame:
    """
    Load a tab-separated gene-abundance table and return it TSS-normalized per sample.

    Parameters:
    -----------
    genomic_data_path : str
        Path to the tab-separated file. The gene-ID column is expected to be
        read by pandas as 'Unnamed: 0' (it is renamed to 'ID').
    rows_to_skip : int, default 7
        Number of leading rows (after the header) to discard before the
        abundance values start.

    Returns:
    --------
    pandas.DataFrame
        'ID' column plus one column per sample with a non-zero total, each
        sample column scaled to relative abundances.
    """
    # Read the raw table
    gen_df = pd.read_csv(genomic_data_path, sep='\t')
    # Rename the gene-ID column to ID
    gen_df = gen_df.rename(columns={'Unnamed: 0': 'ID'})
    # Drop the leading metadata rows; copy so later assignments do not target a slice
    gen_df = gen_df.iloc[rows_to_skip:].copy()

    # Convert sample values to numeric (everything except the gene ID)
    sample_cols = gen_df.columns.drop('ID')
    gen_df[sample_cols] = gen_df[sample_cols].apply(pd.to_numeric, errors='coerce')
    # Fill AFTER coercion: values that could not be parsed become NaN above, and a
    # single NaN would turn the whole sample's numpy column sum into NaN downstream.
    gen_df = gen_df.fillna(0)

    # Remove samples with no genes (column total of zero)
    col_sums = gen_df[sample_cols].sum()
    zero_sum_cols = col_sums[col_sums == 0].index.tolist()
    if zero_sum_cols:
        gen_df = gen_df.drop(columns=zero_sum_cols)

    # Restart the row index at 0 after skipping the metadata rows
    gen_df = gen_df.reset_index(drop=True)

    # Normalize the results per sample
    return normalize_gene_abundances(gen_df, method='tss', id_col='ID')

## Running genomic code

In [121]:
# Load the Benguela Current gene-abundance table and TSS-normalize it per sample.
ben_gen_df = read_genomic_data(genomic_data_path='ifbdata/atlanteco_hack/MetaGenomics/BenguelaCurrent_GeneAb/BenguelaCurrent_ffn_GeneAb_T.tsv')

In [122]:
# Load the Weddell Sea gene-abundance table and TSS-normalize it per sample.
wedd_gen_df = read_genomic_data(genomic_data_path='ifbdata/atlanteco_hack/MetaGenomics/WeddellSea_GeneAb/WeddellSea_ffn_GeneAb_T.tsv')

# Read environmental data

# Read Proteomics data

# Read Metabolomic data (labels)

## Helper functions

In [88]:
def filter_relevant_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Select the columns whose name begins with 'featureId' or 'SAMEA';
    every other column is dropped. Column order is preserved.
    """
    wanted_prefixes = ('featureId', 'SAMEA')
    keep_mask = df.columns.str.startswith(wanted_prefixes)
    return df.loc[:, keep_mask]

In [89]:
def clean_samea_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Strip the final '_R<digits>' suffix from SAMEA column names that carry two of them.
    E.g., SAMEA123456_R01_R2 -> SAMEA123456_R01
    Columns that do not start with 'SAMEA' keep their name.
    """
    def _strip_suffix(name):
        # Only SAMEA columns are candidates for renaming
        if not name.startswith('SAMEA'):
            return name
        return re.sub(r'(SAMEA\d+_R\d+)_R\d+$', r'\1', name)

    new_names = {name: _strip_suffix(name) for name in df.columns}
    return df.rename(columns=new_names)

In [102]:
def group_samea_columns(df: pd.DataFrame, threshold: float = 2e4) -> pd.DataFrame:
    """
    Collapse all columns sharing the same SAMEA<digits> prefix into one 0/1 column:
      - If any column in the group > threshold -> grouped value = 1
      - If all columns in the group <= threshold (or non-numeric) -> grouped value = 0

    The grouped columns replace the original SAMEA columns. They are appended
    after the untouched columns, in order of first appearance of each prefix,
    so the result is identical from run to run. Columns that do not match
    'SAMEA<digits>' are left as they are.

    Parameters:
    -----------
    df : pandas.DataFrame
        Table whose SAMEA columns hold intensities (numbers or numeric strings).
    threshold : float, default 2e4
        Value a cell must exceed for the group to be flagged with 1.

    Returns:
    --------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    prefix_pattern = re.compile(r'(SAMEA\d+)')

    # prefix -> member column names, in first-appearance order.
    # dict.fromkeys de-duplicates repeated column names so that selecting
    # df[cols] does not return the same duplicated columns twice.
    groups = {}
    for col in dict.fromkeys(df.columns):
        match = prefix_pattern.match(col) if isinstance(col, str) else None
        if match:
            groups.setdefault(match.group(1), []).append(col)

    flag_columns = []
    for prefix, member_cols in groups.items():
        # Ensure these columns are numeric (strings -> numbers, non-numeric -> NaN)
        numeric_block = df[member_cols].apply(pd.to_numeric, errors='coerce')
        flag = (numeric_block > threshold).any(axis=1).astype(int)
        flag_columns.append(flag.rename(prefix))

    # Drop the originals BEFORE adding the grouped columns, so a column that is
    # named exactly like its prefix is replaced rather than deleted.
    grouped_originals = [col for member_cols in groups.values() for col in member_cols]
    result = df.drop(columns=grouped_originals)
    if flag_columns:
        result = pd.concat([result] + flag_columns, axis=1)

    return result


In [96]:
def process_metabolomic_data(metabolome_data_path:str, threshold: float = 2e4) -> pd.DataFrame:
    """
    Read a tab-separated metabolomic table and run the full pipeline:
      0. Discard the last row of the file
      1. Filter columns (featureId / SAMEA only)
      2. Clean SAMEA column names
      3. Group SAMEA columns using threshold logic
    """
    raw_table = pd.read_csv(metabolome_data_path, sep='\t')
    # Remove the final row by its index label
    trimmed = raw_table.drop(index=raw_table.index[-1:])
    return (
        trimmed
        .pipe(filter_relevant_columns)
        .pipe(clean_samea_column_names)
        .pipe(group_samea_columns, threshold=threshold)
    )

## Running metabolomic code

In [124]:
# Path to the metabolomic feature table; process_metabolomic_data reads it as tab-separated text.
metabolome_path = 'ifbdata/atlanteco_hack/MetaMetabolomics/1_Feature_table_univariate_analysis_hackathon'

# Reduce the table to featureId plus one 0/1 column per SAMEA sample (default threshold 2e4).
processed_metabolomic_data = process_metabolomic_data(metabolome_path)

In [125]:
# Count how many 0 and 1 flags the processed table holds across all sample columns.
# (A bare `.head()` in the middle of a cell displays nothing, so it was removed.)
sample_cols = [col for col in processed_metabolomic_data.columns if col != 'featureId']
counts = processed_metabolomic_data[sample_cols].stack().value_counts()
print(counts)

1    651561
0     44715
Name: count, dtype: int64


In [126]:
# Number of distinct featureId values (a missing value, if any, counts as one value).
processed_metabolomic_data['featureId'].nunique(dropna=False)

1842

# Next section (template header)