In [148]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from dataclasses import dataclass
from typing import Optional, Dict, List, Tuple, Union
from pathlib import Path
from scipy import stats
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from palbociclib_signature_analysis import palbo_signatures
from palbo_RNAseq_analysis import palbo_RNAseq
from Integrative_analysis import geneset_heatmap, build_library

In [160]:
@dataclass
class DataPaths:
    """Centralize data paths configuration"""
    BASE_PATH: Path = Path('./input_data')
    CLINICAL_PATH: Path = BASE_PATH / 'clinical_features/brca_tcga_pan_can_atlas_2018_clinical_data.tsv'
    RECEPTOR_PATH: Path = BASE_PATH / 'clinical_features/TCGA_BRCA_clinical_receptors.txt'
    MRNA_BASE: Path = BASE_PATH / 'mRNA_data'

# split the data based molecular and clinical subtype

class TCGA_GeneExpressionData:
    """Handle gene expression data processing and analysis"""
    def __init__(self):
        self.clinical_feature = None
        self.receptor_feature = None
        self.gene_sets = {}
        
    def load_clinical_data(self, paths: DataPaths) -> None:
        """Load and process clinical data"""
        try:
            self.clinical_feature = pd.read_table(paths.CLINICAL_PATH, index_col=0)
            # NaN value as Unidentified 
            self.clinical_feature.fillna("Uni", inplace=True)
            self.receptor_feature = pd.read_table(paths.RECEPTOR_PATH, index_col=0)
            self._process_receptor_data()
        except Exception as e:
            logger.error(f"Error loading clinical data: {e}")
            raise
            
    def _process_receptor_data(self) -> None:
        """Process receptor status into clinical types of TNBC, Hormone Receptor(HR) positive and other"""
        conditions = [
            (self.receptor_feature['er_status_by_ihc'].eq('Negative') & 
             self.receptor_feature['pr_status_by_ihc'].eq('Negative') & 
             self.receptor_feature['her2_status_by_ihc'].eq('Negative')),
            ((self.receptor_feature['er_status_by_ihc'].eq('Positive') | 
              self.receptor_feature['pr_status_by_ihc'].eq('Positive')) & 
             self.receptor_feature['her2_status_by_ihc'].eq('Negative'))
        ]
        choices = ['TNBC', 'HR_positive']
        self.receptor_feature['clinical_types'] = np.select(conditions, choices, default='other')
    
    @staticmethod
    def preprocess_mRNA(fpath: Union[str, Path], remove_na: bool = True) -> pd.DataFrame:
        """Load and process cBioPortal data"""
        try:
            data = pd.read_table(fpath, index_col=1).iloc[:, 1:].T
            data.columns = [col[:12] for col in data.columns]  # sample to patient conversion
            return data.dropna(axis=1, how='all') if remove_na else data
        except Exception as e:
            logger.error(f"Error processing cBioPortal data from {fpath}: {e}")
            raise
            
    def load_gene_sets(self, paths: DataPaths) -> None:
        """Load all gene set expression data"""
        gene_sets = {
            'MYC_V1': 'MYC_V1/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'G2M': 'G2M/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'E2F': 'E2F/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'EMT': 'EMT/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'Glyco': 'Glycosis/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'TNF_alpha': 'TNF_alpha/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt'
        }
        
        for name, file_path in gene_sets.items():
            full_path = paths.MRNA_BASE / file_path
            self.gene_sets[name] = self.preprocess_mRNA(full_path)

    # merge two methods into one 
    def split_by_types(self, data: pd.DataFrame, by_type: Union[str, List[str]] = "clinical") -> Dict[str, pd.DataFrame]:
        """
        Split DataFrame by clinical or molecular subtypes.
        
        Args:
            data: DataFrame to split
            by_type: "clinical" for TNBC/HR classification or "molecular" for BRCA subtypes
            
        Returns:
            Dictionary of split DataFrames by subtype
        """
        TYPE_MAPPINGS = {
            "clinical": {
                "feature": "receptor_feature",
                "column": "clinical_types",
                "categories": ['TNBC', 'HR_positive', 'other']
            },
            "molecular": {
                "feature": "clinical_feature",
                "column": "Subtype",
                "categories": ['BRCA_LumA', 'BRCA_LumB', 'BRCA_Normal', 'BRCA_Basal', 'Uni']
            }
        }
        
        if by_type not in TYPE_MAPPINGS:
            raise ValueError(f"by_type must be one of {list(TYPE_MAPPINGS.keys())}")
        
        type_info = TYPE_MAPPINGS[by_type]
        feature_data = getattr(self, type_info["feature"])

    
        splits = {}
        for category in type_info["categories"]:
            mask = feature_data[type_info["column"]] == category
            patient_ids = feature_data[mask].index
            splits[category] = data.loc[:, data.columns.isin(patient_ids)]
        
        return splits

In [161]:
analysis = TCGA_GeneExpressionData()

In [162]:
paths = DataPaths()
paths.CLINICAL_PATH

WindowsPath('input_data/clinical_features/brca_tcga_pan_can_atlas_2018_clinical_data.tsv')

In [165]:
analysis.load_clinical_data(paths)
analysis.load_gene_sets(paths)


In [168]:
myc_data = analysis.gene_sets["MYC_V1"]

In [171]:
analysis.split_by_types(myc_data, by_type="molecular")

{'BRCA_LumA':          TCGA-3C-AAAU  TCGA-3C-AALK  TCGA-4H-AAAK  TCGA-5L-AAT0  TCGA-A1-A0SD  \
 DUT            0.6023       -0.3937       -0.0797        0.3578       -0.8788   
 STARD7        -1.3445       -1.2665       -0.9335       -0.2179       -0.2309   
 RPL22         -0.2799       -0.2862        0.4544        0.1556       -1.1861   
 CNBP          -1.6371       -0.9784       -0.1956       -0.3261        0.8095   
 SERBP1        -0.5621       -0.8030       -0.4045       -0.7926       -1.0936   
 HNRNPD        -1.1776        0.4492        0.1823       -0.1698       -0.6136   
 TOMM70         0.8178       -1.1175       -0.5851       -1.1624        0.4686   
 SRPK1          0.5468       -0.7420       -0.5574       -1.1367       -0.5069   
 SET           -1.5067       -0.1952        0.5289       -0.2066       -0.0425   
 PCBP1         -2.0056        1.6845       -1.2200        0.7217       -0.2146   
 GSPT1          0.6645       -0.6960       -0.4627       -0.8755        0.1349   
 PT

In [116]:
import math

In [141]:
pd.isna(analysis.clinical_data["Subtype"][16])

True