In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from dataclasses import dataclass
from typing import Optional, Dict, List, Tuple, Union
from pathlib import Path
from scipy import stats
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from palbociclib_signature_analysis import palbo_signatures
from palbo_RNAseq_analysis import palbo_RNAseq
from Integrative_analysis import geneset_heatmap, build_library

In [58]:
@dataclass
class DataPaths:
    """Centralize data paths configuration"""
    BASE_PATH: Path = Path('./input_data')
    CLINICAL_PATH: Path = BASE_PATH / 'clinical_features/brca_tcga_pan_can_atlas_2018_clinical_data.tsv'
    RECEPTOR_PATH: Path = BASE_PATH / 'clinical_features/TCGA_BRCA_clinical_receptors.txt'
    MRNA_BASE: Path = BASE_PATH / 'mRNA_data'

# split the data based molecular and clinical subtype

class TCGA_GeneExpressionData:
    """Handle gene expression data processing and analysis"""
    def __init__(self):
        self.clinical_data = None
        self.receptor_data = None
        self.gene_sets = {}
        
    def load_clinical_data(self, paths: DataPaths) -> None:
        """Load and process clinical data"""
        try:
            self.clinical_data = pd.read_table(paths.CLINICAL_PATH, index_col=0)
            self.receptor_data = pd.read_table(paths.RECEPTOR_PATH, index_col=0)
            self._process_receptor_data()
        except Exception as e:
            logger.error(f"Error loading clinical data: {e}")
            raise
            
    def _process_receptor_data(self) -> None:
        """Process receptor data into clinical types"""
        conditions = [
            (self.receptor_data['er_status_by_ihc'].eq('Negative') & 
             self.receptor_data['pr_status_by_ihc'].eq('Negative') & 
             self.receptor_data['her2_status_by_ihc'].eq('Negative')),
            ((self.receptor_data['er_status_by_ihc'].eq('Positive') | 
              self.receptor_data['pr_status_by_ihc'].eq('Positive')) & 
             self.receptor_data['her2_status_by_ihc'].eq('Negative'))
        ]
        choices = ['TNBC', 'HR_positive']
        self.receptor_data['clinical_types'] = np.select(conditions, choices, default='other')
    
    @staticmethod
    def preprocess_mRNA(fpath: Union[str, Path], remove_na: bool = True) -> pd.DataFrame:
        """Load and process cBioPortal data"""
        try:
            data = pd.read_table(fpath, index_col=1).iloc[:, 1:].T
            data.columns = [col[:12] for col in data.columns]  # sample to patient conversion
            return data.dropna(axis=1, how='all') if remove_na else data
        except Exception as e:
            logger.error(f"Error processing cBioPortal data from {fpath}: {e}")
            raise
            
    def load_gene_sets(self, paths: DataPaths) -> None:
        """Load all gene set expression data"""
        gene_sets = {
            'MYC_V1': 'MYC_V1/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'G2M': 'G2M/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'E2F': 'E2F/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'EMT': 'EMT/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'Glyco': 'Glycosis/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt',
            'TNF_alpha': 'TNF_alpha/mRNA expression z-scores relative to diploid samples (RNA Seq V2 RSEM).txt'
        }
        
        for name, file_path in gene_sets.items():
            full_path = paths.MRNA_BASE / file_path
            self.gene_sets[name] = self.preprocess_mRNA(full_path)

In [59]:
analysis = TCGA_GeneExpressionData()

In [60]:
paths = DataPaths()

In [68]:
analysis.load_clinical_data(paths)
print(analysis.clinical_data)
analysis.receptor_data
analysis.load_gene_sets(paths)


             Neoplasm_Disease_Stage  Disease_Free_Month  \
Patient_ID                                                
TCGA-3C-AAAU                STAGE X           59.440444   
TCGA-3C-AALI              STAGE IIB          131.669790   
TCGA-3C-AALJ              STAGE IIB           48.459743   
TCGA-3C-AALK               STAGE IA                 NaN   
TCGA-4H-AAAK             STAGE IIIA           11.440971   
...                             ...                 ...   
TCGA-WT-AB44               STAGE IA           29.029819   
TCGA-XX-A899             STAGE IIIA           15.353256   
TCGA-XX-A89A              STAGE IIB           16.043660   
TCGA-Z7-A8R5             STAGE IIIA                 NaN   
TCGA-Z7-A8R6                STAGE I          107.045402   

              Fraction_Genome_Altered  Mutation_Count  Overall_Survival_Month  \
Patient_ID                                                                      
TCGA-3C-AAAU                   0.7787            24.0              133

In [147]:
analysis.clinical_data["Subtype"].fillna("Unidentified").unique()

array(['BRCA_LumA', 'BRCA_Her2', 'BRCA_LumB', 'BRCA_Normal', 'BRCA_Basal',
       'Unidentified'], dtype=object)

In [116]:
import math

In [141]:
pd.isna(analysis.clinical_data["Subtype"][16])

True