In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import Sampler
import matplotlib.pyplot as plt
import json
from collections import defaultdict
import yaml
import logging
import random
from sklearn.model_selection import train_test_split



In [41]:
# combine the DemographicDataset and GenomicsDataset into one dataset class
class PatientDataset(Dataset):
    def __init__(self, genomics_data, demographic_data,labels_df,
                 small_molecule_embeddings, large_molecule_embeddings,
                  cancer_types, patient_ids, ethnicity):
        self.genomics_data = genomics_data
        self.demographic_data = demographic_data
        self.cancer_types = cancer_types
        self.patient_ids = patient_ids
        self.labels_df = labels_df
        self.small_molecule_embeddings = small_molecule_embeddings
        self.large_molecule_embeddings = large_molecule_embeddings
        self.ethnicity = {key[0]: i  for i, key in enumerate(ethnicity.items())}
    def __len__(self):
        return len(self.patient_ids)
    
    def process_demographics(self):
        # divide age by 50 --> normalize the age by 50
        # divide weight by 100 --> normalize the age by 100
        self.demographic_data['AGE'] = self.demographic_data['AGE'] / 50
        self.demographic_data['WEIGHT_AT_START_OF_REGIMEN'] = self.demographic_data['WEIGHT_AT_START_OF_REGIMEN'] / 100

    def extract_drug_embedding_and_label(self, idx):
        label_patients = self.labels_df[self.labels_df['ENCORE_PATIENT_ID'].isin([idx])]
        label_patients.drop_duplicates(subset=['ENCORE_PATIENT_ID'], inplace=True)
        y_label = label_patients['Label'].values
        # Explicit sizes for small and large molecule embeddings
        SMALL_MOLECULE_SIZE = 768
        LARGE_MOLECULE_SIZE = 1024
        drug_embed_large = np.zeros(LARGE_MOLECULE_SIZE)
        drug_embed_small = np.zeros(SMALL_MOLECULE_SIZE)
        if len(label_patients['BENCHMARK_GROUP'].tolist()) > 0:
            treatment = label_patients['BENCHMARK_GROUP'].tolist()[0]
            # Extract only the string parts from treatment
            drugs = re.findall(r'\b\w+\b', treatment)
            for drug in drugs:
                if drug not in self.small_molecule_embeddings.keys():
                    " extract the drug embedding from the large molecule embeddings"
                    if drug  in self.large_molecule_embeddings.keys():
                        drug_embed_large += self.large_molecule_embeddings[drug]
                else:
                    " extract the drug embedding from the small molecule embeddings"
                    drug_embed_small += self.small_molecule_embeddings[drug]       
        else:
            y_label = np.array([])
        return drug_embed_small,drug_embed_large, y_label

    def __getitem__(self, idx):
        self.process_demographics()
        #print("idx: ", idx)
        patient_genome = self.genomics_data[self.genomics_data['PATIENTID'].isin([idx])]
        patient_genome.drop(columns=['PATIENTID',  'TUMOURID', 'ETHNICITY'], inplace=True)
        patient_genome = patient_genome.values
        
        patient_demographics = self.demographic_data[self.demographic_data['PATIENTID'].isin([idx])]
        patient_demographics.drop(columns=['PATIENTID',  'ENCORE_PATIENT_ID'], inplace=True)
        patient_demographics['Cancer Type'] = patient_demographics['Cancer Type'].apply(lambda x: self.cancer_types[x])
        # convert ethnicity to integer using the ethnicity dictionary
        #patient_demographics['ETHNICITY'] = patient_demographics['ETHNICITY'].map(self.ethnicity)
        patient_demographics.drop(columns=['ETHNICITY'], inplace=True)
        #patient_demographics['Ethnicity'] = patient_demographics['Ethnicity'].apply(lambda x: self.cancer_types[x])
        patient_demographics = patient_demographics.values
        drug_embed_small,drug_embed_large, y_label = self.extract_drug_embedding_and_label(idx)
        # if drug_embedding is 0, then do not include the patient in the dataset
        #if not y_label.any == 0:
        #    y_label = -1
        #    drug_embed_small = -1*np.ones_like(drug_embed_small)
        #    drug_embed_large = -1*np.ones_like(drug_embed_large)
        patient_data = {'patient_id': idx,
                        'genome': patient_genome,
                        'demographics': patient_demographics,
                        'drug_embed_small': drug_embed_small,
                        'drug_embed_large': drug_embed_large,
                        'label': y_label}
        return patient_data



In [42]:


# Define the BalancedBatchSampler class
class EthnicityBalancedSampler(Sampler):
    def __init__(self, genomics_data, batch_size):
        self.genomics_data = genomics_data
        self.batch_size = batch_size

        # Group indices by ehtnicity
        self.groups = self.genomics_data.groupby('ETHNICITY')['PATIENTID'].apply(list).to_dict()
        # ensure batch size can accomodate all groups
        assert self.batch_size >= len(self.groups.keys()), (
            'Batch size must be greater than or equal to the number of groups'
        )
    def __iter__(self):
        indices = []
        # Shuffle each ethinicity group
        group_indices = {eth: random.sample(indices, len(indices)) 
                         for eth, indices in self.groups.items()}
        
        # create batches
        while any(group_indices.values()):
            batch = []
            for eth, idx_list in group_indices.items():
                if idx_list:
                    batch.append(idx_list.pop()) # Take one sample from each group
            # Add additional samples to fill the batch size
            while len(batch) < self.batch_size:
                availabel_ethnicities = [eth for eth, idx_list in group_indices.items() if idx_list]
                if not availabel_ethnicities:
                    break
                eth = random.choice(availabel_ethnicities)
                batch.append(group_indices[eth].pop())
            yield batch
    
    def __len__(self):
        total_samples = sum(len(indices) for indices in self.ethnicity_groups.values())
        return total_samples // self.batch_size


In [43]:
data_dir = '/home/azureuser/cloudfiles/code/Users/Omid.Bazgir/data/'
# read the genomics data as the pickle file
with open(data_dir + 'patient_data.pkl', 'rb') as f:
    patient_data = pickle.load(f)

In [44]:
# read the label data and drug data as pickle files
with open(data_dir + 'label_pickle.pkl', 'rb') as f:
    label_data = pickle.load(f)
with open(data_dir + 'drug_data_embedding.pkl', 'rb') as f:
    drug_data = pickle.load(f)
small_molecule_embeddings = drug_data['small_molecule']
large_molecule_embeddings = drug_data['large_molecule']

In [45]:
labels_df = label_data['weak_label']


In [46]:
genomics_data = patient_data['genomics']
demographic_data = patient_data['demographic']
# fillna in the ETHINICITY with the most frequent value
demographic_data['ETHNICITY'].fillna('White', inplace=True)
# Add ETHNICITY to the genomics data using the patient_id
genomics_data = genomics_data.merge(demographic_data[['PATIENTID', 'ETHNICITY']], on='PATIENTID', how='left')

cancer_types = {}
for i, cancer in enumerate(demographic_data['Cancer Type'].unique().tolist()):
    cancer_types[cancer] = i

# read ethnicity data from the json file
with open(data_dir + 'ethnicity.json', 'r') as json_file:
    Ethnicity = json.load(json_file)

# read the config (yaml) file
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

config['genome_encoder']['input_dim'] = genomics_data.shape[1] - 3

In [47]:
# Set up logging
logging.basicConfig(filename='training.log', level=logging.INFO, format='%(asctime)s %(message)s')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Define batch size
batch_size = config['train_param']['batch_size']

# Create DataLoader

# create a dataloader
patient_ids = genomics_data['PATIENTID'].unique().tolist()
labels_df = labels_df[labels_df['ENCORE_PATIENT_ID'].isin(patient_ids)]
labels_df = labels_df.drop_duplicates(subset=['ENCORE_PATIENT_ID'], keep='first')
patient_ids = labels_df['ENCORE_PATIENT_ID'].unique().tolist()
# split the patient_ids into train and test
patient_train_ids, patient_test_ids = train_test_split(patient_ids, test_size=0.2, random_state=42)
genomics_data_train = genomics_data[genomics_data['PATIENTID'].isin(patient_train_ids)]
demographic_data_train = demographic_data[demographic_data['PATIENTID'].isin(patient_train_ids)]
labels_df_train = labels_df[labels_df['ENCORE_PATIENT_ID'].isin(patient_train_ids)]


genomics_data_test = genomics_data[genomics_data['PATIENTID'].isin(patient_test_ids)]
demographic_data_test = demographic_data[demographic_data['PATIENTID'].isin(patient_test_ids)]
labels_df_test = labels_df[labels_df['ENCORE_PATIENT_ID'].isin(patient_test_ids)]
sampler_train = EthnicityBalancedSampler(genomics_data_train, batch_size)
patient_dataset_train = PatientDataset(genomics_data_train, demographic_data_train, 
                                       labels_df_train,small_molecule_embeddings, large_molecule_embeddings,
                                        cancer_types, patient_train_ids, Ethnicity)
patient_dataloader_train = DataLoader(patient_dataset_train, batch_sampler= sampler_train)

sampler_test = EthnicityBalancedSampler(genomics_data_test, batch_size)
patient_dataset_test = PatientDataset(genomics_data_test, demographic_data_test, 
                                      labels_df_test,small_molecule_embeddings, large_molecule_embeddings,
                                      cancer_types, patient_test_ids, Ethnicity)
patient_dataloader_test = DataLoader(patient_dataset_test, batch_sampler= sampler_test)



In [48]:
for batch in patient_dataloader_train:
    a = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.demographic_data['AGE'] = self.demographic_data['AGE'] / 50
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.demographic_data['WEIGHT_AT_START_OF_REGIMEN'] = self.demographic_data['WEIGHT_AT_START_OF_REGIMEN'] / 100
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFram