In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import warnings
import sklearn
import random
from sklearn.neighbors import NearestNeighbors

# NOTE(review): this silences every warning category for the whole session,
# including any pandas warnings raised by the cells below — consider narrowing it.
warnings.filterwarnings("ignore")

### Do derivation / test split

In [None]:
def deriv_test_split(patient_list, shuffle=False, random_state=42, deriv_frac=0.85):
    """Split a sequence of patient IDs into a derivation part and a test part.

    Parameters
    ----------
    patient_list : sequence
        Patient identifiers. The caller's object is never modified.
    shuffle : bool, default False
        When truthy, a shuffled copy of ``patient_list`` is split; otherwise
        the original order is kept.
    random_state : int, default 42
        Seed for the shuffle. The same seed and input always give the same
        split. Only used when ``shuffle`` is truthy.
    deriv_frac : float, default 0.85
        Fraction of the items assigned to the derivation part. The count is
        truncated to an integer; every remaining item goes to the test part.

    Returns
    -------
    tuple
        ``(deriv_list, test_list)``; together they contain every input item
        exactly once.

    Raises
    ------
    ValueError
        If ``deriv_frac`` is outside the interval [0, 1].
    """
    if not 0 <= deriv_frac <= 1:
        raise ValueError("deriv_frac must be between 0 and 1, got %r" % (deriv_frac,))

    items = patient_list
    if shuffle:
        # Shuffle a copy with a private generator: the caller's list keeps its
        # order and the module-level `random` state is left untouched.
        # random.Random(seed) yields the same permutation as
        # random.seed(seed) followed by random.shuffle(...).
        items = list(patient_list)
        random.Random(random_state).shuffle(items)

    # Everything after the derivation slice is the test part, so no item is lost.
    deriv_size = int(len(items) * deriv_frac)
    return items[:deriv_size], items[deriv_size:]

In [None]:
# Project root directory; every CSV path read or written in this notebook is
# built by appending to it.
my_path = '~/mounts/research/husdatalake/disease/scripts/Preleukemia/oona_new'

In [None]:
# Disease label used as the file-name prefix of the input and output CSVs.
# The value 'any_MN' additionally triggers the labelled test-set export and the
# removal of the 'type' column further down.
disease = 'any_MN'

In [None]:
# Load the reduced modelling table for the selected disease.
data_all = pd.read_csv(
    filepath_or_buffer=my_path + '/data/modelling/' + disease + '_modelling_data_reduced.csv',
    low_memory=False,
    engine='c',
)

In [None]:
# Show the modelling table as loaded.
# NOTE(review): the table carries a 'henkilotunnus' identifier column — confirm
# it is pseudonymised before sharing this notebook with outputs.
data_all

## Filter out hard positive rows

In [None]:
# Hard positives: disease rows with time_to_dg below 90. Set them aside and
# keep every other row as the working data set.
hard_positives = data_all.loc[data_all['disease'].eq(1) & data_all['time_to_dg'].lt(90)]

data = data_all.drop(index=hard_positives.index)

In [None]:
# Combined row count of the two parts produced by the hard-positive filter.
data.shape[0] + hard_positives.shape[0]

In [None]:
# Distribution of time_to_dg among the hard positives before any trimming.
hard_positives['time_to_dg'].hist()

In [None]:
## Discard hard positives measured more than 10 days after diagnosis
## (time_to_dg below -10); everything from -10 upwards is kept.
hard_positives = hard_positives.loc[hard_positives['time_to_dg'].ge(-10)]

In [None]:
# Same histogram after dropping the rows with time_to_dg below -10.
hard_positives['time_to_dg'].hist()

In [None]:
# Negative time_to_dg means the sample was measured after diagnosis; floor
# those values at 0 and leave the rest unchanged.
# `clip` is vectorised, and `assign` builds a new frame instead of writing a
# column into the filtered slice, which avoids pandas' chained-assignment
# pitfall (its warning would be hidden by the global warning filter).
hard_positives = hard_positives.assign(
    time_to_dg=hard_positives['time_to_dg'].clip(lower=0)
)

In [None]:
# Same histogram after negative values have been replaced with 0.
hard_positives['time_to_dg'].hist()

In [None]:
# Flag every hard-positive row with hp == 1. `assign` returns a new frame, so
# no column is written into a filtered slice of the loaded table.
hard_positives = hard_positives.assign(hp=1)

## Check data

In [None]:
# Rows left in the working data set once the hard positives are removed.
data.shape[0]

In [None]:
# Smallest time_to_dg among the remaining disease rows.
data.loc[data['disease'] == 1, 'time_to_dg'].min()

In [None]:
# Largest time_to_dg among the remaining disease rows.
data.loc[data['disease'] == 1, 'time_to_dg'].max()

## Check class balance and patient counts

In [None]:
# Temporary subsets: d holds the disease rows, h the rows without disease.
d = data.loc[data['disease'].eq(1)]
h = data.loc[data['disease'].eq(0)]

In [None]:
# Share of disease rows in the working data set, in percent.
100 * d.shape[0] / data.shape[0]

In [None]:
# Number of distinct patient IDs that have disease rows.
# dropna=False keeps the count equal to len(unique()), which counts NaN too.
d_n_pat = d['henkilotunnus'].nunique(dropna=False)
d_n_pat

In [None]:
# Number of distinct patient IDs that have rows without disease.
# dropna=False keeps the count equal to len(unique()), which counts NaN too.
h_n_pat = h['henkilotunnus'].nunique(dropna=False)
h_n_pat

In [None]:
# The temporary subsets are no longer needed.
del d, h

## Create deriv and test sets

In [None]:
# Distinct patient IDs per class, as plain lists so they can be shuffled and sliced.
# NOTE(review): an ID with both disease == 1 and disease == 0 rows would land in
# both lists — confirm that cannot happen in this data.
disease_patients = list(data.loc[data['disease'] == 1, 'henkilotunnus'].unique())
healthy_patients = list(data.loc[data['disease'] == 0, 'henkilotunnus'].unique())

In [None]:
# Seed shared by both patient-level splits below.
rs = 42

In [None]:
# Split the patient IDs of each class separately into derivation and test
# parts, using the same seed for both calls.

deriv_disease, test_disease = deriv_test_split(
    disease_patients, random_state=rs, shuffle=True
)
deriv_healthy, test_healthy = deriv_test_split(
    healthy_patients, random_state=rs, shuffle=True
)

In [None]:
# (derivation, test) patient counts for the disease class.
tuple(map(len, (deriv_disease, test_disease)))

In [None]:
# (derivation, test) patient counts for the healthy class.
tuple(map(len, (deriv_healthy, test_healthy)))

In [None]:
# Pull all rows belonging to the patients of each of the four ID lists.
deriv_disease_data = data.loc[data['henkilotunnus'].isin(deriv_disease)]
test_disease_data = data.loc[data['henkilotunnus'].isin(test_disease)]

deriv_healthy_data = data.loc[data['henkilotunnus'].isin(deriv_healthy)]
test_healthy_data = data.loc[data['henkilotunnus'].isin(test_healthy)]

In [None]:
#del data

In [None]:
# Median age of the disease rows: (derivation, test).
tuple(subset['age'].median() for subset in (deriv_disease_data, test_disease_data))

In [None]:
# Relative frequencies of sukupuoli_selite in the disease rows: (derivation, test).
tuple(
    subset['sukupuoli_selite'].value_counts(normalize=True)
    for subset in (deriv_disease_data, test_disease_data)
)

In [None]:
# Median age of the healthy rows: (derivation, test).
tuple(subset['age'].median() for subset in (deriv_healthy_data, test_healthy_data))

In [None]:
# Relative frequencies of sukupuoli_selite in the healthy rows: (derivation, test).
tuple(
    subset['sukupuoli_selite'].value_counts(normalize=True)
    for subset in (deriv_healthy_data, test_healthy_data)
)

In [None]:
# Median time_to_dg of the disease rows: (derivation, test).
tuple(subset['time_to_dg'].median() for subset in (deriv_disease_data, test_disease_data))

In [None]:
# Drop the reference to the full working frame; only the four per-group
# subsets are used from here on.
del data

In [None]:
# Stack the disease and healthy subsets into one derivation frame and one
# test frame, each with a fresh 0..n-1 index.
deriv_data = pd.concat(objs=[deriv_disease_data, deriv_healthy_data], axis=0, ignore_index=True)
test_data = pd.concat(objs=[test_disease_data, test_healthy_data], axis=0, ignore_index=True)

In [None]:
# Inspect the combined derivation frame.
deriv_data

In [None]:
# The per-group subsets have been merged into deriv_data / test_data.
del deriv_disease_data, test_disease_data, deriv_healthy_data, test_healthy_data

In [None]:
# Mark rows with disease == 0 as censored for XGBoost by flipping the sign of
# their time_to_dg.
# NOTE: the flip is not idempotent — running this cell a second time restores
# the original sign.
deriv_data.loc[deriv_data['disease'].eq(0), 'time_to_dg'] = (
    deriv_data.loc[deriv_data['disease'].eq(0), 'time_to_dg'].mul(-1)
)
test_data.loc[test_data['disease'].eq(0), 'time_to_dg'] = (
    test_data.loc[test_data['disease'].eq(0), 'time_to_dg'].mul(-1)
)

In [None]:
# Median age of the disease rows in the merged frames: (derivation, test).
(
    deriv_data.loc[deriv_data['disease'] == 1, 'age'].median(),
    test_data.loc[test_data['disease'] == 1, 'age'].median(),
)

In [None]:
# Median age of the healthy rows in the merged frames: (derivation, test).
(
    deriv_data.loc[deriv_data['disease'] == 0, 'age'].median(),
    test_data.loc[test_data['disease'] == 0, 'age'].median(),
)

In [None]:
# Mean of sukupuoli_selite over the disease rows: (derivation, test).
(
    deriv_data.loc[deriv_data['disease'] == 1, 'sukupuoli_selite'].mean(),
    test_data.loc[test_data['disease'] == 1, 'sukupuoli_selite'].mean(),
)

In [None]:
# Mean of sukupuoli_selite over the healthy rows: (derivation, test).
(
    deriv_data.loc[deriv_data['disease'] == 0, 'sukupuoli_selite'].mean(),
    test_data.loc[test_data['disease'] == 0, 'sukupuoli_selite'].mean(),
)

In [None]:
# Distinct patient IDs present in the derivation frame.
deriv_ht = list(pd.unique(deriv_data['henkilotunnus']))

In [None]:
# Distinct patient IDs present in the test frame.
test_ht = list(pd.unique(test_data['henkilotunnus']))

In [None]:
def check_common_elements(list1, list2):
    """Return True when the two collections share at least one element, else False."""
    # Set intersection is empty exactly when nothing is shared.
    shared = set(list1) & set(list2)
    return bool(shared)


In [None]:
# True would mean that at least one patient ID occurs in both the derivation
# and the test set.
check_common_elements(deriv_ht, test_ht)

## Append hard positives to deriv data

In [None]:
# Inspect the derivation frame before the hard positives are added.
deriv_data

In [None]:
# Flag column: 0 marks the regular derivation rows; the hard-positive rows
# appended in the next cell carry hp == 1.
deriv_data['hp'] = 0

In [None]:
# Put the hard-positive rows in front of the regular derivation rows.
# The original index labels are kept here; the index is reset before saving.
deriv_data = pd.concat(objs=[hard_positives, deriv_data], axis=0)

In [None]:
# Overlap check right after appending the hard positives, before the rows of
# test patients are removed from the derivation set.
print('\nSanity check: Is there any test data in derivation set')
deriv_ht = list(pd.unique(deriv_data['henkilotunnus']))
test_ht = list(pd.unique(test_data['henkilotunnus']))
test_in_deriv = len(np.intersect1d(test_ht, deriv_ht)) > 0

test_in_deriv

In [None]:
# Number of hard-positive rows currently in the derivation frame.
int(deriv_data['hp'].eq(1).sum())

In [None]:
# Drop every derivation row whose patient ID also appears in the test data.
# This is what removes the hard positives of test patients; the filter itself
# does not look at the hp flag.
deriv_data = deriv_data.loc[~deriv_data['henkilotunnus'].isin(test_data['henkilotunnus'])]

In [None]:
# Number of hard-positive rows left after removing the test patients.
int(deriv_data['hp'].eq(1).sum())

In [None]:
# Final leakage check before the sets are written to disk.
print('\nSanity check: Is there any test data in derivation set')
deriv_ht = list(deriv_data['henkilotunnus'].unique())
test_ht = list(test_data['henkilotunnus'].unique())
test_in_deriv = np.intersect1d(test_ht, deriv_ht).size > 0

# Fail loudly instead of only displaying the flag: a shared patient at this
# point would leak test data into the saved derivation set.
assert not test_in_deriv, 'Patient IDs overlap between the derivation and test sets'

test_in_deriv

In [None]:
# pt counts after hp
# deriv_disease / test_disease are the ID lists from the original split and do
# not reflect the appended hard positives or the removal of test patients, so
# count the distinct disease patients from the current frames instead.
(
    deriv_data.loc[deriv_data['disease'] == 1, 'henkilotunnus'].nunique(),
    test_data.loc[test_data['disease'] == 1, 'henkilotunnus'].nunique(),
)

## Save deriv and test sets

In [None]:
# Replace the index with a fresh 0..n-1 range and discard the old labels left
# over from the concatenation and filtering above.
deriv_data = deriv_data.reset_index(drop=True)

In [None]:
# Replace the index with a fresh 0..n-1 range and discard the old labels.
test_data = test_data.reset_index(drop=True)

In [None]:
# Total number of rows about to be saved across both sets.
deriv_data.shape[0] + test_data.shape[0]

In [None]:
if disease == 'any_MN':
    # Write the test set while it still contains the 'type' column, which is
    # dropped from both frames further down.
    test_data.to_csv(
        path_or_buf=my_path + '/data/modelling/' + disease + '_test_data_disease_labels.csv',
        index=False,
    )

### Drop "type" column before modelling (any_MN)

In [None]:
if disease == 'any_MN':
    # Remove the 'type' column from both sets before they are saved for modelling.
    deriv_data = deriv_data.drop(labels='type', axis='columns')
    test_data = test_data.drop(labels='type', axis='columns')

In [None]:
# Inspect the final test frame.
test_data

In [None]:
# Inspect the final derivation frame.
deriv_data

### Save

In [None]:
# Write the derivation set without the index column.
deriv_data.to_csv(
    path_or_buf=my_path + '/data/modelling/' + disease + '_derivation_data.csv',
    index=False,
)

In [None]:
# Drop the reference to the derivation frame now that it has been written out.
del deriv_data

In [None]:
# Write the test set without the index column.
test_data.to_csv(
    path_or_buf=my_path + '/data/modelling/' + disease + '_test_data.csv',
    index=False,
)

In [None]:
#del test_data