In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re
import warnings
import sklearn
import json
import scipy
from scipy.stats import shapiro, ttest_ind, mannwhitneyu

warnings.filterwarnings("ignore")

In [None]:
def collinear_features(x, threshold):
    """Find pairs of columns in ``x`` whose absolute correlation reaches ``threshold``.

    The correlation matrix is computed with ``x.corr()``. Every cell above the
    diagonal is visited once, column by column, and each pair whose absolute
    correlation is ``>= threshold`` is printed as
    ``column | row | correlation rounded to 2 decimals`` and collected.

    Parameters
    ----------
    x : pandas.DataFrame
        Data whose columns are compared pairwise.
    threshold : float
        Minimum absolute correlation (inclusive) for a pair to be reported.

    Returns
    -------
    list of tuple
        ``(column_name, row_name)`` pairs, where ``column_name`` is the
        feature that comes later in the correlation matrix.
    """
    corr_matrix = x.corr()
    names = corr_matrix.columns

    # Work on a plain array of absolute values: reading each cell as a scalar
    # avoids building a 1x1 DataFrame slice per comparison.
    abs_corr = corr_matrix.abs().to_numpy()

    correlating_pairs = []

    # Walk the upper triangle: for each column, every row above the diagonal.
    for col_pos in range(1, len(names)):
        for row_pos in range(col_pos):
            val = abs_corr[row_pos, col_pos]

            # A NaN correlation never satisfies ">=", so it is skipped.
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(names[col_pos], "|", names[row_pos], "|", round(val, 2))
                correlating_pairs.append((names[col_pos], names[row_pos]))

    return correlating_pairs
    

In [None]:
my_path = '~/mounts/research/husdatalake/disease/scripts/Preleukemia/oona_new'

In [None]:
data_path = '~/mounts/research/husdatalake/disease/processed_data/Preleukemia/'

In [None]:
disease = 'MDS'

In [None]:
# Derive the label used in the data file name from the disease key;
# keys without a special spelling are used unchanged.
dis = (
    'de novo AML' if disease == 'de_novo_AML'
    else 'MF' if disease == 'primary_MF'
    else disease
)

In [None]:
# Read univariate model results
with open('univariate_models/' + disease + '_univariate_pvalues.json', 'r') as file:
    univariate_results = json.load(file)

In [None]:
univariate_results

In [None]:
len(univariate_results)

## Load data

The full data cannot be loaded with all columns because the kernel dies, so only the first 100,000 rows are read here for feature selection.

In [None]:
data_sub = pd.read_csv(data_path + 'lagged_data_' + dis + '3.csv', engine='c', nrows=100000, low_memory=False)

In [None]:
# Keep only the columns whose name does NOT contain any of the listed
# substrings (regex alternation, matched case-insensitively).
data_sub = data_sub.loc[:, ~data_sub.columns.str.contains('l_metam|l_myelos|l_blast|pt_gf|crea|p_tt|ferrit|e_retic|event_1y', case=False)]

In [None]:
data_sub

## Find collinear features

In [None]:
data_sub = data_sub[data_sub['sukupuoli_selite'].isin(['Nainen', 'Mies'])]

In [None]:
data_sub['sukupuoli_selite'] = data_sub['sukupuoli_selite'].replace({'Nainen' : 0, 'Mies' : 1})

In [None]:
collinearity_threshold = 0.95

In [None]:
correlating_pairs = collinear_features(data_sub.drop(columns=['henkilotunnus', 'time_to_dg','disease']), threshold=collinearity_threshold)

In [None]:
len(correlating_pairs)

## Remove collinear features based on univariate models

For each collinear pair, remove one of the two variables based on the univariate model p-values. The variable with the lower p-value is considered more important and is kept.

In [None]:
correlating_pairs

In [None]:
# Currently unused: gating the removal on p > p_threshold is disabled below.
p_threshold = 0.001

print('\nREMOVING COLLINEAR VARIABLES BASED ON UNIVARIATE MODELS')
to_remove = []

for pair in correlating_pairs:

    print('')
    print(pair)
    var1, var2 = pair

    # Look up each variable's own univariate p-value. The second lookup used to
    # read var1 again, which made p1 == p2 and always dropped var2.
    p1 = univariate_results[var1]
    p2 = univariate_results[var2]

    print(var1, p1)
    print(var2, p2)

    # Drop the variable with the higher p-value; on a tie var2 is dropped.
    weaker = var1 if p1 > p2 else var2
    print('removing', weaker)

    # A variable can appear in several pairs; record it only once.
    if weaker not in to_remove:
        to_remove.append(weaker)
    

In [None]:
to_remove

In [None]:
len(to_remove)

In [None]:
# Force hemoglobin into the models.
# Guarded because list.remove raises ValueError when the value is absent,
# e.g. if hemoglobin was never flagged or this cell is run a second time.
if 'b_hb_g_l_tulos_norm' in to_remove:
    to_remove.remove('b_hb_g_l_tulos_norm')

In [None]:
data_sub = data_sub.drop(columns=to_remove)

In [None]:
data_sub

In [None]:
features = data_sub.drop(columns=['henkilotunnus', 'time_to_dg', 'disease']).columns.to_list()

In [None]:
features

## Read full data for further processing (only selected features)

In [None]:
# NOTE(review): this switches the disease key from the 'MDS' set earlier to 'MF'.
# The univariate p-values file and the 100k-row sample used to pick `features`
# were both read with the earlier value, while the full data read and the saved
# output file name below use this one — confirm the mismatch is intended.
disease='MF'

In [None]:
# Re-derive the file-name label after the disease key changed;
# keys without a special spelling are used unchanged.
dis = (
    'de novo AML' if disease == 'de_novo_AML'
    else 'MF' if disease == 'primary_MF'
    else disease
)

In [None]:
data = pd.read_csv(data_path + 'lagged_data_' + dis + '3.csv', usecols=['henkilotunnus', 'time_to_dg', 'disease']+features, engine='c', low_memory=False)

In [None]:
data = data[data['sukupuoli_selite'].isin(['Nainen', 'Mies'])]

In [None]:
data['sukupuoli_selite'] = data['sukupuoli_selite'].replace({'Nainen' : 0, 'Mies' : 1})

In [None]:
# Drop underaged rows among disease == 0: a row is kept when it is not in
# that group or when its age is not below 18 (missing ages are kept).
data = data[(data['disease'] != 0) | ~(data['age'] < 18)]

In [None]:
# Drop underaged rows among disease == 1: a row is kept when it is not in
# that group or when its age is not below 18 (missing ages are kept).
data = data[(data['disease'] != 1) | ~(data['age'] < 18)]

In [None]:
data

In [None]:
healthy_df = data[data['disease'] == 0]
disease_df = data[data['disease'] == 1]

In [None]:
del data

In [None]:
# Treat every column from position 7 onward as a lab feature.
# NOTE(review): this is a positional slice, so it depends on the column order
# of the loaded CSV; presumably the first 7 columns are identifiers, outcome
# and demographic fields — TODO confirm, or select the columns by name.
lab_features = healthy_df.columns[7:].to_list()

In [None]:
lab_features

## 3. Trimming the top and bottom 0.1% of values per feature (healthy patients only) to remove outliers

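Values outside the 0.1%–99.9% quantile range of a feature are set to 0 rather than dropped, so the number of rows does not change. Missing values fail the range check and are therefore also set to 0.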

In [None]:
def trim_extremes(df, features, lower_q=0.001, upper_q=0.999):
    """Return a copy of ``df`` with extreme values of ``features`` set to 0.

    For each column in ``features`` the ``lower_q`` and ``upper_q`` quantiles
    are computed from ``df``. Values outside the closed interval
    ``[low, high]`` are replaced with 0. Missing values (NaN) fail the range
    check and are therefore replaced with 0 as well. No rows are dropped and
    ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    features : iterable of str
        Names of the columns to trim; each name is printed as it is processed.
    lower_q : float, default 0.001
        Lower quantile; values below it are replaced.
    upper_q : float, default 0.999
        Upper quantile; values above it are replaced.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the trimmed columns.
    """
    clean_df = df.copy()
    for col in features:
        print(col)
        low = df[col].quantile(lower_q)
        high = df[col].quantile(upper_q)

        # Vectorised range check (inclusive on both ends) instead of a Python
        # lambda per element; NaN is never "between", so it also becomes 0.
        in_range = clean_df[col].between(low, high)
        clean_df[col] = clean_df[col].where(in_range, 0)

    return clean_df

In [None]:
len(healthy_df)

In [None]:
healthy_df_clean = trim_extremes(healthy_df, lab_features, lower_q=0.001, upper_q=0.999)

In [None]:
len(healthy_df_clean)

## Combine, save and plot

In [None]:
data = pd.concat([healthy_df_clean, disease_df], axis=0)

In [None]:
del healthy_df_clean
del healthy_df
del disease_df


In [None]:
## Save
data.to_csv(my_path + '/data/modelling/' + disease + '_modelling_data_reduced.csv', index=False)

In [None]:
fig = plt.figure(figsize=(4, 4))
sns.boxplot(data=data, x='age', hue='disease', showfliers=True)
#plt.axvline(x=0, color='r', linestyle='--')
plt.show()
plt.close()

In [None]:
data[data['disease'] == 0]['sukupuoli_selite'].value_counts(normalize=True)

In [None]:
data[data['disease'] == 1]['sukupuoli_selite'].value_counts(normalize=True)

## Visualize latest measurements (norm columns)

In [None]:
norm_cols = [col for col in data.columns if 'norm' in col]

In [None]:
# One 4x4 box plot per 'norm' column, split by disease status, with a dashed
# red reference line at x = 0. Each figure is closed after it is shown.
for lab in norm_cols:
    fig, ax = plt.subplots(figsize=(4, 4))
    sns.boxplot(data=data, x=lab, hue='disease', ax=ax)
    ax.axvline(x=0, color='r', linestyle='--')
    plt.show()
    plt.close(fig)