In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import warnings
import sklearn
import random
import xgboost as xgb
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler
import ast
import os
from scipy import stats
import json

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/NumPy diagnostics raised by later cells.
warnings.filterwarnings("ignore")

#plt.style.use('ggplot')

In [None]:
def _fit_line(times, scores):
    """Return (slope, intercept) of the least-squares line through the points.

    When all time values are identical the slope is undefined and the fit is
    rank deficient, so a flat line through the mean score is returned instead.
    """
    if np.unique(times).size < 2:
        return 0.0, float(np.mean(scores))
    slope, intercept = np.polyfit(times, scores, 1)
    return slope, intercept


def _plot_linear_fit(times, scores, slope, intercept, title):
    """Scatter the observed scores and overlay the fitted straight line."""
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(times, scores, color='blue', label='Data points')
    ax.plot(times, slope * times + intercept, color='red',
            label=f'Fitted line: y = {slope:.2f}x + {intercept:.2f}')
    ax.set_xlabel('times')
    ax.set_ylabel('scores')
    ax.set_title(title)
    ax.legend()
    ax.grid(True)
    plt.show()
    # Release the figure; this function may be called once per timepoint.
    plt.close(fig)


def extract_features_per_timepoint(df, min_points=5, plot=False, verbose=True):
    """Build one feature row per (patient, timepoint) from the risk-score history.

    For every row of every patient, the history up to and including that row
    (ordered by ascending ``time_to_dg``) is summarised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``patient_id``, ``time_to_dg``,
        ``disease_status`` and ``risk_score``.
    min_points : int, default 5
        Minimum history length (current point included) required before the
        trajectory features are computed for a timepoint.
    plot : bool, default False
        If true, show a scatter plot with the fitted line for every fit.
    verbose : bool, default True
        If true, print a progress line every 1000 patients.

    Returns
    -------
    pandas.DataFrame
        One row per input row with ``patient_id``, ``time_to_dg``,
        ``disease_status``, ``risk_score_now`` and ``n_prev`` (history length,
        current point included), followed by the trajectory features
        ``mean_score``, ``max_score``, ``slope``, ``delta_score``,
        ``volatility``, ``slope_1y``, ``delta_score_1y`` and ``volatility_1y``.
        Trajectory features are NaN for timepoints with fewer than
        ``min_points`` history points; the columns are always present.
    """
    trajectory_cols = (
        'mean_score', 'max_score', 'slope', 'delta_score', 'volatility',
        'slope_1y', 'delta_score_1y', 'volatility_1y',
    )

    rows = []
    total_patients = df['patient_id'].nunique()

    for pt, (pid, group) in enumerate(df.groupby('patient_id')):

        if verbose and pt % 1000 == 0:
            print(f"Processing patient {pt + 1} / {total_patients}")

        group = group.sort_values('time_to_dg', ascending=True).reset_index(drop=True)

        for i in range(len(group)):

            current = group.iloc[i]
            past = group.iloc[:i + 1]  # history including the current datapoint
            current_time = current['time_to_dg']

            row = {
                'patient_id': pid,
                'time_to_dg': current_time,
                'disease_status': current['disease_status'],
                'risk_score_now': current['risk_score'],
                'n_prev': len(past),
            }
            # Pre-fill so the output schema does not depend on whether any
            # timepoint reaches min_points.
            row.update(dict.fromkeys(trajectory_cols, np.nan))

            # Full-history metrics need at least min_points scores
            if len(past) >= min_points:

                times = past['time_to_dg'].values
                scores = past['risk_score'].values

                slope, intercept = _fit_line(times, scores)
                if plot:
                    _plot_linear_fit(times, scores, slope, intercept, 'Full history linear Fit')

                row['mean_score'] = np.mean(scores)
                row['max_score'] = np.max(scores)
                row['slope'] = slope
                row['delta_score'] = scores[-1] - scores[0]
                row['volatility'] = np.std(scores)

                # History restricted to the 365 time units before the current
                # point (current point included); presumably days - TODO confirm.
                in_window = (past['time_to_dg'] > current_time - 365) & (past['time_to_dg'] <= current_time)
                past_1y = past[in_window]

                # At least two points (the current one plus one earlier) are
                # needed for the windowed slope metrics
                if len(past_1y) > 1:
                    times = past_1y['time_to_dg'].values
                    scores = past_1y['risk_score'].values

                    slope, intercept = _fit_line(times, scores)
                    if plot:
                        _plot_linear_fit(times, scores, slope, intercept, 'Past 1 year linear Fit')

                    row['slope_1y'] = slope
                    row['delta_score_1y'] = scores[-1] - scores[0]
                    row['volatility_1y'] = np.std(scores)
                else:
                    # No earlier score inside the window: assume no change
                    row['slope_1y'] = 0
                    row['delta_score_1y'] = 0
                    row['volatility_1y'] = 0

            rows.append(row)

    return pd.DataFrame(rows)

In [None]:
# Root directory from which the input data and model paths are built.
# "~" is expanded here so the paths do not depend on each consumer
# performing tilde expansion itself.
my_path = os.path.expanduser('~/mounts/research/husdatalake/disease/scripts/Preleukemia/oona_new')

In [None]:
# Disease code; it is interpolated into the input, model and output file names.
disease = 'MF'

In [None]:
# Whether to include hard positives (rows with hp == 1) in the derivation
# features; when False those rows are filtered out before feature extraction.
include_hp = True

In [None]:
# Minimum number of risk scores (the current datapoint included) required at a
# timepoint before trajectory features are computed for it
min_points=3

# 1. Read deriv/test data

In [None]:
# Derivation data for the selected disease
deriv_data = pd.read_csv(
    f"{my_path}/data/modelling/{disease}_derivation_data.csv",
    engine='c',
    low_memory=False,
)

In [None]:
# Held-out test data for the selected disease
test_data = pd.read_csv(
    f"{my_path}/data/modelling/{disease}_test_data.csv",
    engine='c',
    low_memory=False,
)

In [None]:
# Drop every derivation row whose patient also appears in the test set
in_test_set = deriv_data['henkilotunnus'].isin(test_data['henkilotunnus'])
deriv_data = deriv_data[~in_test_set]

In [None]:
print('\nSanity check: Is there any test data in derivation set')

# Unique patient ids on each side
deriv_ht = list(deriv_data['henkilotunnus'].unique())
test_ht = list(test_data['henkilotunnus'].unique())

# True if at least one patient id is present in both sets
shared_ids = np.intersect1d(test_ht, deriv_ht)
test_in_deriv = shared_ids.size > 0

test_in_deriv

## Read model

In [None]:
# Create an empty Booster and load the stored basic model for this disease into it
model = xgb.Booster()
model.load_model(f"{my_path}/results/basic_model/{disease}_basic_model.json")

# 2. Extract features per timepoint using prior risk history for deriv data

### Predict risk scores

In [None]:
# Model inputs: every column except the identifier, label, time and hp columns
x_deriv = deriv_data.drop(columns=['henkilotunnus', 'disease', 'time_to_dg', 'hp'])
y_deriv = deriv_data['time_to_dg']

dderiv = xgb.DMatrix(x_deriv, label=y_deriv)

deriv_risk_scores = model.predict(dderiv)

# Take an explicit copy: the assignments below must modify an independent frame,
# not a slice of deriv_data (chained assignment on a slice is unreliable).
deriv_info = deriv_data[['henkilotunnus', 'time_to_dg', 'disease', 'hp']].copy()

deriv_info['risk_score'] = deriv_risk_scores

# Negate 'time_to_dg' for rows with disease == 1 and hp == 0; other rows keep their value
deriv_info['time_to_dg'] = np.where((deriv_info['disease'] == 1) & (deriv_info['hp'] == 0), -deriv_info['time_to_dg'], deriv_info['time_to_dg'])

In [None]:
# Unique patient ids split by disease label, then concatenated (diseased first)
is_diseased = deriv_data['disease'] == 1
is_healthy = deriv_data['disease'] == 0

disease_patients = list(deriv_data.loc[is_diseased, 'henkilotunnus'].unique())
healthy_patients = list(deriv_data.loc[is_healthy, 'henkilotunnus'].unique())
patients = disease_patients + healthy_patients

In [None]:
# Number of entries in the combined (diseased + healthy) patient list
len(patients)

In [None]:
# Number of unique patients with disease == 1
len(disease_patients)

In [None]:
# Keep only rows whose patient id is in the `patients` list
row_is_selected = deriv_info['henkilotunnus'].isin(patients)
deriv_info = deriv_info[row_is_selected]

In [None]:
df = deriv_info.copy()

if include_hp:
    # Multiple hp datapoints can be measured on the same day, which breaks the
    # linear fit, so collapse each (patient, day) into a single row holding the
    # median of its numeric columns.
    # Both grouping keys are excluded from the aggregated columns; the patient
    # id would otherwise be aggregated too whenever it has a numeric dtype.
    numeric_cols = df.select_dtypes(include='number').columns.difference(['henkilotunnus', 'time_to_dg'])
    df = df.groupby(['henkilotunnus', 'time_to_dg'], as_index=False)[numeric_cols].median()
else:
    # Keep only the rows that are not hard positives
    df = df[df['hp'] == 0]

df = df.drop(columns=['hp'])

# Column names expected by extract_features_per_timepoint
df = df.rename(columns={'henkilotunnus': 'patient_id', 'disease': 'disease_status'})

In [None]:
# Per-timepoint trajectory features for the derivation data (plots disabled)
features_df = extract_features_per_timepoint(df, min_points=min_points, plot=False)

In [None]:
# Preview the first 45 feature rows
features_df.head(45)

In [None]:
# to_csv does not create missing directories, so make sure the target exists
os.makedirs('trajectory_model', exist_ok=True)

# The file name records whether hard positives were included
hp_suffix = '_with_hp' if include_hp else ''
features_df.to_csv(f'trajectory_model/{disease}_full_risk_score_deriv_data{hp_suffix}.csv', index=False)

# 3. Extract features per timepoint using prior risk history for test data

In [None]:
# Model inputs: every column except the identifier, label and time columns.
# NOTE(review): unlike the derivation cell, 'hp' is not dropped here - confirm
# the test file has no 'hp' column.
x_test = test_data.drop(columns=['henkilotunnus', 'disease', 'time_to_dg'])
y_test = test_data['time_to_dg']

dtest = xgb.DMatrix(x_test, label=y_test)

test_risk_scores = model.predict(dtest)

# Take an explicit copy: the assignments below must modify an independent frame,
# not a slice of test_data (chained assignment on a slice is unreliable).
test_info = test_data[['henkilotunnus', 'time_to_dg', 'disease']].copy()

test_info['risk_score'] = test_risk_scores

# Negate 'time_to_dg' for rows with disease == 1; other rows keep their value
test_info['time_to_dg'] = np.where((test_info['disease'] == 1), -test_info['time_to_dg'], test_info['time_to_dg'])

In [None]:
# Unique test-set patient ids split by disease label, then concatenated (diseased first)
test_is_diseased = test_data['disease'] == 1
test_is_healthy = test_data['disease'] == 0

test_disease_patients = list(test_data.loc[test_is_diseased, 'henkilotunnus'].unique())
test_healthy_patients = list(test_data.loc[test_is_healthy, 'henkilotunnus'].unique())
test_patients = test_disease_patients + test_healthy_patients

In [None]:
# Number of unique test-set patients with disease == 1
len(test_disease_patients)

In [None]:
# Number of entries in the combined (diseased + healthy) test patient list
len(test_patients)

In [None]:
# Keep only rows whose patient id is in the `test_patients` list
test_row_is_selected = test_info['henkilotunnus'].isin(test_patients)
test_info = test_info[test_row_is_selected]

In [None]:
# Independent copy of the test info, with the column names that
# extract_features_per_timepoint reads
column_names = {'henkilotunnus': 'patient_id', 'disease': 'disease_status'}
df = test_info.copy().rename(columns=column_names)

In [None]:
# Per-timepoint trajectory features for the test data (default plot/verbose settings)
test_features_df = extract_features_per_timepoint(df, min_points=min_points)

In [None]:
# Display the test feature table
test_features_df

In [None]:
# to_csv does not create missing directories, so make sure the target exists
os.makedirs('trajectory_model', exist_ok=True)

test_features_df.to_csv(f'trajectory_model/{disease}_risk_score_test_data.csv', index=False)