# Run scorer on old data

Please use the `old_data_investigation` notebook to export all the required data first.

## Imports

In [None]:
import pandas as pd
import os
import re
import numpy as np
import sys
import math
import logging

In [None]:
sys.path.insert(0, "../..")

from local_testing_utilities.notebook_utils.survival_data import parse_survival_data

from txmatching.scorers.split_hla_additive_scorer import SplitScorer
from txmatching.scorers.high_res_hla_additive_scorer import HighResScorer
from txmatching.scorers.high_res_other_hla_types_additive_scorer import \
    HighResWithDQDPScorer

from txmatching.utils.blood_groups import BloodGroup
from txmatching.utils.country_enum import Country

from txmatching.patients.patient import Donor, Recipient
from txmatching.patients.patient_parameters import PatientParameters

from tests.test_utilities.hla_preparation_utils import (create_antibodies,
                                                        create_hla_typing)

from tests.test_utilities.prepare_app_for_tests import DbTests

## Load data

In [None]:
# Parse the exported survival CSV with the project helper and preview the last rows.
df_survival = parse_survival_data('data/LD_kidney_survival_with_donor_params.csv')
df_survival.tail()

In [None]:
# Number of non-null values in every column (shows how complete each column is).
df_survival.count()

## Compute KDRI

### Convert EGFR to proper units ml/min/1.73m2

In [None]:
def estimate_bsa(m, h):
    """Estimate the body surface area (Du Bois formula).

    Source (in Czech):
    https://cs.wikiversity.org/wiki/Odhad_t%C4%9Blesn%C3%A9ho_povrchu_a_v%C3%BDpo%C4%8Det_BMI

    :param m: body mass in kg
    :param h: body height in cm
    :return: estimated body surface area in m2
    """
    mass_term = m**0.425
    height_term = h**0.725
    return 71.84 * mass_term * height_term / 10**4


# Sanity check on a 75 kg, 180 cm person.
estimate_bsa(75, 180)

In [None]:
def convert_egfr(egfr, m, h):
    """Convert an eGFR value from ml/s to ml/min/1.73m2.

    :param egfr: eGFR in ml/s
    :param m: body mass in kg (forwarded to ``estimate_bsa``)
    :param h: body height in cm (forwarded to ``estimate_bsa``)
    :return: eGFR in ml/min, normalised to a body surface area of 1.73 m2
    """
    per_minute = egfr * 60
    # Body surface area expressed as a multiple of the 1.73 m2 reference.
    relative_bsa = estimate_bsa(m, h) / 1.73
    return per_minute / relative_bsa

# Store the donor eGFR converted to ml/min/1.73m2 in a new column.
df_survival['Donor_EGFR2'] = convert_egfr(
    egfr=df_survival['Donor_EGFR'],
    m=df_survival['Donor_Weight'],
    h=df_survival['Donor_Height'],
)

### Compute creatinine from eGFR

In [None]:
def compute_egfr(creatine, age, female, black):
    """Estimate eGFR from serum creatinine (4-variable MDRD equation).

    :param creatine: serum creatinine in micromol/L
    :param age: age used in the ``age**-0.203`` term
    :param female: truthy to apply the 0.742 coefficient
    :param black: truthy to apply the 1.210 coefficient
    :return: eGFR in ml/min/1.73m2
    """
    sex_factor = 0.742 if female else 1
    race_factor = 1.210 if black else 1
    return 186 * (creatine / 88.4)**-1.154 * age**-0.203 * sex_factor * race_factor


# Test according to https://ukidney.com/nephrology-resources/egfr-calculator
egfr = compute_egfr(80, 40, True, False)
egfr

In [None]:
def compute_crea(egfr, age, female, black):
    """Compute serum creatinine from eGFR (inverse of ``compute_egfr``).

    :param egfr: eGFR in ml/min/1.73m2
    :param age: age used in the ``age**-0.203`` term
    :param female: truthy to apply the 0.742 coefficient
    :param black: truthy to apply the 1.210 coefficient
    :return: serum creatinine in micromol/L
    """
    sex_factor = 0.742 if female else 1
    race_factor = 1.210 if black else 1
    # Everything in the eGFR equation except the creatinine term.
    creatinine_free_part = 186 * age**-0.203 * sex_factor * race_factor
    ratio = creatinine_free_part / egfr
    return ratio ** (1/1.154) * 88.4

# Round-trip check: `egfr` was computed from a creatinine of 80 with the same
# age/sex/race arguments, so this should give 80 again.
compute_crea(egfr, 40, True, False)

In [None]:
def convert_crea(crea):
    """Convert a creatinine concentration from micromol/L to mg/dL.

    :param crea: creatinine in micromol/L
    :return: creatinine in mg/dL
    """
    creatinine_molar_mass = 113.12  # g/mol
    return crea * creatinine_molar_mass / 10000


# Example: 80 micromol/L expressed in mg/dL.
convert_crea(80)

In [None]:
# Estimate the donor serum creatinine (mg/dL) from the normalised eGFR.
# The last argument is compute_crea's `black` flag. It must be falsy here:
# compute_crea applies the 1.210 coefficient for any truthy value, so the
# previously passed `1` scaled the creatinine of every donor. This cell has no
# race information to pass, so the coefficient is left out.
df_survival['Donor_Crea'] = df_survival.apply(
    lambda row: convert_crea(
        compute_crea(
            row['Donor_EGFR2'],
            row['Donor_AgeAtTx'],
            row['Donor_Sex'] == 'F',
            False,
        )
    ),
    axis=1,
)

### Compute KDRI

In [None]:
def compute_kdri(age, height, weight, hypertension, diabetes, stroke, crea, HCV, DCD):
    """Compute the donor risk index (KDRI) as ``exp`` of a linear predictor.

    Works on scalars as well as on array-like inputs supporting arithmetic
    and comparison operators.

    :param age: donor age (terms are centred on 40, with extra terms below 18
        and above 50)
    :param height: donor height (centred on 170, presumably cm)
    :param weight: donor weight (only contributes below 80, presumably kg)
    :param hypertension: 0/1 flag, coefficient 0.126
    :param diabetes: 0/1 flag, coefficient 0.13
    :param stroke: 0/1 flag, coefficient 0.0881
    :param crea: creatinine in mg/dL
    :param HCV: 0/1 flag, coefficient 0.24
    :param DCD: 0/1 flag, coefficient 0.133
    :return: ``np.exp`` of the summed terms
    """
    # Accumulate the terms one by one, in the same order as the formula.
    log_risk = 0.0128 * (age - 40)
    log_risk = log_risk - 0.0194 * (age - 18) * (age < 18)
    log_risk = log_risk + 0.0107 * (age - 50) * (age > 50)
    log_risk = log_risk - 0.0464 * (height - 170) / 10
    log_risk = log_risk - 0.0199 * (weight - 80) / 5 * (weight < 80)
    log_risk = log_risk + 0.126 * hypertension
    log_risk = log_risk + 0.13 * diabetes
    log_risk = log_risk + 0.0881 * stroke
    log_risk = log_risk + 0.22 * (crea - 1)
    log_risk = log_risk - 0.209 * (crea - 1.5) * (crea > 1.5)
    log_risk = log_risk + 0.24 * HCV
    log_risk = log_risk + 0.133 * DCD
    return np.exp(log_risk)


# Test according to https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6522334/
print(compute_kdri(30, 175, 85, 0, 0, 0, 0.8, 0, 0))
print(compute_kdri(60, 183, 85, 0, 1, 0, 2.0, 0, 0))

In [None]:
# KDRI from the donor's age, height, weight and estimated creatinine;
# all five 0/1 flags are passed as 0.
df_survival['Donor_KDRI'] = compute_kdri(
    age=df_survival['Donor_AgeAtTx'],
    height=df_survival['Donor_Height'],
    weight=df_survival['Donor_Weight'],
    hypertension=0,
    diabetes=0,
    stroke=0,
    crea=df_survival['Donor_Crea'],
    HCV=0,
    DCD=0,
)
# Simplified KDRI that varies only with age: height 170, weight 85 and
# creatinine 1 are constants.
df_survival['Donor_Simple_KDRI'] = compute_kdri(
    age=df_survival['Donor_AgeAtTx'],
    height=170,
    weight=85,
    hypertension=0,
    diabetes=0,
    stroke=0,
    crea=1,
    HCV=0,
    DCD=0,
)

In [None]:
# df_survival.Donor_EGFR2.hist()

In [None]:
# df_survival[['Donor_EGFR', 'Donor_EGFR2', 'Donor_Crea', 'Donor_KDRI']].hist(figsize=(20, 10), bins=50)

## Compute score

In [None]:
# Initialize db
# Uses the project's DbTests fixture (presumably sets up the test app/database
# that the scorers need - confirm in tests.test_utilities.prepare_app_for_tests).
try:
    test = DbTests()
    test.setUp()
except Exception as error:
    # Best-effort: the original message assumes a failure means the cell was
    # already run. Only Exception is caught so that KeyboardInterrupt and
    # SystemExit still propagate, and the real reason is printed so that a
    # genuine set-up error is not mistaken for "already initialized".
    print("Db already initialized")
    print(f"(setUp raised {type(error).__name__}: {error})")

In [None]:
# test.tearDown()

In [None]:
# Raise the root logger threshold so that only warnings and above are processed.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

In [None]:
def row_to_score(row, scorer):
    """Score the transplant between the donor and recipient of one data row.

    :param row: object with ``donor_typization`` and ``recipient_typization``
        attributes, each a string of HLA codes separated by single spaces
    :param scorer: object providing ``score_transplant(donor, recipient, original_donor)``
    :return: the value returned by ``scorer.score_transplant``, or ``None``
        when either typization is an empty string
    """
    raw_donor = row.donor_typization
    raw_recipient = row.recipient_typization

    # Nothing to score when one side has no typization.
    if raw_donor == '' or raw_recipient == '':
        return None

    donor_codes = raw_donor.split(" ")
    recipient_codes = raw_recipient.split(" ")

    def parameters_for(hla_codes):
        # Blood group and country are hard-coded; only the HLA typing differs
        # between donor and recipient.
        return PatientParameters(
            blood_group=BloodGroup.A,
            country_code=Country.CZE,
            hla_typing=create_hla_typing(hla_codes),
        )

    donor = Donor(
        db_id=-1,
        medical_id='donor',
        related_recipient_db_id=1,
        parameters=parameters_for(donor_codes),
    )
    recipient = Recipient(
        db_id=1,
        acceptable_blood_groups=[],
        related_donor_db_id=1,
        medical_id='recipient',
        parameters=parameters_for(recipient_codes),
        # The recipient is created with an empty antibody list.
        hla_antibodies=create_antibodies([]),
    )

    return scorer.score_transplant(
        donor=donor,
        recipient=recipient,
        original_donor=None,
    )

In [None]:
# Create one instance of each scorer that is applied to the data below.
split_scorer = SplitScorer()
high_res_scorer = HighResScorer()
high_res_other_hla_types_scorer = HighResWithDQDPScorer()

In [None]:
# Score every row with each scorer; each scorer fills its own column.
for score_column, scorer in (
    ('split_score', split_scorer),
    ('high_res_score', high_res_scorer),
    ('high_res_2_score', high_res_other_hla_types_scorer),
):
    df_survival[score_column] = df_survival.apply(row_to_score, axis=1, args=(scorer,))

## Export anonymized data

In [None]:
from datetime import timedelta

n_rows = len(df_survival)

# Per-row shift into the past, applied to all three dates of a row so that the
# intervals between them are kept: 180 days plus |N(0, 1)| * 360 days.
# No random seed is set in this cell, so every run produces different shifts.
date_shift = -np.abs(np.random.normal(loc=0.0, scale=1.0, size=n_rows)) * timedelta(days=360) - timedelta(days=180)

# Additional shift (|N(0, 1)| * 180 days) that moves only the start date further back.
start_date_extra_shift = np.abs(np.random.normal(loc=0.0, scale=1.0, size=n_rows)) * timedelta(days=180)

anon_columns = {
    'start_date': (df_survival['StartDate'] + date_shift - start_date_extra_shift).dt.round('d'),
    'last_visit_date': (df_survival['LastVisitDate'] + date_shift).dt.round('d'),
    'end_date': (df_survival['EndDate'] + date_shift).dt.round('d'),
    'end_reason': df_survival['EndReason'],
    'donor_risk_index_simple': df_survival['Donor_Simple_KDRI'].round(4),
    'donor_risk_index_advanced': df_survival['Donor_KDRI'].round(4),
    'transplant_split_score': df_survival['split_score'].round(4),
    'transplant_high_res_score': df_survival['high_res_score'].round(4),
}

# Sort by the shifted start date and renumber the rows so that the original
# row order and index are not exported.
df_anon = pd.DataFrame(anon_columns)
df_anon = df_anon.sort_values(by='start_date').reset_index(drop=True)

df_anon.to_csv('data/kidney_survival_with_score_anon.csv')
df_anon

## Results

In [None]:
def _female_flag(sex):
    """Return 1 for 'F', 0 for 'M' and None for any other value."""
    if sex == 'F':
        return 1
    if sex == 'M':
        return 0
    return None


# Numeric encoding of the donor sex so that it can be used in the plots below.
df_survival['Donor_Is_Female'] = df_survival['Donor_Sex'].apply(_female_flag)
# df_survival

In [None]:
# Columns analysed in the result sections: the three transplant scores first,
# then the donor parameters.
result_cols = [
    'split_score',
    'high_res_score',
    'high_res_2_score',
    'Donor_EGFR',
    'Donor_EGFR2',
    'Donor_Crea',
    'Donor_Simple_KDRI',
    'Donor_KDRI',
    'Donor_AgeAtTx',
    'Donor_Is_Female',
]

### Delay distribution on ended patients

In [None]:
#df_survival_ended = df_survival_filtered[df_survival_filtered.EndDate.notnull()]
# Keep only the rows that have an EndDate, print how many there are and plot
# the distribution of their `delay` column.
has_end_date = df_survival['EndDate'].notna()
df_survival_ended = df_survival[has_end_date]
print(df_survival_ended.shape[0])
df_survival_ended['delay'].hist()

### Distribution

In [None]:
# One histogram per result column, over all rows.
df_survival[result_cols].hist(figsize=(20, 15))

### Score distribution on ended patients

In [None]:
# Same histograms, restricted to the rows that have an EndDate.
df_survival_ended[result_cols].hist(bins=26, figsize=(20, 15))

### Score distribution on ended patients with delay < 10 years

In [None]:
# Only rows with delay below 3650 (presumably days, i.e. about 10 years as in
# the section title - confirm the unit in parse_survival_data).
df_survival_ended[df_survival_ended.delay<3650][result_cols].hist(bins=26, figsize=(20, 15))

### Dependency between delay and score on ended patients

In [None]:
# For every result column: correlation with `delay` (shown in the plot title)
# and a scatter plot of the column against `delay`.
ended_delay = df_survival_ended['delay']
for col in result_cols:
    corr = df_survival_ended[col].corr(ended_delay)
    df_survival_ended.plot.scatter(
        x='delay',
        y=col,
        title=f"corr = {corr:.4f}",
    )

### Ended patients with the smallest delay

In [None]:
# First rows after sorting by ascending delay, i.e. the smallest delays.
df_survival_ended.sort_values(by='delay', ascending=True).head()