# Investigation of old transplant data

To run this notebook, copy the following into the `data` directory:
- old patient data (`KDP-processed` directory)
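- `LD_kidney_survival_with_donor_params.csv` (the file section B actually reads; previously listed here as `LD_kidney_survival.csv`)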
- `patients_list_recipientID.csv`

These files can be found in our GDrive (Produkty > TXM > Pairing data > Old matchings).

## Imports

In [None]:
import pandas as pd
import os
from dataclasses import dataclass, asdict
from typing import List
import re
import numpy as np
import sys
import math

In [None]:
# Put the directory two levels above the current working directory first on
# the import search path, so that the `local_testing_utilities` package
# imported below can be found (presumably this is the repository root --
# confirm against the repo layout).
sys.path.insert(0, "../..")

from local_testing_utilities.notebook_utils.pairing_data import parse_pairing_data
from local_testing_utilities.notebook_utils.survival_data import parse_survival_data

## A) Load old patients data

In [None]:
# Load the old pairing data from `data/KDP-processed` together with the
# recipient-ID list. NOTE(review): `remove_single_donors=True` presumably drops
# donors that have no paired recipient -- confirm in `parse_pairing_data`.
df_all_patients = parse_pairing_data('data/KDP-processed', 'data/patients_list_recipientID.csv', remove_single_donors=True)

# Preview of the combined table: all patients for all txm events
df_all_patients.head()
# To inspect a single txm event instead, run e.g.:
#df_all_patients.loc[df_all_patients.txm_event == 31]

# The xlsx files were checked by hand to confirm that the data were parsed up to the last Czech patient:
# df_all_patients.groupby(['txm_event']).last()

Patients without recipient id

In [None]:
# Show every row whose recipient_id is missing.
df_all_patients[df_all_patients['recipient_id'].isna()]

In [None]:
# Print the total row count, then the number of distinct values (or distinct
# value combinations) for several identifying column sets. A `None` entry
# marks a position where a "---" separator line is printed.
_count_plan = [
    None,
    ['donor_name'],
    ['donor_typization'],
    ['donor_name', 'donor_typization'],
    None,
    ['recipient_name'],
    ['recipient_typization'],
    ['recipient_name', 'recipient_typization'],
    None,
    ['recipient_luminex_2'],
    ['recipient_luminex_2', 'recipient_typization'],
]

print(len(df_all_patients.index))
for _key_columns in _count_plan:
    if _key_columns is None:
        print("---")
        continue
    # One row per group after `first()`, so the index length is the group count.
    print(len(df_all_patients.groupby(_key_columns).first().index))

### Export unique patients
This was used for exporting patients for Kahle

In [None]:
# pd.set_option('display.max_rows', 500)
# df_unique_patients = df_all_patients[~df_all_patients.recipient_year_of_birth.isin(['nan', 'x'])].groupby(['recipient_name', 'recipient_year_of_birth']).first()[[]]
# df_unique_patients.to_csv('patients_list.csv')
# display(df_unique_patients)
# pd.reset_option('display.max_rows')

### Problematic data

#### 1. Donors with the same name but different typization

In [None]:
# Self-join the table on donor name: this yields every pair of rows that share
# a donor name. The strict `<` keeps only pairs whose typizations differ and
# keeps each unordered pair once.
_donors_by_name = df_all_patients.set_index('donor_name')
_problematic_patients = _donors_by_name.join(_donors_by_name, lsuffix='_l', rsuffix='_r')\
    .loc[lambda x: x.donor_typization_l < x.donor_typization_r]

print(len(_problematic_patients.index))
# For each (name, typization pair) list the txm events involved. The set
# removes duplicates; `sorted` makes the listing deterministic (plain set
# iteration order is not), consistent with sections 3 and 4 of this notebook.
_problematic_patients[['txm_event_l', 'donor_typization_l', 'donor_typization_r', 'txm_event_r', ]]\
 .groupby(['donor_name', 'donor_typization_l', 'donor_typization_r'])\
 .agg(lambda x: ",".join(sorted({str(i) for i in x})))

#### 2. Recipients with the same name but different typization

In [None]:
# Self-join the table on recipient name: this yields every pair of rows that
# share a recipient name. The strict `<` keeps only pairs whose typizations
# differ and keeps each unordered pair once.
_recipients_by_name = df_all_patients.set_index('recipient_name')
_problematic_patients = _recipients_by_name.join(_recipients_by_name, lsuffix='_l', rsuffix='_r')\
    .loc[lambda x: x.recipient_typization_l < x.recipient_typization_r]

print(len(_problematic_patients.index))
# For each (name, typization pair) list the txm events involved. The set
# removes duplicates; `sorted` makes the listing deterministic (plain set
# iteration order is not), consistent with sections 3 and 4 of this notebook.
_problematic_patients[['txm_event_l', 'recipient_typization_l', 'recipient_typization_r', 'txm_event_r', ]]\
 .groupby(['recipient_name', 'recipient_typization_l', 'recipient_typization_r'])\
 .agg(lambda x: ",".join(sorted({str(i) for i in x})))

#### 3. Donors with the same typization but different name

In [None]:
# Pair up all rows that share a donor typization (self-join on that column),
# then keep the pairs whose donor names differ; the strict `<` reports each
# unordered name pair once.
_donors_by_typization = df_all_patients.set_index('donor_typization')
_donor_pairs = _donors_by_typization.join(_donors_by_typization, lsuffix='_l', rsuffix='_r')
_problematic_patients = _donor_pairs.loc[_donor_pairs.donor_name_l < _donor_pairs.donor_name_r]

print(len(_problematic_patients.index))
# Per (name pair, typization): the distinct txm events, sorted and comma-joined.
(
    _problematic_patients[['txm_event_l', 'donor_name_l', 'donor_name_r', 'txm_event_r']]
    .groupby(['donor_name_l', 'donor_name_r', 'donor_typization'])
    .agg(lambda x: ",".join(sorted(set(map(str, x)))))
)

#### 4. Recipients with the same typization but different name

In [None]:
# Pair up all rows that share a recipient typization (self-join on that
# column), then keep the pairs whose recipient names differ; the strict `<`
# reports each unordered name pair once.
_recipients_by_typization = df_all_patients.set_index('recipient_typization')
_recipient_pairs = _recipients_by_typization.join(_recipients_by_typization, lsuffix='_l', rsuffix='_r')
_problematic_patients = _recipient_pairs.loc[
    _recipient_pairs.recipient_name_l < _recipient_pairs.recipient_name_r
]

print(len(_problematic_patients.index))
# Missing values are replaced by -1 before grouping; then, per (name pair,
# typization), the distinct txm events are sorted and comma-joined.
(
    _problematic_patients[['txm_event_l', 'recipient_name_l', 'recipient_name_r', 'txm_event_r']]
    .fillna(-1)
    .groupby(['recipient_name_l', 'recipient_name_r', 'recipient_typization'])
    .agg(lambda x: ",".join(sorted(set(map(str, x)))))
)

### Plotting

In [None]:
%matplotlib inline

a = df_all_patients.groupby(['recipient_id']).count().groupby(['txm_event']).count()['donor_blood_group']
a.plot.bar(title='# patients in # rounds', figsize=(10, 5))

In [None]:
# For every txm event, count the distinct values of the identifying columns
# and draw them as grouped bars.
_identifying_columns = ['recipient_id', 'recipient_typization', 'recipient_name', 'donor_typization', 'donor_name']
_distinct_per_event = df_all_patients.groupby(['txm_event']).nunique()
_distinct_per_event[_identifying_columns].plot.bar(
    figsize=(20, 5),
    title='Number of specified values for each txm_event',
)

In [None]:
# Add a constant column of ones so that summing it in a pivot table counts
# how many rows each (recipient, txm event) combination has.
_rows_with_counter = df_all_patients.assign(one=1)

# The string 'sum' is used instead of the `np.sum` callable: passing the NumPy
# callable as an aggregation function is deprecated in pandas 2.x and emits a
# FutureWarning, while 'sum' selects the same aggregation.
# Recipients as rows, txm events as columns; absent combinations become 0.
df_patients_to_event = pd.pivot_table(_rows_with_counter, values='one', index=['recipient_id'], columns=['txm_event'], aggfunc='sum', fill_value=0)
# Same counts with the axes swapped: txm events as rows, recipients as columns.
df_event_to_patients = pd.pivot_table(_rows_with_counter, values='one', index=['txm_event'], columns=['recipient_id'], aggfunc='sum', fill_value=0)
# One stacked area per recipient along the txm-event axis.
df_event_to_patients.plot.area(figsize=(20,10), legend=False, title='In what txm events each patient was')

In [None]:
# Uncomment to show the above in table
# df_patients_to_event

## B) Kidney survival data

In [None]:
# Load the kidney survival data. Later cells use the result as a DataFrame and
# read its `EndReason`, `delay`, `RecipientID` and `StartDate` columns.
df_survival = parse_survival_data('data/LD_kidney_survival_with_donor_params.csv')
# Preview the first rows.
df_survival.head()

### Closed transplants with smallest delay

In [None]:
# A transplant counts as closed when an end reason is recorded; show the
# closed ones with the smallest `delay` first (default `head()` size).
_closed_transplants = df_survival[df_survival['EndReason'].notna()]
_closed_transplants.sort_values(by='delay').head()

## C) Join both data

### C.1. Preprocess patients

In [None]:
# Columns that together identify one donor/recipient pair across txm events.
_pair_key_columns = [
    'donor_name', 'donor_typization', 'donor_blood_group', 'donor_relationship',
    'recipient_name', 'recipient_typization', 'recipient_id',
]

# Collapse the per-event rows into one row per pair; `txm_events` becomes the
# comma-separated list of events the pair appears in (in row order).
# Note: with pandas' default grouping, rows with a missing value in any key
# column (e.g. no recipient_id) are left out of the summary.
df_patients_summary = (
    df_all_patients
    .groupby(_pair_key_columns)[['txm_event']]
    .agg(lambda x: ",".join([str(i) for i in x]))
    .reset_index()
    .rename(columns={
        'donor_name': 'orig_donor_name',
        'donor_typization': 'orig_donor_typization',
        'donor_blood_group': 'orig_donor_blood_group',
        'donor_relationship': 'orig_donor_relationship',
        # 'recipient_luminex_2': 'recipient_luminex',
        'txm_event': 'txm_events',
    })
)

# Use the highest event number rather than the last list element, so the value
# does not depend on the rows of `df_all_patients` being ordered by txm event.
df_patients_summary['last_txm_event'] = df_patients_summary['txm_events'].apply(
    lambda events: max(int(event) for event in str(events).split(","))
).astype('int32')

print("Number of patients:")
print(len(df_patients_summary.index))
print(len(df_patients_summary.recipient_id.unique()))
df_patients_summary#.head()

### C.2. Preprocess survival data

In [None]:
# No survival-specific preprocessing is applied. This binds a second name to
# the same DataFrame object (it is not a copy), so in-place modifications made
# through either name are visible through both.
df_survival_summary = df_survival

### C.3. Join both data

In [None]:
# Index both tables by the recipient identifier and inner-join them, so only
# recipients present in both the pairing summary and the survival data remain.
# Survival columns whose names clash with summary columns get the `_r` suffix.
_summary_by_recipient = df_patients_summary.set_index('recipient_id')
_survival_by_recipient = df_survival_summary.set_index('RecipientID')
df_joined = _summary_by_recipient.join(_survival_by_recipient, how='inner', rsuffix='_r').reset_index()

print("Number of transplanted patients:", len(df_joined.index), sep="\n")

### Oldest transplant in joined data

In [None]:
# Order the joined rows by transplant start date (ascending), so the first
# row is the oldest transplant.
df_joined_oldest = df_joined.sort_values(by='StartDate')
_oldest_row = df_joined_oldest.iloc[0]
oldest_start_date = _oldest_row['StartDate']
print(oldest_start_date)
# Display that oldest row as a one-row table.
df_joined_oldest.head(1)

### Number of survival records that are more recent than the oldest transplant

In [None]:
# Keep survival rows that started on or after the oldest joined transplant,
# ordered by start date, and print how many there are.
_started_since_oldest = df_survival_summary['StartDate'] >= oldest_start_date
df_survival_summary_recent = df_survival_summary[_started_since_oldest].sort_values(by='StartDate')
print(len(df_survival_summary_recent))

In [None]:
# Plot the transplant start date against the last txm event of each joined
# row; the "." style draws individual points without connecting lines.
df_joined.plot(x='last_txm_event', y='StartDate', style=".", figsize=(10, 7))