# Excel Quality Pipeline – Discovery Notebook

## Objective
Build a reproducible data pipeline that produces a single cleaned
quality dataset from multiple Excel inputs.

This notebook is for exploration and validation only.
Final logic will be migrated into the `src/` module.


In [181]:
import numpy as np
import pandas as pd
pd.__version__


'2.3.3'

In [182]:
# Load the raw OPD visits workbook (path is relative to the notebook's working
# directory) and show its (rows, columns) shape.
file_path = "../data/raw/PATIENT OPD VISITS- from MCC & Email.xlsx"
df = pd.read_excel(file_path)
df.shape

(4204, 18)

In [183]:
# Keep the raw column names as a plain Python list for reference.
header = list(df.columns)

In [184]:
# Show the raw column names captured from the visits export.
print(header)

['MRN', 'PatEngName', 'pat_birthdate', 'regtime', 'VISIT_DATETIME', 'ARRIVETIME', 'STARTTIME', 'VISIT_TYPE', 'PORTAL STATUS', 'VISIT STATUS', 'BOOKING_TYPE', 'clinic_key', 'CLINIC_NAME', 'MRP', 'JOB TITLE', 'SPECIALTY', 'Added_by', 'addtime']


In [185]:
# Dtypes and non-null counts per column of the raw visits frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4204 entries, 0 to 4203
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   MRN             4204 non-null   int64         
 1   PatEngName      4094 non-null   object        
 2   pat_birthdate   4047 non-null   datetime64[ns]
 3   regtime         4204 non-null   datetime64[ns]
 4   VISIT_DATETIME  4047 non-null   datetime64[ns]
 5   ARRIVETIME      3314 non-null   datetime64[ns]
 6   STARTTIME       3208 non-null   datetime64[ns]
 7   VISIT_TYPE      4204 non-null   object        
 8   PORTAL STATUS   4047 non-null   object        
 9   VISIT STATUS    4047 non-null   object        
 10  BOOKING_TYPE    4047 non-null   object        
 11  clinic_key      4047 non-null   float64       
 12  CLINIC_NAME     4204 non-null   object        
 13  MRP             4204 non-null   object        
 14  JOB TITLE       4047 non-null   object        
 15  SPEC

In [186]:
# Columns of the raw export that are not carried into the cleaned dataset.
cols_to_drop = [
    "regtime", "ARRIVETIME", "STARTTIME",
    "PORTAL STATUS", "clinic_key", "JOB TITLE",
    "SPECIALTY", "Added_by", "addtime",
]

df = df.drop(cols_to_drop, axis="columns")


In [187]:
# Confirm which columns remain after the drop.
df.columns

Index(['MRN', 'PatEngName', 'pat_birthdate', 'VISIT_DATETIME', 'VISIT_TYPE',
       'VISIT STATUS', 'BOOKING_TYPE', 'CLINIC_NAME', 'MRP'],
      dtype='object')

In [188]:
# Standardise headers: trim surrounding whitespace, lower-case, spaces -> underscores.
df = df.rename(columns=lambda label: label.strip().lower().replace(" ", "_"))
df.columns

Index(['mrn', 'patengname', 'pat_birthdate', 'visit_datetime', 'visit_type',
       'visit_status', 'booking_type', 'clinic_name', 'mrp'],
      dtype='object')

In [189]:
# Check the dtype of visit_datetime before converting it.
df['visit_datetime'].dtype

dtype('<M8[ns]')

In [190]:
# Keep only the calendar date of each visit. The values become Python
# `datetime.date` objects (object dtype), not DD/MM/YYYY strings; anything that
# cannot be parsed is coerced to NaT.
visit_timestamps = pd.to_datetime(df["visit_datetime"], errors="coerce")
df["visit_datetime"] = visit_timestamps.dt.date


In [191]:
# Only the last expression of a cell is displayed, so the dtype on the next
# line is evaluated but not shown; the cell output is the element type.
df["visit_datetime"].dtype
type(df.loc[0, "visit_datetime"])


datetime.date

In [192]:
# Placeholder columns; they are filled from the transplant registries further down.
new_cols = ["liver_r_d", "liver_status", "kidney_recipient", "kidney_donor"]

df = df.assign(**{col: None for col in new_cols})

df.columns

Index(['mrn', 'patengname', 'pat_birthdate', 'visit_datetime', 'visit_type',
       'visit_status', 'booking_type', 'clinic_name', 'mrp', 'liver_r_d',
       'liver_status', 'kidney_recipient', 'kidney_donor'],
      dtype='object')

In [193]:
# Preview the frame with the empty placeholder columns appended.
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
0,218455,NORAH BAHIAS ALI ZARAB,1987-04-16,2025-12-23,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
1,118631,ZAHRA MAHDI ALI ALAWAZIM,2003-02-24,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
2,219640,SULTANAH HAMAD ABDULLAH ALHASSAN,1959-01-10,2025-12-08,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
3,220961,SEHAM MOHAMMED KHAMEES ALHASSAN,1969-06-03,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
4,220533,ABDULAZIZ ADNAN SALEH ALHADAD,1978-09-30,2025-12-21,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,


In [194]:
# Distribution of visit_status before filtering (missing values are not counted).
df['visit_status'].value_counts()

visit_status
STARTED    3208
NO SHOW     733
ARRIVED     106
Name: count, dtype: int64

In [195]:
# Exclude no-show visits. Rows whose visit_status is missing do not equal
# "NO SHOW" and are therefore kept.
is_no_show = df["visit_status"].eq("NO SHOW")
df = df.loc[~is_no_show]


In [196]:
# Verify that NO SHOW no longer appears among the statuses.
df['visit_status'].value_counts()

visit_status
STARTED    3208
ARRIVED     106
Name: count, dtype: int64

In [197]:
# Liver Registry lookup (the ONLY source for the liver columns).
# For every MRN in df:
#   - found in the registry -> Rec_Don         goes to liver_r_d
#                              Current Status1 goes to liver_status
#   - not found             -> both stay NaN

liver_df = pd.read_excel("../data/raw/Liver Registry - 1 JAN.xlsx")
liver_df = liver_df.set_axis(
    liver_df.columns.str.strip().str.lower().str.replace(" ", "_"),
    axis="columns",
)



In [198]:
# List the normalized Liver Registry column names.
liver_df.columns.tolist()



['r_mrn',
 'id_number',
 'patient_name',
 'mobile1_0',
 'payment_0',
 'rec_don',
 'nationality',
 'city_of_residence',
 'specify_other',
 'age_group',
 'age',
 'blood_group',
 'gender',
 'date_open_file_0',
 'current_status1',
 'patient_type_1',
 'initial_tx_assessment_date',
 'workup_start_date',
 'workup_completed_date',
 'date_added_to_waiting_list',
 'surgery_date',
 'tx_type',
 'transplant_place',
 'comment_0',
 'closefile_r',
 'closefiledate_r',
 'closefilereason_r',
 'assigned_consultant_0',
 'assigned_surgeon_r',
 'primary_disease']

In [199]:
# Attach the Liver Registry role (rec_don) and status to every visit.
# A left join on a registry with repeated MRNs would fan one visit out into
# several rows, so the lookup is reduced to one row per MRN first. Registry
# rows without an MRN are dropped as well: `mrn` in the visits frame is never
# missing, so they could not match anything.
liver_lookup = liver_df[["r_mrn", "rec_don", "current_status1"]].dropna(subset=["r_mrn"])
n_liver_dupes = int(liver_lookup["r_mrn"].duplicated().sum())
if n_liver_dupes:
    # NOTE(review): keeping the last entry is an arbitrary tie-break - confirm
    # which registry row should win when an MRN is listed more than once.
    print(f"Liver Registry: {n_liver_dupes} duplicate MRN row(s) dropped (last entry kept).")
    liver_lookup = liver_lookup.drop_duplicates(subset="r_mrn", keep="last")

df = df.merge(
    liver_lookup,
    left_on="mrn",
    right_on="r_mrn",
    how="left",
    validate="many_to_one",  # raise instead of silently duplicating visits
)


In [200]:
# Copy the merged registry fields into the placeholder liver columns.
df = df.assign(
    liver_r_d=df["rec_don"],
    liver_status=df["current_status1"],
)
df.tail()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor,r_mrn,rec_don,current_status1
3466,203844,Abdullah,NaT,NaT,Follow up,,,Psychology,Alia Al Enazi,Recipient,Post Tx,,,203844.0,Recipient,Post Tx
3467,221651,MOHAMMED,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,,,,
3468,221730,Abdulqawi,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,,,,
3469,40991,SHATHA,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,,,,
3470,107061,ABDULLAH,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,,,,


In [201]:
# Remove the helper columns brought in by the Liver Registry merge.
df = df.drop(["r_mrn", "rec_don", "current_status1"], axis="columns")


In [202]:
# Visits whose MRN was found in the Liver Registry.
df.loc[df["liver_r_d"].notna()]


Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
12,104431,SAIF RAJA H ALHAJRI,1976-06-28,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,Recipient,Hepatology patient,,
20,220797,BASHAYER FAWAZ ALSAEGR,1992-04-17,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Pre Tx ready for transplant,,
21,220503,ALHASSAN ABDULRAHIM ABDULLAH ALZAHRANI,1999-08-20,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Post Tx,,
22,220468,SAAD SALEM M ALMAHRI,1986-03-17,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Post Tx,,
23,220993,IBRAHIM ABDULKARIM IBRAHIM ALHABIB,1993-09-13,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Pre Tx ready for transplant,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3454,221422,MESHARI,NaT,NaT,New,,,Psychology,Alia Al Enazi,Donor,Pre Tx on workups,,
3455,221756,FAHAD,NaT,NaT,New,,,Psychology,Alia Al Enazi,Donor,Pre Tx on workups,,
3456,218992,HISSAH,NaT,NaT,New,,,Psychology,Alia Al Enazi,Recipient,Pre Tx on workups,,
3461,221468,WADI,NaT,NaT,New,,,Psychology,Alia Al Enazi,Donor,Pre Tx on workups,,


In [203]:
# Kidney Recipient registry lookup (the ONLY source for kidney_recipient).
# For every mrn in the main df:
#   - found in "Kidney Recipient - 1 JAN.xlsx" -> Current Status1 goes to kidney_recipient
#   - not found                                 -> kidney_recipient stays NaN

kidney_rec_df = pd.read_excel("../data/raw/Kidney Recipient - 1 JAN.xlsx")
kidney_rec_df = kidney_rec_df.set_axis(
    kidney_rec_df.columns.str.strip().str.lower().str.replace(" ", "_"),
    axis="columns",
)


In [204]:
# Attach the Kidney Recipient registry status to every visit.
# A left join on a registry with repeated MRNs would fan one visit out into
# several rows, so the lookup is reduced to one row per MRN first. Registry
# rows without an MRN are dropped as well: `mrn` in the visits frame is never
# missing, so they could not match anything.
kidney_rec_lookup = kidney_rec_df[["r_mrn", "current_status1"]].dropna(subset=["r_mrn"])
n_recipient_dupes = int(kidney_rec_lookup["r_mrn"].duplicated().sum())
if n_recipient_dupes:
    # NOTE(review): keeping the last entry is an arbitrary tie-break - confirm
    # which registry row should win when an MRN is listed more than once.
    print(f"Kidney Recipient registry: {n_recipient_dupes} duplicate MRN row(s) dropped (last entry kept).")
    kidney_rec_lookup = kidney_rec_lookup.drop_duplicates(subset="r_mrn", keep="last")

df = df.merge(
    kidney_rec_lookup,
    left_on="mrn",
    right_on="r_mrn",
    how="left",
    validate="many_to_one",  # raise instead of silently duplicating visits
)
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor,r_mrn,current_status1
0,218455,NORAH BAHIAS ALI ZARAB,1987-04-16,2025-12-23,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,
1,118631,ZAHRA MAHDI ALI ALAWAZIM,2003-02-24,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,
2,219640,SULTANAH HAMAD ABDULLAH ALHASSAN,1959-01-10,2025-12-08,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,
3,220961,SEHAM MOHAMMED KHAMEES ALHASSAN,1969-06-03,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,
4,220533,ABDULAZIZ ADNAN SALEH ALHADAD,1978-09-30,2025-12-21,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,


In [205]:
# Move the merged status into kidney_recipient, then discard the merge helpers.
df = (
    df.assign(kidney_recipient=df["current_status1"])
      .drop(["r_mrn", "current_status1"], axis="columns")
)



In [206]:
# Inspect the last rows after kidney_recipient has been filled.
df.tail()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
3466,203844,Abdullah,NaT,NaT,Follow up,,,Psychology,Alia Al Enazi,Recipient,Post Tx,Nephrology patient,
3467,221651,MOHAMMED,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,
3468,221730,Abdulqawi,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,
3469,40991,SHATHA,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,
3470,107061,ABDULLAH,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,Post LD Tx,


In [207]:
# Fill kidney_donor from the Kidney Donor registry.
# For each mrn in the main df:
#   - match it to d_mrn in "Kidney Donor - 1 JAN.xlsx"
#   - copy current_status1_d -> kidney_donor
#   - no match -> kidney_donor stays NaN

kidney_donor_df = pd.read_excel("../data/raw/Kidney Donor - 1 JAN.xlsx")

kidney_donor_df.columns = (
    kidney_donor_df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
)

# A left join on a registry with repeated MRNs would fan one visit out into
# several rows, so the lookup is reduced to one row per MRN first. Registry
# rows without an MRN are dropped as well: `mrn` in the visits frame is never
# missing, so they could not match anything.
donor_lookup = kidney_donor_df[["d_mrn", "current_status1_d"]].dropna(subset=["d_mrn"])
n_donor_dupes = int(donor_lookup["d_mrn"].duplicated().sum())
if n_donor_dupes:
    # NOTE(review): keeping the last entry is an arbitrary tie-break - confirm
    # which registry row should win when an MRN is listed more than once.
    print(f"Kidney Donor registry: {n_donor_dupes} duplicate MRN row(s) dropped (last entry kept).")
    donor_lookup = donor_lookup.drop_duplicates(subset="d_mrn", keep="last")

df = df.merge(
    donor_lookup,
    left_on="mrn",
    right_on="d_mrn",
    how="left",
    validate="many_to_one",  # raise instead of silently duplicating visits
)

df["kidney_donor"] = df["current_status1_d"]

df = df.drop(columns=["d_mrn", "current_status1_d"])


In [208]:
# Preview the frame after kidney_donor has been filled.
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
0,218455,NORAH BAHIAS ALI ZARAB,1987-04-16,2025-12-23,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
1,118631,ZAHRA MAHDI ALI ALAWAZIM,2003-02-24,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
2,219640,SULTANAH HAMAD ABDULLAH ALHASSAN,1959-01-10,2025-12-08,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
3,220961,SEHAM MOHAMMED KHAMEES ALHASSAN,1969-06-03,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
4,220533,ABDULAZIZ ADNAN SALEH ALHADAD,1978-09-30,2025-12-21,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,


In [209]:
# Number of visits per clinic.
df['clinic_name'].value_counts()

clinic_name
Anesthesia                                     840
Clinical Nutrition                             556
Adult Post Renal Transplant                    500
TRANSPLANT SURGERY                             458
Adult Post Kidney Transplant Virtual Clinic    264
Adult Pre Renal Transplant                     133
ADULT POST LIVER TRANSPLANT                    131
Social Work                                    110
Pediatric Post Renal Transplant                102
Adult Pre Liver Transplant                      97
Pediatric Liver Transplant                      66
Transplant Dressing (procedure)                 54
Psychology                                      47
Adult Post Liver Transplant Virtual             42
Adult Pre Renal Transplant Virtual Clinic       37
Pre anesthesia liver transplant                 15
Pediatric Pre Renal Transplant                  11
Adult Pre Liver Transplant Virtual               5
Transplant Cystoscopy Clinic                     3
Name: count, dtype:

In [210]:
# Remove visits based on clinic + transplant status (ONLY).
#
# A row is removed when ALL of the following hold:
#   1. clinic_name is Anesthesia OR Clinical Nutrition
#   2. liver_r_d is NaN
#   3. kidney_recipient is NaN
#   4. kidney_donor is NaN

in_support_clinic = df["clinic_name"].isin(["Anesthesia", "Clinical Nutrition"])
no_registry_match = (
    df[["liver_r_d", "kidney_recipient", "kidney_donor"]].isna().all(axis=1)
)
mask = in_support_clinic & no_registry_match

df = df.loc[~mask]


In [211]:
# Sanity check: count Anesthesia / Clinical Nutrition rows that still have no
# registry match at all (expected to be 0 after the filter).
(
    df.loc[
        df["clinic_name"].isin(["Anesthesia", "Clinical Nutrition"]),
        ["liver_r_d", "kidney_recipient", "kidney_donor"],
    ]
    .isna()
    .all(axis=1)
    .sum()
)


np.int64(0)

In [212]:
# Preview the frame after the clinic / registry filter.
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
7,174153,AYSH ABDO HAMAD HADIDI,1955-02-23,2025-12-23,FOLLOW UP,STARTED,NORMAL BOOKING,Adult Post Renal Transplant,SUMAYAH ABDULAZIZ MOHAMMED ASKANDARANI,,,Post LD Tx,
9,77521,AMAL SULAIMAN OUDAH ALHAWITI,1986-09-28,2025-12-17,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,Post DD Tx,
12,104431,SAIF RAJA H ALHAJRI,1976-06-28,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,Recipient,Hepatology patient,Post DD Tx,
13,101282,NOUF SAUD MOHAMMED ALDAWSARI,1982-11-29,2025-12-14,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,Post LD Tx,
17,6364,MOHAMMED BEN ALI BIN SULAIMAN AL JAFAR,1982-08-20,2025-12-21,FOLLOW UP,STARTED,NORMAL BOOKING,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,Post LD Tx,


In [213]:
# Remaining visits with no match in any of the three registries.
df.loc[~df[["liver_r_d", "kidney_recipient", "kidney_donor"]].notna().any(axis=1)]

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
530,13459,SHAIKHAH MOHAMMED A ALSAMHAN,1940-08-04,2025-12-15,NEW,STARTED,NORMAL BOOKING,Adult Pre Liver Transplant,Eyad Mohammad Osman Elhassan Gadour,,,,
1166,221813,MOHAMMAD MOQBIL ABDULLAH ALQARAWI,1959-01-10,2025-12-28,NEW,STARTED,NORMAL BOOKING,Adult Pre Liver Transplant,HADI MOHAMMED SHOEI KURIRY,,,,
1169,218567,REEM ABDULHAKIM HAMAD ALKHALDI,2010-07-14,2025-12-02,Walk-In,STARTED,WALK-IN,Pediatric Liver Transplant,Razan Monther Abdulmajied Bader,,,,
2335,8909,BANEEN ABDULLAH MOHAMMED ALSAIHATI,1986-05-17,2025-12-29,FOLLOW UP,STARTED,NORMAL BOOKING,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,,
2744,200160,AHMED MOHAMED ALY AHMED ZIDAN,1981-07-25,2025-12-31,CHART CHECK,STARTED,WALK-IN,TRANSPLANT SURGERY,AHMED MOHAMED ALY AHMED ZIDAN,,,,
2756,6097,REHAB AHMED GABER ABDALL,1978-06-10,2025-12-28,CHART CHECK,STARTED,WALK-IN,TRANSPLANT SURGERY,AHMED MOHAMED ALY AHMED ZIDAN,,,,
3018,220426,SALMAN OMAR JAFAR ALOTAIBI,1986-09-14,2025-12-09,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,,,,
3041,200362,NISY MATHEWS,1998-08-15,2025-12-16,Walk-In,STARTED,WALK-IN,Adult Pre Renal Transplant,KHALID AHMEDH BELEED AKKARI,,,,
3170,210421,Devika Reghu,1996-05-04,2025-12-29,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,,


In [214]:
# need_review is "yes" for visits with no match in any of the three registries,
# "no" for everything else.
review_mask = ~df[["liver_r_d", "kidney_recipient", "kidney_donor"]].notna().any(axis=1)

df["need_review"] = review_mask.map({True: "yes", False: "no"})


In [215]:
# Counts of liver recipients vs donors among the remaining visits.
df['liver_r_d'].value_counts()


liver_r_d
Recipient    547
Donor        125
Name: count, dtype: int64

In [216]:
# Create and fill r_d (LOCKED LOGIC)
#
# Inputs: liver_r_d, kidney_recipient, kidney_donor.
# A) Exactly one of the three is present:
#      liver_r_d only        -> r_d = "Liver " + liver_r_d
#      kidney_recipient only -> r_d = kidney_recipient
#      kidney_donor only     -> r_d = kidney_donor
# B) liver_r_d present together with kidney_recipient and/or kidney_donor:
#      clinic_name contains "renal" or "kidney" (and not "liver")
#          -> r_d = kidney_recipient, falling back to kidney_donor
#      clinic_name contains "liver" (and not "renal"/"kidney")
#          -> r_d = "Liver " + liver_r_d
# Every other combination leaves r_d as NaN.

has_liver = df["liver_r_d"].notna()
has_k_recip = df["kidney_recipient"].notna()
has_k_donor = df["kidney_donor"].notna()

# Case A: exactly one source is present.
mask_liver_only = has_liver & ~has_k_recip & ~has_k_donor
mask_k_recip_only = has_k_recip & ~has_liver & ~has_k_donor
mask_k_donor_only = has_k_donor & ~has_liver & ~has_k_recip

liver_label = "Liver " + df["liver_r_d"].astype(str)

df["r_d"] = np.select(
    condlist=[mask_liver_only, mask_k_recip_only, mask_k_donor_only],
    choicelist=[liver_label, df["kidney_recipient"], df["kidney_donor"]],
    default=np.nan,
)

# Case B: liver plus at least one kidney entry, still unresolved after case A.
mixed_mask = has_liver & (has_k_recip | has_k_donor) & df["r_d"].isna()

clinic_lower = df["clinic_name"].fillna("").str.lower()
renal_clinic = clinic_lower.str.contains("renal|kidney")
liver_clinic = clinic_lower.str.contains("liver")

use_kidney = mixed_mask & renal_clinic & ~liver_clinic
use_liver = mixed_mask & liver_clinic & ~renal_clinic

# Renal/kidney clinic: recipient status preferred, donor status as fallback.
df.loc[use_kidney, "r_d"] = df.loc[use_kidney, "kidney_recipient"].fillna(
    df.loc[use_kidney, "kidney_donor"]
)

# Liver clinic: use the liver role.
df.loc[use_liver, "r_d"] = liver_label[use_liver]




In [217]:
# Preview the frame with the new r_d column.
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor,need_review,r_d
7,174153,AYSH ABDO HAMAD HADIDI,1955-02-23,2025-12-23,FOLLOW UP,STARTED,NORMAL BOOKING,Adult Post Renal Transplant,SUMAYAH ABDULAZIZ MOHAMMED ASKANDARANI,,,Post LD Tx,,no,Post LD Tx
9,77521,AMAL SULAIMAN OUDAH ALHAWITI,1986-09-28,2025-12-17,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,Post DD Tx,,no,Post DD Tx
12,104431,SAIF RAJA H ALHAJRI,1976-06-28,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,Recipient,Hepatology patient,Post DD Tx,,no,
13,101282,NOUF SAUD MOHAMMED ALDAWSARI,1982-11-29,2025-12-14,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,Post LD Tx,,no,Post LD Tx
17,6364,MOHAMMED BEN ALI BIN SULAIMAN AL JAFAR,1982-08-20,2025-12-21,FOLLOW UP,STARTED,NORMAL BOOKING,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,Post LD Tx,,no,Post LD Tx


In [218]:
# Distribution of liver_status values.
# NOTE(review): the output lists "Pre Tx ready for transplant" and
# "Pre Tx Ready for transplant" separately; they differ only in letter case.
df['liver_status'].value_counts()

liver_status
Post Tx                        366
Hepatology patient             111
Pre Tx on workups               82
New patient                     47
Pre Tx ready for transplant     24
OPD Hepatology patient          16
Pre Tx on DDWL                  16
Pre Tx Ready for transplant      6
Hepatitis C patient              2
Pre Tx on hold                   2
Name: count, dtype: int64

In [219]:
# TODO: revisit at the end of the notebook. Count the rows that matched the
# Liver Registry AND a kidney registry but still have no r_d, and decide how
# many of them should be flagged as need_review. Draft check (kept disabled):
#
# ambiguous_mask = (
#     df['liver_r_d'].notna() &
#     (df['kidney_recipient'].notna() | df['kidney_donor'].notna()) &
#     df['r_d'].isna()
# )
#
# len(df.loc[ambiguous_mask])

df.head(30)

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor,need_review,r_d
7,174153,AYSH ABDO HAMAD HADIDI,1955-02-23,2025-12-23,FOLLOW UP,STARTED,NORMAL BOOKING,Adult Post Renal Transplant,SUMAYAH ABDULAZIZ MOHAMMED ASKANDARANI,,,Post LD Tx,,no,Post LD Tx
9,77521,AMAL SULAIMAN OUDAH ALHAWITI,1986-09-28,2025-12-17,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,Post DD Tx,,no,Post DD Tx
12,104431,SAIF RAJA H ALHAJRI,1976-06-28,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,Recipient,Hepatology patient,Post DD Tx,,no,
13,101282,NOUF SAUD MOHAMMED ALDAWSARI,1982-11-29,2025-12-14,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,Post LD Tx,,no,Post LD Tx
17,6364,MOHAMMED BEN ALI BIN SULAIMAN AL JAFAR,1982-08-20,2025-12-21,FOLLOW UP,STARTED,NORMAL BOOKING,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,Post LD Tx,,no,Post LD Tx
18,66898,AMNA AHMED ABDULGAFOR AL SHEIKH,1984-01-04,2025-12-10,FOLLOW UP,ARRIVED,NORMAL BOOKING,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,Post LD Tx,,no,Post LD Tx
19,208799,FARZANAH SAEED A ALI,1963-07-01,2025-12-15,FOLLOW UP,STARTED,NORMAL BOOKING,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,Post LD Tx,,no,Post LD Tx
20,220797,BASHAYER FAWAZ ALSAEGR,1992-04-17,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Pre Tx ready for transplant,,,no,Liver Donor
21,220503,ALHASSAN ABDULRAHIM ABDULLAH ALZAHRANI,1999-08-20,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Post Tx,,,no,Liver Donor
22,220468,SAAD SALEM M ALMAHRI,1986-03-17,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Post Tx,,,no,Liver Donor


In [220]:
# Distribution of the derived r_d values (rows with a missing r_d are not counted).
df['r_d'].value_counts()

r_d
Post LD Tx                                        911
Liver Recipient                                   450
Pre Tx on workups                                 266
Post living donor nephrectomy                     158
Liver Donor                                       122
Post DD Tx                                         98
Waiting for first appointment                      50
Listed on DDWL                                     40
Pre Tx Ready for committee                         33
Pre Tx Discussed & ready for donor nephrectomy     30
Nephrology patient                                 21
Initial visit done, No LD, suspended for DDWL      20
Pre Tx Discussed & Ready for LD Tx                 18
Listed on DDWL & Ready for LD Tx                   13
Pre Tx Discussed with pending issues                9
Pre Tx on hold                                      9
Listed on DDWL - Inactive                           3
Name: count, dtype: int64

In [221]:
# Placeholder columns; they are populated in the cells below.
for placeholder in ("patient_type", "transplant_date", "days"):
    df[placeholder] = None


In [222]:
# Check that the three placeholder columns were added.
df.head(1)

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor,need_review,r_d,patient_type,transplant_date,days
7,174153,AYSH ABDO HAMAD HADIDI,1955-02-23,2025-12-23,FOLLOW UP,STARTED,NORMAL BOOKING,Adult Post Renal Transplant,SUMAYAH ABDULAZIZ MOHAMMED ASKANDARANI,,,Post LD Tx,,no,Post LD Tx,,,


In [223]:
# First available status wins: liver_status, then kidney_recipient, then kidney_donor.
kidney_status = df["kidney_recipient"].combine_first(df["kidney_donor"])
df["patient_type"] = df["liver_status"].combine_first(kidney_status)

In [225]:
# If patient_type is NA, need_review should probably be set to "yes".

In [232]:
# Build MRN -> surgery date lookups from the three registries, then fill
# transplant_date from them.

def _surgery_date_lookup(registry, mrn_col, date_col):
    """Return a {MRN: surgery date} dict from the rows of ``registry`` that have an MRN."""
    with_mrn = registry.loc[registry[mrn_col].notna(), [mrn_col, date_col]]
    return with_mrn.set_index(mrn_col)[date_col].to_dict()


liver_dict = _surgery_date_lookup(liver_df, "r_mrn", "surgery_date")
recip_dict = _surgery_date_lookup(kidney_rec_df, "r_mrn", "surgery_date")
donor_dict = _surgery_date_lookup(kidney_donor_df, "d_mrn", "surgery_date_d")

# Priority order for transplant_date:
#   1. Liver
#   2. Kidney Recipient
#   3. Kidney Donor
# A later source is used only where every earlier one gave no date.
mrn_values = df["mrn"]
resolved_dates = mrn_values.map(liver_dict)
for fallback_dict in (recip_dict, donor_dict):
    resolved_dates = resolved_dates.combine_first(mrn_values.map(fallback_dict))
df["transplant_date"] = resolved_dates

# Optional: flag rows that still have no transplant date
# (uncomment once review flagging is implemented)
# df.loc[df['transplant_date'].isna(), 'need_review'] = 'yes'

print("transplant_date column updated.")
print(f"Number of rows with transplant_date filled: {df['transplant_date'].notna().sum()}")

transplant_date column updated.
Number of rows with transplant_date filled: 1577


In [240]:
# Rows 50-99 (by position) of the visits that have no transplant date.
df.loc[df["transplant_date"].isna()].iloc[50:100]

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor,need_review,r_d,patient_type,transplant_date,days
652,221596,MASHAIL IBRAHIM I ALQATARI,1993-03-16,2025-12-31,NEW,STARTED,NORMAL BOOKING,Adult Pre Liver Transplant,HADI MOHAMMED SHOEI KURIRY,Recipient,New patient,,,no,Liver Recipient,New patient,NaT,
695,3497,HUWAYDA AHMED MOHAMMED FLATA,1973-04-14,2025-12-01,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,Recipient,OPD Hepatology patient,,,no,Liver Recipient,OPD Hepatology patient,NaT,
697,220938,ABDELRAHMAN ALAAULDEEN AHMED SAYED MOHAMMED,2014-06-16,2025-12-03,Walk-In,STARTED,WALK-IN,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,Recipient,New patient,,,no,Liver Recipient,New patient,NaT,
698,115829,ZAHRA MOHAMMED AHMED ALWABARI,2016-05-09,2025-12-01,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,Nephrology patient,,no,Nephrology patient,Nephrology patient,NaT,
714,168247,SEDRA FARAJ MOHAMMED ALANAZI,2021-01-26,2025-12-15,NEW,STARTED,NORMAL BOOKING,Clinical Nutrition,Fatimah Mansoor Al Faraj,,,Nephrology patient,,no,Nephrology patient,Nephrology patient,NaT,
774,165482,IBRAHIM FAHAD IBRAHIM AL BAHAR,2021-06-16,2025-12-24,NEW,STARTED,NORMAL BOOKING,Clinical Nutrition,Marwah Abdullah Al-Mubarak,Recipient,New patient,,,no,Liver Recipient,New patient,NaT,
859,193437,MARIAM MOHAMMED EID ALRASHIDI,1999-08-16,2025-12-15,NEW,STARTED,NORMAL BOOKING,Clinical Nutrition,Fatimah Abdullah AlIbrahim,Recipient,Hepatology patient,,,no,Liver Recipient,Hepatology patient,NaT,
861,209234,FUWZIA ABDULLAH HUSAIN ALSHABIB,1957-01-31,2025-12-15,NEW,STARTED,NORMAL BOOKING,Clinical Nutrition,Fatimah Abdullah AlIbrahim,Recipient,New patient,Waiting for first appointment,,no,,New patient,NaT,
877,215541,ALAA AHMED M ALARFAJ,1971-08-21,2025-12-28,NEW,STARTED,NORMAL BOOKING,Clinical Nutrition,Fatimah Abdullah AlIbrahim,Recipient,Pre Tx on workups,,,no,Liver Recipient,Pre Tx on workups,NaT,
881,30976,MOHAMMED SAEED MOHAMMED ALZAHRANI,1963-11-17,2025-12-28,NEW,STARTED,NORMAL BOOKING,Clinical Nutrition,Fatimah Abdullah AlIbrahim,Recipient,OPD Hepatology patient,,,no,Liver Recipient,OPD Hepatology patient,NaT,


In [241]:
# Classify each visit as "pre" or "post" transplant:
#   - transplant_date missing                        -> "pre"
#   - visit_date - transplant_date negative or zero  -> "pre"
#   - visit_date - transplant_date positive          -> "post"
#   - transplant_date known but visit date missing   -> left empty, because the
#     difference cannot be computed. Without this rule such rows match none of
#     the rules above and silently keep the registry status text.

# Ensure both columns are datetime (safe, coercive)
visit_dt = pd.to_datetime(df["visit_datetime"], errors="coerce")
tx_dt = pd.to_datetime(df["transplant_date"], errors="coerce")

# Day difference; NaN whenever either date is missing
df["days"] = (visit_dt - tx_dt).dt.days

has_tx = tx_dt.notna()
has_days = df["days"].notna()

# Overwrite patient_type based on the rules
df.loc[~has_tx, "patient_type"] = "pre"
df.loc[has_days & (df["days"] <= 0), "patient_type"] = "pre"
df.loc[has_days & (df["days"] > 0), "patient_type"] = "post"

# NOTE(review): these rows are candidates for need_review = "yes" - confirm the
# intended handling with the data owners.
df.loc[has_tx & ~has_days, "patient_type"] = None
