# Excel Quality Pipeline – Discovery Notebook

## Objective
Build a reproducible data pipeline that produces a single cleaned
quality dataset from multiple Excel inputs.

This notebook is for exploration and validation only.
Final logic will be migrated into the `src/` module.


In [1]:
import pandas as pd
pd.__version__


'2.3.3'

In [2]:
# Load the raw OPD visits export (first worksheet only) and report its
# dimensions as (rows, columns).
file_path = "../data/raw/PATIENT OPD VISITS- from MCC & Email.xlsx"
df = pd.read_excel(file_path, sheet_name=0)
df.shape

(4204, 18)

In [3]:
# Capture the loaded column names as a plain Python list.
header = df.columns.tolist()

In [4]:
# Show the column names as loaded, before any renaming.
print(header)

['MRN', 'PatEngName', 'pat_birthdate', 'regtime', 'VISIT_DATETIME', 'ARRIVETIME', 'STARTTIME', 'VISIT_TYPE', 'PORTAL STATUS', 'VISIT STATUS', 'BOOKING_TYPE', 'clinic_key', 'CLINIC_NAME', 'MRP', 'JOB TITLE', 'SPECIALTY', 'Added_by', 'addtime']


In [5]:
# Per-column dtype and non-null count, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4204 entries, 0 to 4203
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   MRN             4204 non-null   int64         
 1   PatEngName      4094 non-null   object        
 2   pat_birthdate   4047 non-null   datetime64[ns]
 3   regtime         4204 non-null   datetime64[ns]
 4   VISIT_DATETIME  4047 non-null   datetime64[ns]
 5   ARRIVETIME      3314 non-null   datetime64[ns]
 6   STARTTIME       3208 non-null   datetime64[ns]
 7   VISIT_TYPE      4204 non-null   object        
 8   PORTAL STATUS   4047 non-null   object        
 9   VISIT STATUS    4047 non-null   object        
 10  BOOKING_TYPE    4047 non-null   object        
 11  clinic_key      4047 non-null   float64       
 12  CLINIC_NAME     4204 non-null   object        
 13  MRP             4204 non-null   object        
 14  JOB TITLE       4047 non-null   object        
 15  SPEC

In [6]:
# Drop unneeded columns
cols_to_drop = [
    "regtime", "ARRIVETIME", "STARTTIME",
    "PORTAL STATUS", "clinic_key", "JOB TITLE",
    "SPECIALTY", "Added_by", "addtime",
]

df = df.drop(cols_to_drop, axis="columns")


In [7]:
# Confirm which columns remain after the drop.
df.columns

Index(['MRN', 'PatEngName', 'pat_birthdate', 'VISIT_DATETIME', 'VISIT_TYPE',
       'VISIT STATUS', 'BOOKING_TYPE', 'CLINIC_NAME', 'MRP'],
      dtype='object')

In [8]:
# Normalize column names: trim surrounding whitespace, lower-case,
# and turn spaces into underscores.
df.columns = [name.strip().lower().replace(" ", "_") for name in df.columns]
df.columns

Index(['mrn', 'patengname', 'pat_birthdate', 'visit_datetime', 'visit_type',
       'visit_status', 'booking_type', 'clinic_name', 'mrp'],
      dtype='object')

In [9]:
# Check the dtype of the visit timestamp column before converting it.
df['visit_datetime'].dtype

dtype('<M8[ns]')

In [10]:
# Keep only the calendar date of each visit.
# Values are parsed as datetimes (anything unparseable becomes NaT) and then
# converted to Python `datetime.date` objects, so the column is no longer
# datetime64 afterwards. No DD/MM/YYYY string formatting is applied here.

df["visit_datetime"] = pd.to_datetime(df["visit_datetime"], errors="coerce").dt.date


In [11]:
# Inspect both the column dtype and the Python type of one stored value.
# Only the last expression of a cell is echoed, so the two checks are
# returned together as a tuple instead of on separate lines (where the
# first result would be computed and silently discarded).
df["visit_datetime"].dtype, type(df.loc[0, "visit_datetime"])


datetime.date

In [12]:
# Add empty placeholder columns for the registry-derived fields.
# Each one starts out as None for every row.

new_cols = ["liver_r_d", "liver_status", "kidney_recipient", "kidney_donor"]

df = df.assign(**dict.fromkeys(new_cols))

df.columns

Index(['mrn', 'patengname', 'pat_birthdate', 'visit_datetime', 'visit_type',
       'visit_status', 'booking_type', 'clinic_name', 'mrp', 'liver_r_d',
       'liver_status', 'kidney_recipient', 'kidney_donor'],
      dtype='object')

In [13]:
# Preview the first rows, including the new placeholder columns.
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
0,218455,NORAH BAHIAS ALI ZARAB,1987-04-16,2025-12-23,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
1,118631,ZAHRA MAHDI ALI ALAWAZIM,2003-02-24,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
2,219640,SULTANAH HAMAD ABDULLAH ALHASSAN,1959-01-10,2025-12-08,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
3,220961,SEHAM MOHAMMED KHAMEES ALHASSAN,1969-06-03,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
4,220533,ABDULAZIZ ADNAN SALEH ALHADAD,1978-09-30,2025-12-21,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,


In [14]:
# Count visits per status (missing statuses are not counted by value_counts).
df['visit_status'].value_counts()

visit_status
STARTED    3208
NO SHOW     733
ARRIVED     106
Name: count, dtype: int64

In [15]:
# Exclude visits whose status is exactly "NO SHOW".
# Rows with a missing visit_status do not equal "NO SHOW" and are kept.

df = df.loc[df["visit_status"].ne("NO SHOW")]


In [16]:
# Re-count visits per status to confirm no NO SHOW rows remain.
df['visit_status'].value_counts()

visit_status
STARTED    3208
ARRIVED     106
Name: count, dtype: int64

In [17]:
# Fill the liver columns using the Liver Registry (ONLY).
# Plan, for each MRN in df:
#   - look it up in the Liver Registry
#   - if found:     Rec_Don         -> liver_r_d
#                   Current Status1 -> liver_status
#   - if not found: leave as NaN
#
# This cell only loads the registry (first worksheet) and normalizes its
# headers: trimmed, lower-cased, spaces replaced by underscores.

liver_df = pd.read_excel("../data/raw/Liver Registry - 1 JAN.xlsx", sheet_name=0)
liver_df.columns = (
    liver_df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
)



In [19]:
# List the normalized liver registry column names.
liver_df.columns.tolist()



['r_mrn',
 'id_number',
 'patient_name',
 'mobile1_0',
 'payment_0',
 'rec_don',
 'nationality',
 'city_of_residence',
 'specify_other',
 'age_group',
 'age',
 'blood_group',
 'gender',
 'date_open_file_0',
 'current_status1',
 'patient_type_1',
 'initial_tx_assessment_date',
 'workup_start_date',
 'workup_completed_date',
 'date_added_to_waiting_list',
 'surgery_date',
 'tx_type',
 'transplant_place',
 'comment_0',
 'closefile_r',
 'closefiledate_r',
 'closefilereason_r',
 'assigned_consultant_0',
 'assigned_surgeon_r',
 'primary_disease']

In [22]:
# Attach the liver registry fields to every visit by MRN.
# Left join: visits without a registry entry are kept, with NaN in the
# registry columns.
liver_merged = df.merge(
    liver_df[["r_mrn", "rec_don", "current_status1"]],
    left_on="mrn",
    right_on="r_mrn",
    how="left",
)

# A left join repeats a visit once per matching registry row, so a repeated
# MRN in the registry would silently inflate the number of visits. Stop here
# rather than carry duplicated visits forward; df is only rebound on success.
assert len(liver_merged) == len(df), (
    f"Liver registry merge added {len(liver_merged) - len(df)} visit rows; "
    "check the registry for duplicate r_mrn values"
)

df = liver_merged


In [25]:
# Copy the merged registry values into the liver placeholder columns,
# then look at the last rows.
df = df.assign(
    liver_r_d=df["rec_don"],
    liver_status=df["current_status1"],
)
df.tail()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor,r_mrn,rec_don,current_status1
3466,203844,Abdullah,NaT,NaT,Follow up,,,Psychology,Alia Al Enazi,Recipient,Post Tx,,,203844.0,Recipient,Post Tx
3467,221651,MOHAMMED,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,,,,
3468,221730,Abdulqawi,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,,,,
3469,40991,SHATHA,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,,,,
3470,107061,ABDULLAH,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,,,,


In [26]:
# Remove the registry helper columns now that their values have been copied.
df = df.drop(["r_mrn", "rec_don", "current_status1"], axis="columns")


In [30]:
# Show the visits for which liver_r_d is filled in.
df.loc[df["liver_r_d"].notna()]


Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
12,104431,SAIF RAJA H ALHAJRI,1976-06-28,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,Recipient,Hepatology patient,,
20,220797,BASHAYER FAWAZ ALSAEGR,1992-04-17,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Pre Tx ready for transplant,,
21,220503,ALHASSAN ABDULRAHIM ABDULLAH ALZAHRANI,1999-08-20,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Post Tx,,
22,220468,SAAD SALEM M ALMAHRI,1986-03-17,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Post Tx,,
23,220993,IBRAHIM ABDULKARIM IBRAHIM ALHABIB,1993-09-13,2025-12-10,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,Donor,Pre Tx ready for transplant,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3454,221422,MESHARI,NaT,NaT,New,,,Psychology,Alia Al Enazi,Donor,Pre Tx on workups,,
3455,221756,FAHAD,NaT,NaT,New,,,Psychology,Alia Al Enazi,Donor,Pre Tx on workups,,
3456,218992,HISSAH,NaT,NaT,New,,,Psychology,Alia Al Enazi,Recipient,Pre Tx on workups,,
3461,221468,WADI,NaT,NaT,New,,,Psychology,Alia Al Enazi,Donor,Pre Tx on workups,,


In [33]:
# Fill kidney_recipient using the Kidney Recipient registry (ONLY).
# Plan, for each mrn in the main df:
#   - look it up in "Kidney Recipient - 1 JAN.xlsx"
#   - if found:     Current Status1 -> kidney_recipient
#   - if not found: leave as NaN
#
# This cell only loads the registry (first worksheet) and normalizes its
# headers: trimmed, lower-cased, spaces replaced by underscores.

kidney_rec_df = pd.read_excel("../data/raw/Kidney Recipient - 1 JAN.xlsx", sheet_name=0)
kidney_rec_df.columns = (
    kidney_rec_df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
)


In [34]:
# Attach the kidney recipient status to every visit by MRN.
# Left join: visits without a registry entry are kept, with NaN in the
# registry columns.
kidney_rec_merged = df.merge(
    kidney_rec_df[["r_mrn", "current_status1"]],
    left_on="mrn",
    right_on="r_mrn",
    how="left",
)

# A left join repeats a visit once per matching registry row, so a repeated
# MRN in the registry would silently inflate the number of visits. Stop here
# rather than carry duplicated visits forward; df is only rebound on success.
assert len(kidney_rec_merged) == len(df), (
    f"Kidney recipient merge added {len(kidney_rec_merged) - len(df)} visit rows; "
    "check the registry for duplicate r_mrn values"
)

df = kidney_rec_merged
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor,r_mrn,current_status1
0,218455,NORAH BAHIAS ALI ZARAB,1987-04-16,2025-12-23,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,
1,118631,ZAHRA MAHDI ALI ALAWAZIM,2003-02-24,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,
2,219640,SULTANAH HAMAD ABDULLAH ALHASSAN,1959-01-10,2025-12-08,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,
3,220961,SEHAM MOHAMMED KHAMEES ALHASSAN,1969-06-03,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,
4,220533,ABDULAZIZ ADNAN SALEH ALHADAD,1978-09-30,2025-12-21,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,,,


In [35]:
# Move the merged status into kidney_recipient (pop removes the helper
# column as it returns it), then discard the registry key column.
df["kidney_recipient"] = df.pop("current_status1")
df = df.drop(columns="r_mrn")



In [39]:
# Inspect the last rows after filling kidney_recipient.
df.tail()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
3466,203844,Abdullah,NaT,NaT,Follow up,,,Psychology,Alia Al Enazi,Recipient,Post Tx,Nephrology patient,
3467,221651,MOHAMMED,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,
3468,221730,Abdulqawi,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,
3469,40991,SHATHA,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,,
3470,107061,ABDULLAH,NaT,NaT,New,,,Psychology,Alia Al Enazi,,,Post LD Tx,


In [40]:
# Fill kidney_donor from the Kidney Donor registry.
# For each mrn in the main df:
#   - match it to d_mrn in "Kidney Donor - 1 JAN.xlsx"
#   - copy current_status1_d -> kidney_donor
#   - if there is no match, leave NaN

kidney_donor_df = pd.read_excel("../data/raw/Kidney Donor - 1 JAN.xlsx")

# Normalize headers: trimmed, lower-cased, spaces replaced by underscores.
kidney_donor_df.columns = (
    kidney_donor_df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
)

# Left join keeps every visit; unmatched visits get NaN in the donor columns.
kidney_donor_merged = df.merge(
    kidney_donor_df[["d_mrn", "current_status1_d"]],
    left_on="mrn",
    right_on="d_mrn",
    how="left",
)

# A left join repeats a visit once per matching registry row, so a repeated
# MRN in the registry would silently inflate the number of visits. Stop here
# rather than carry duplicated visits forward; df is only rebound on success.
assert len(kidney_donor_merged) == len(df), (
    f"Kidney donor merge added {len(kidney_donor_merged) - len(df)} visit rows; "
    "check the registry for duplicate d_mrn values"
)

kidney_donor_merged["kidney_donor"] = kidney_donor_merged["current_status1_d"]

df = kidney_donor_merged.drop(columns=["d_mrn", "current_status1_d"])


In [45]:
# Preview the first rows after filling kidney_donor.
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
0,218455,NORAH BAHIAS ALI ZARAB,1987-04-16,2025-12-23,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
1,118631,ZAHRA MAHDI ALI ALAWAZIM,2003-02-24,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
2,219640,SULTANAH HAMAD ABDULLAH ALHASSAN,1959-01-10,2025-12-08,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
3,220961,SEHAM MOHAMMED KHAMEES ALHASSAN,1969-06-03,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
4,220533,ABDULAZIZ ADNAN SALEH ALHADAD,1978-09-30,2025-12-21,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,


In [46]:
# Count visits per clinic.
df['clinic_name'].value_counts()

clinic_name
Anesthesia                                     840
Clinical Nutrition                             556
Adult Post Renal Transplant                    500
TRANSPLANT SURGERY                             458
Adult Post Kidney Transplant Virtual Clinic    264
Adult Pre Renal Transplant                     133
ADULT POST LIVER TRANSPLANT                    131
Social Work                                    110
Pediatric Post Renal Transplant                102
Adult Pre Liver Transplant                      97
Pediatric Liver Transplant                      66
Transplant Dressing (procedure)                 54
Psychology                                      47
Adult Post Liver Transplant Virtual             42
Adult Pre Renal Transplant Virtual Clinic       37
Pre anesthesia liver transplant                 15
Pediatric Pre Renal Transplant                  11
Adult Pre Liver Transplant Virtual               5
Transplant Cystoscopy Clinic                     3
Name: count, dtype:

In [47]:
# Remove visits based on clinic + transplant status (ONLY).
#
# A row is removed only when ALL of the following hold:
#   1. clinic_name is "Anesthesia" or "Clinical Nutrition"
#   2. liver_r_d is NaN
#   3. kidney_recipient is NaN
#   4. kidney_donor is NaN

mask = (
    df["clinic_name"].isin(("Anesthesia", "Clinical Nutrition"))
    & df[["liver_r_d", "kidney_recipient", "kidney_donor"]].isna().all(axis=1)
)

df = df.loc[~mask]


In [49]:
# Sanity check: count Anesthesia / Clinical Nutrition visits that still have
# no value in any of the three registry columns.
(
    df.loc[
        df["clinic_name"].isin(["Anesthesia", "Clinical Nutrition"]),
        ["liver_r_d", "kidney_recipient", "kidney_donor"],
    ]
    .isna()
    .all(axis=1)
    .sum()
)


np.int64(0)

In [50]:
# Preview the first remaining rows (the index keeps its pre-filter labels).
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
7,174153,AYSH ABDO HAMAD HADIDI,1955-02-23,2025-12-23,FOLLOW UP,STARTED,NORMAL BOOKING,Adult Post Renal Transplant,SUMAYAH ABDULAZIZ MOHAMMED ASKANDARANI,,,Post LD Tx,
9,77521,AMAL SULAIMAN OUDAH ALHAWITI,1986-09-28,2025-12-17,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,Post DD Tx,
12,104431,SAIF RAJA H ALHAJRI,1976-06-28,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,Recipient,Hepatology patient,Post DD Tx,
13,101282,NOUF SAUD MOHAMMED ALDAWSARI,1982-11-29,2025-12-14,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,Post LD Tx,
17,6364,MOHAMMED BEN ALI BIN SULAIMAN AL JAFAR,1982-08-20,2025-12-21,FOLLOW UP,STARTED,NORMAL BOOKING,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,Post LD Tx,


In [58]:
# Show the visits with no value in any of the three registry columns.
df.loc[df[["liver_r_d", "kidney_recipient", "kidney_donor"]].isna().all(axis=1)]

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
530,13459,SHAIKHAH MOHAMMED A ALSAMHAN,1940-08-04,2025-12-15,NEW,STARTED,NORMAL BOOKING,Adult Pre Liver Transplant,Eyad Mohammad Osman Elhassan Gadour,,,,
1166,221813,MOHAMMAD MOQBIL ABDULLAH ALQARAWI,1959-01-10,2025-12-28,NEW,STARTED,NORMAL BOOKING,Adult Pre Liver Transplant,HADI MOHAMMED SHOEI KURIRY,,,,
1169,218567,REEM ABDULHAKIM HAMAD ALKHALDI,2010-07-14,2025-12-02,Walk-In,STARTED,WALK-IN,Pediatric Liver Transplant,Razan Monther Abdulmajied Bader,,,,
2335,8909,BANEEN ABDULLAH MOHAMMED ALSAIHATI,1986-05-17,2025-12-29,FOLLOW UP,STARTED,NORMAL BOOKING,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,,
2744,200160,AHMED MOHAMED ALY AHMED ZIDAN,1981-07-25,2025-12-31,CHART CHECK,STARTED,WALK-IN,TRANSPLANT SURGERY,AHMED MOHAMED ALY AHMED ZIDAN,,,,
2756,6097,REHAB AHMED GABER ABDALL,1978-06-10,2025-12-28,CHART CHECK,STARTED,WALK-IN,TRANSPLANT SURGERY,AHMED MOHAMED ALY AHMED ZIDAN,,,,
3018,220426,SALMAN OMAR JAFAR ALOTAIBI,1986-09-14,2025-12-09,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,Mohammed Saad Ali AL-Qahtani,,,,
3041,200362,NISY MATHEWS,1998-08-15,2025-12-16,Walk-In,STARTED,WALK-IN,Adult Pre Renal Transplant,KHALID AHMEDH BELEED AKKARI,,,,
3170,210421,Devika Reghu,1996-05-04,2025-12-29,Walk-In,STARTED,WALK-IN,TRANSPLANT SURGERY,MAHMOUD ALI MAHMOUD OBEID,,,,


In [60]:
# Add need_review flag: "yes" for visits with no value in any of the three
# registry columns, "no" for every other visit.
review_mask = df[["liver_r_d", "kidney_recipient", "kidney_donor"]].isna().all(axis=1)

df["need_review"] = pd.Series("no", index=df.index).mask(review_mask, "yes")
