# Excel Quality Pipeline – Discovery Notebook

## Objective
Build a reproducible data pipeline that produces a single cleaned
quality dataset from multiple Excel inputs.

This notebook is for exploration and validation only.
Final logic will be migrated into the `src/` module.


In [50]:
import pandas as pd
pd.__version__


'2.3.3'

In [51]:
# Read the raw OPD visits workbook into a DataFrame and show (rows, columns).
file_path = "../data/raw/PATIENT OPD VISITS- from MCC & Email.xlsx"
df = pd.read_excel(io=file_path)
df.shape

(4204, 18)

In [52]:
# Keep the raw column labels as a plain Python list for reference.
header = list(df.columns)

In [53]:
# Show the raw column names captured in `header`.
print(header)

['MRN', 'PatEngName', 'pat_birthdate', 'regtime', 'VISIT_DATETIME', 'ARRIVETIME', 'STARTTIME', 'VISIT_TYPE', 'PORTAL STATUS', 'VISIT STATUS', 'BOOKING_TYPE', 'clinic_key', 'CLINIC_NAME', 'MRP', 'JOB TITLE', 'SPECIALTY', 'Added_by', 'addtime']


In [54]:
# Summarise per-column non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4204 entries, 0 to 4203
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   MRN             4204 non-null   int64         
 1   PatEngName      4094 non-null   object        
 2   pat_birthdate   4047 non-null   datetime64[ns]
 3   regtime         4204 non-null   datetime64[ns]
 4   VISIT_DATETIME  4047 non-null   datetime64[ns]
 5   ARRIVETIME      3314 non-null   datetime64[ns]
 6   STARTTIME       3208 non-null   datetime64[ns]
 7   VISIT_TYPE      4204 non-null   object        
 8   PORTAL STATUS   4047 non-null   object        
 9   VISIT STATUS    4047 non-null   object        
 10  BOOKING_TYPE    4047 non-null   object        
 11  clinic_key      4047 non-null   float64       
 12  CLINIC_NAME     4204 non-null   object        
 13  MRP             4204 non-null   object        
 14  JOB TITLE       4047 non-null   object        
 15  SPEC

In [55]:
# Discard the columns that are not carried forward in df.
cols_to_drop = [
    "regtime", "ARRIVETIME", "STARTTIME",
    "PORTAL STATUS", "clinic_key", "JOB TITLE",
    "SPECIALTY", "Added_by", "addtime",
]

# drop() returns a new frame; rebind df to the trimmed result.
df = df.drop(cols_to_drop, axis="columns")


In [56]:
# Display the column labels currently on df.
df.columns

Index(['MRN', 'PatEngName', 'pat_birthdate', 'VISIT_DATETIME', 'VISIT_TYPE',
       'VISIT STATUS', 'BOOKING_TYPE', 'CLINIC_NAME', 'MRP'],
      dtype='object')

In [57]:
# Standardise column labels: trim surrounding whitespace, lower-case them,
# then swap spaces for underscores (e.g. "VISIT STATUS" -> "visit_status").
trimmed_labels = df.columns.str.strip()
lowered_labels = trimmed_labels.str.lower()
df.columns = lowered_labels.str.replace(" ", "_")
df.columns

Index(['mrn', 'patengname', 'pat_birthdate', 'visit_datetime', 'visit_type',
       'visit_status', 'booking_type', 'clinic_name', 'mrp'],
      dtype='object')

In [58]:
# Check the current dtype of the visit_datetime column.
df['visit_datetime'].dtype

dtype('<M8[ns]')

In [59]:
# Reduce visit_datetime to calendar dates (the time of day is dropped).
# errors="coerce" turns unparseable values into NaT instead of raising.
# Note: .dt.date yields Python datetime.date objects, not a formatted string.
visit_ts = pd.to_datetime(df["visit_datetime"], errors="coerce")
df["visit_datetime"] = visit_ts.dt.date


In [60]:
# Inspect both the column dtype and the Python type of the first value.
# A notebook cell only displays its last expression, so the two checks are
# returned together as a tuple; on separate lines the dtype would be
# evaluated and silently discarded.
df["visit_datetime"].dtype, type(df.loc[0, "visit_datetime"])


datetime.date

In [61]:
# Append empty placeholder columns; every row starts out as None.
new_cols = ["liver_r_d", "liver_status", "kidney_recipient", "kidney_donor"]

for placeholder in new_cols:
    df[placeholder] = None

df.columns

Index(['mrn', 'patengname', 'pat_birthdate', 'visit_datetime', 'visit_type',
       'visit_status', 'booking_type', 'clinic_name', 'mrp', 'liver_r_d',
       'liver_status', 'kidney_recipient', 'kidney_donor'],
      dtype='object')

In [62]:
# Preview the first five rows of the frame.
# NOTE(review): the rendered output includes patient names and MRNs —
# clear or redact this cell's output before sharing or committing.
df.head()

Unnamed: 0,mrn,patengname,pat_birthdate,visit_datetime,visit_type,visit_status,booking_type,clinic_name,mrp,liver_r_d,liver_status,kidney_recipient,kidney_donor
0,218455,NORAH BAHIAS ALI ZARAB,1987-04-16,2025-12-23,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
1,118631,ZAHRA MAHDI ALI ALAWAZIM,2003-02-24,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
2,219640,SULTANAH HAMAD ABDULLAH ALHASSAN,1959-01-10,2025-12-08,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
3,220961,SEHAM MOHAMMED KHAMEES ALHASSAN,1969-06-03,2025-12-10,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,
4,220533,ABDULAZIZ ADNAN SALEH ALHADAD,1978-09-30,2025-12-21,NEW,STARTED,NORMAL BOOKING,Anesthesia,MOHAMED ESSAMELDIN HASSAN ABDELMEGUID,,,,


In [66]:
# Tally visits per status (value_counts leaves out missing values by default).
df['visit_status'].value_counts()

visit_status
STARTED    3208
NO SHOW     733
ARRIVED     106
Name: count, dtype: int64

In [73]:
# Remove visits whose visit_status is "NO SHOW".
# Rows with a missing visit_status are kept, because NaN != "NO SHOW"
# evaluates to True. NOTE(review): confirm those rows should be retained.
keep_mask = df["visit_status"] != "NO SHOW"

# .copy() detaches the filtered frame from the original so that later column
# assignments on df do not raise SettingWithCopyWarning.
df = df.loc[keep_mask].copy()


In [74]:
# Re-tally the statuses to check which values are still present in df.
df['visit_status'].value_counts()

visit_status
STARTED    3208
ARRIVED     106
Name: count, dtype: int64