In [1]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load dataset 
df = pd.read_csv("../data/processed/nhs_no_show_enhanced_synthetic.csv")
df


Unnamed: 0,IMD_Decile,Ethnicity,Age_Group,Gender,Medical_Specialty,Consultation_Type,Appointment_Type,Base_NoShow_Prob,NoShow_Prob_Final,Previous_Appointments,Previous_NoShows,NoShow
0,More deprived 30-40%,Pakistani (Asian or Asian British),45-49,Female,Trauma and Orthopaedics,Face-to-Face,Subsequent,0.01,0.029,2,0,No
1,Least deprived 10%,British (White),90-120,Female,Allied Health Professional,Face-to-Face,Subsequent,0.01,0.011,5,0,No
2,Less deprived 20-30%,British (White),65-69,Female,Cardiology,Face-to-Face,Subsequent,0.01,0.021,4,0,No
3,Less deprived 30-40%,British (White),10-14,Male,Paediatrics,Face-to-Face,First,0.01,0.099,5,0,No
4,More deprived 10-20%,British (White),85-89,Female,General Surgery,Face-to-Face,Subsequent,0.01,0.013,4,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...
249995,More deprived 30-40%,Any other White background,85-89,Male,Ophthalmology,Face-to-Face,Subsequent,0.01,0.023,2,0,No
249996,Most deprived 10%,British (White),17,Male,Urology,Face-to-Face,Subsequent,0.01,0.053,6,0,No
249997,More deprived 10-20%,British (White),60-64,Female,Gynaecology,Face-to-Face,Subsequent,0.01,0.023,4,0,No
249998,Most deprived 10%,Chinese (other ethnic group),50-54,Female,Nursing,Face-to-Face,Subsequent,0.01,0.027,5,0,No


In [2]:
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   IMD_Decile             250000 non-null  object 
 1   Ethnicity              250000 non-null  object 
 2   Age_Group              250000 non-null  object 
 3   Gender                 250000 non-null  object 
 4   Medical_Specialty      250000 non-null  object 
 5   Consultation_Type      250000 non-null  object 
 6   Appointment_Type       250000 non-null  object 
 7   Base_NoShow_Prob       250000 non-null  float64
 8   NoShow_Prob_Final      250000 non-null  float64
 9   Previous_Appointments  250000 non-null  int64  
 10  Previous_NoShows       250000 non-null  int64  
 11  NoShow                 250000 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 22.9+ MB


Unnamed: 0,Base_NoShow_Prob,NoShow_Prob_Final,Previous_Appointments,Previous_NoShows
count,250000.0,250000.0,250000.0,250000.0
mean,0.01,0.039792,3.4957,0.138812
std,3.259378e-14,0.022651,1.872326,0.38082
min,0.01,0.01,0.0,0.0
25%,0.01,0.023,2.0,0.0
50%,0.01,0.033,3.0,0.0
75%,0.01,0.059,5.0,0.0
max,0.01,0.105,17.0,5.0


In [3]:
# droping duplicate data
print("*"*30)
print("Duplicates before:", df.duplicated().sum())
df = df.drop_duplicates().reset_index(drop=True)
print("Duplicates after:", df.duplicated().sum())
print("*"*30)

print(df.isnull().sum())


******************************
Duplicates before: 63476
Duplicates after: 0
******************************
IMD_Decile               0
Ethnicity                0
Age_Group                0
Gender                   0
Medical_Specialty        0
Consultation_Type        0
Appointment_Type         0
Base_NoShow_Prob         0
NoShow_Prob_Final        0
Previous_Appointments    0
Previous_NoShows         0
NoShow                   0
dtype: int64


In [4]:
print(df['NoShow'].unique())
print(df['Ethnicity'].unique())
print(df['Age_Group'].unique())
print(df['Gender'].unique())
print(df['Medical_Specialty'].unique())
print(df['Consultation_Type'].unique())
print(df['Appointment_Type'].unique())
print(df['Previous_Appointments'].unique())
print(df['Previous_NoShows'].unique())


['No' 'Yes']
['Pakistani (Asian or Asian British)' 'British (White)'
 'Indian (Asian or Asian British)' 'Caribbean (Black or Black British)'
 'African (Black or Black British)' 'Any other Black background'
 'Not Known' 'Any other White background' 'Any other Mixed background'
 'Irish (White)' 'Any other ethnic group' 'Any other Asian background'
 'Bangladeshi (Asian or Asian British)' 'White and Asian (Mixed)'
 'White and Black Caribbean (Mixed)' 'White and Black African (Mixed)'
 'Chinese (other ethnic group)']
['45-49' '90-120' '65-69' '10-14' '85-89' '5-9' '25-29' '35-39' '80-84'
 '40-44' '30-34' '55-59' '60-64' '70-74' '75-79' '1-4' '50-54' '0' '20-24'
 '18' 'Unknown' '17' '15' '16']
['Female' 'Male']
['Trauma and Orthopaedics' 'Allied Health Professional' 'Cardiology'
 'Paediatrics' 'General Surgery' 'Medical Oncology' 'Dermatology'
 'Paediatric Dentistry' 'Adult Mental Illness' 'Ear Nose and Throat'
 'Nursing' 'Gynaecology' 'Gastroenterology' 'Radiology' 'Anaesthetics'
 'Obstetri

In [5]:
# Dropping Base_NoShow_Prob and NoShow_Prob_Final because they are derived from other features. 
# Including them could give the model a hint and lead to overfitting.

df.drop(columns = ["Base_NoShow_Prob" , "NoShow_Prob_Final"] , inplace=True)
df

Unnamed: 0,IMD_Decile,Ethnicity,Age_Group,Gender,Medical_Specialty,Consultation_Type,Appointment_Type,Previous_Appointments,Previous_NoShows,NoShow
0,More deprived 30-40%,Pakistani (Asian or Asian British),45-49,Female,Trauma and Orthopaedics,Face-to-Face,Subsequent,2,0,No
1,Least deprived 10%,British (White),90-120,Female,Allied Health Professional,Face-to-Face,Subsequent,5,0,No
2,Less deprived 20-30%,British (White),65-69,Female,Cardiology,Face-to-Face,Subsequent,4,0,No
3,Less deprived 30-40%,British (White),10-14,Male,Paediatrics,Face-to-Face,First,5,0,No
4,More deprived 10-20%,British (White),85-89,Female,General Surgery,Face-to-Face,Subsequent,4,0,No
...,...,...,...,...,...,...,...,...,...,...
186519,Unknown/Non-UK Country,British (White),90-120,Male,Plastic Surgery,Face-to-Face,Subsequent,3,0,No
186520,Most deprived 10%,Any other ethnic group,60-64,Female,Ophthalmology,Face-to-Face,First,3,0,No
186521,Most deprived 10%,British (White),17,Male,Urology,Face-to-Face,Subsequent,6,0,No
186522,Most deprived 10%,Chinese (other ethnic group),50-54,Female,Nursing,Face-to-Face,Subsequent,5,0,No


In [6]:
# Creating New feature
df["NoShowRate"] = df["Previous_NoShows"] / np.where(df["Previous_Appointments"] == 0, 1, df["Previous_Appointments"])

df["NoShowRate"] = df["NoShowRate"].round(3)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186524 entries, 0 to 186523
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   IMD_Decile             186524 non-null  object 
 1   Ethnicity              186524 non-null  object 
 2   Age_Group              186524 non-null  object 
 3   Gender                 186524 non-null  object 
 4   Medical_Specialty      186524 non-null  object 
 5   Consultation_Type      186524 non-null  object 
 6   Appointment_Type       186524 non-null  object 
 7   Previous_Appointments  186524 non-null  int64  
 8   Previous_NoShows       186524 non-null  int64  
 9   NoShow                 186524 non-null  object 
 10  NoShowRate             186524 non-null  float64
dtypes: float64(1), int64(2), object(8)
memory usage: 15.7+ MB


In [7]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# NoShow currently "Yes"/"No" — convert to 1/0
df['NoShow'] = df['NoShow'].map({'Yes': 1, 'No': 0})
df['NoShow'] = df['NoShow'].astype(int)

# Simple label encoding
le_imd = LabelEncoder()
df['IMD_encoded'] = le_imd.fit_transform(df['IMD_Decile'].astype(str))

# Replacing each specialty with its relative frequency (0..1)
specialty_counts = df['Medical_Specialty'].value_counts(normalize=True)
df['Specialty_Freq'] = df['Medical_Specialty'].map(specialty_counts)

# D. One-hot encode other categorical columns
one_hot_cols = ['Ethnicity', 'Age_Group', 'Gender', 'Consultation_Type', 'Appointment_Type']

# Use get_dummies; drop_first=True to avoid perfect multicollinearity (optional)
df_encoded = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)


In [8]:
# Final feature list

features = [
    'IMD_encoded',
    'Specialty_Freq',
    'Previous_Appointments',
    'Previous_NoShows',
    'NoShowRate'
]
# add all generated one-hot columns
one_hot_generated = [c for c in df_encoded.columns if any(c.startswith(col + '_') for col in one_hot_cols)]
features += one_hot_generated

X = df_encoded[features]
y = df_encoded['NoShow']


print("Encoded feature shape:", X.shape)
print("Sample feature columns:", X.columns[:20].tolist())
print("Target distribution:\n", y.value_counts(normalize=True))

Encoded feature shape: (186524, 47)
Sample feature columns: ['IMD_encoded', 'Specialty_Freq', 'Previous_Appointments', 'Previous_NoShows', 'NoShowRate', 'Ethnicity_Any other Asian background', 'Ethnicity_Any other Black background', 'Ethnicity_Any other Mixed background', 'Ethnicity_Any other White background', 'Ethnicity_Any other ethnic group', 'Ethnicity_Bangladeshi (Asian or Asian British)', 'Ethnicity_British (White)', 'Ethnicity_Caribbean (Black or Black British)', 'Ethnicity_Chinese (other ethnic group)', 'Ethnicity_Indian (Asian or Asian British)', 'Ethnicity_Irish (White)', 'Ethnicity_Not Known', 'Ethnicity_Pakistani (Asian or Asian British)', 'Ethnicity_White and Asian (Mixed)', 'Ethnicity_White and Black African (Mixed)']
Target distribution:
 NoShow
0    0.947438
1    0.052562
Name: proportion, dtype: float64


In [9]:
from sklearn.model_selection import train_test_split

# Using 75% data for training and 25% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Training target distribution:\n", y_train.value_counts(normalize=True))
print("Test target distribution:\n", y_test.value_counts(normalize=True))


Training set shape: (139893, 47)
Test set shape: (46631, 47)
Training target distribution:
 NoShow
0    0.947438
1    0.052562
Name: proportion, dtype: float64
Test target distribution:
 NoShow
0    0.947438
1    0.052562
Name: proportion, dtype: float64


In [10]:
from sklearn.ensemble import RandomForestClassifier

# model
rf_model = RandomForestClassifier(
    n_estimators=200,         # number of trees
    max_depth=None,           # let trees grow fully
    random_state=42,
    class_weight='balanced',  # automatically adjust for class imbalance
    n_jobs=-1                 # use all CPU cores for speed
)

# training model
rf_model.fit(X_train, y_train)

print("Random Forest training completed!")


Random Forest training completed!


In [11]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt

# Make predictions 
y_pred = rf_model.predict(X_test)
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]  # probability for class 1 (NoShow)

# performance
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", round(acc, 4))
print("F1 Score:", round(f1, 4))
print("ROC AUC:", round(roc_auc, 4))
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.932
F1 Score: 0.0019
ROC AUC: 0.5439

Confusion Matrix:
 [[43456   724]
 [ 2448     3]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96     44180
           1       0.00      0.00      0.00      2451

    accuracy                           0.93     46631
   macro avg       0.48      0.49      0.48     46631
weighted avg       0.90      0.93      0.91     46631



In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Model
lr = LogisticRegression(
    class_weight='balanced',  # handle imbalance
    max_iter=1000,            # ensure convergence
    random_state=42,
    n_jobs=-1
)

# Train
lr.fit(X_train, y_train)

# Predict
y_pred_lr = lr.predict(X_test)
y_prob_lr = lr.predict_proba(X_test)[:,1]

# Evaluate
acc = accuracy_score(y_test, y_pred_lr)
f1 = f1_score(y_test, y_pred_lr)
roc_auc = roc_auc_score(y_test, y_prob_lr)
cm = confusion_matrix(y_test, y_pred_lr)

print(" Logistic Regression ")
print("Accuracy:", round(acc, 4))
print("F1 Score:", round(f1, 4))
print("ROC AUC:", round(roc_auc, 4))
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))


 Logistic Regression 
Accuracy: 0.622
F1 Score: 0.1466
ROC AUC: 0.6627

Confusion Matrix:
 [[27492 16688]
 [  937  1514]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.62      0.76     44180
           1       0.08      0.62      0.15      2451

    accuracy                           0.62     46631
   macro avg       0.53      0.62      0.45     46631
weighted avg       0.92      0.62      0.73     46631



In [13]:
import lightgbm as lgb

# Handle imbalance using scale_pos_weight
pos = y_train.sum()
neg = len(y_train) - pos
scale = neg / pos

lgb_model = lgb.LGBMClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='binary',
    class_weight='balanced',  # alternative to scale_pos_weight
    random_state=42,
    n_jobs=-1
)

# Train
lgb_model.fit(X_train, y_train)

# Predict
y_pred_lgb = lgb_model.predict(X_test)
y_prob_lgb = lgb_model.predict_proba(X_test)[:,1]

# Evaluate
acc = accuracy_score(y_test, y_pred_lgb)
f1 = f1_score(y_test, y_pred_lgb)
roc_auc = roc_auc_score(y_test, y_prob_lgb)
cm = confusion_matrix(y_test, y_pred_lgb)

print(" LightGBM ")
print("Accuracy:", round(acc, 4))
print("F1 Score:", round(f1, 4))
print("ROC AUC:", round(roc_auc, 4))
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred_lgb))


[LightGBM] [Info] Number of positive: 7353, number of negative: 132540
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 220
[LightGBM] [Info] Number of data points in the train set: 139893, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
 LightGBM 
Accuracy: 0.6226
F1 Score: 0.1466
ROC AUC: 0.6609

Confusion Matrix:
 [[27521 16659]
 [  939  1512]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.62      0.76     44180
           1       0.08      0.62      0.15      2451

    accuracy                           0.62     46631
   macro avg       0.53      0.62      0.45     46631
weighted avg       0.92      0.62

In [14]:
from xgboost import XGBClassifier

# count ratio for weight
pos = y_train.sum()
neg = len(y_train) - pos
scale = neg / pos
print("scale_pos_weight =", scale)

xgb = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=scale,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss'
)

xgb.fit(X_train, y_train)
print("XGBoost training completed!")


scale_pos_weight = 18.025295797633618
XGBoost training completed!


In [15]:
import numpy as np
from sklearn.metrics import f1_score

best_thr = 0
best_f1 = 0
y_prob = xgb.predict_proba(X_test)[:,1]

for thr in np.arange(0.05, 0.90, 0.01):
    y_pred_thr = (y_prob >= thr).astype(int)
    f1 = f1_score(y_test, y_pred_thr)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = thr

print("Best threshold:", round(best_thr, 3))
print("Best F1 at that threshold:", round(best_f1, 4))


Best threshold: 0.61
Best F1 at that threshold: 0.1585


In [16]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score

thr = 0.60

y_pred_opt = (xgb.predict_proba(X_test)[:,1] >= thr).astype(int)

print("Threshold used:", thr)
print("Accuracy:", round(accuracy_score(y_test, y_pred_opt), 4))
print("F1 Score:", round(f1_score(y_test, y_pred_opt), 4))
print("ROC AUC:", round(roc_auc_score(y_test, xgb.predict_proba(X_test)[:,1]), 4))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_opt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_opt))


Threshold used: 0.6
Accuracy: 0.797
F1 Score: 0.1568
ROC AUC: 0.6601

Confusion Matrix:
 [[36286  7894]
 [ 1571   880]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.82      0.88     44180
           1       0.10      0.36      0.16      2451

    accuracy                           0.80     46631
   macro avg       0.53      0.59      0.52     46631
weighted avg       0.91      0.80      0.85     46631



In [17]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Base model
xgb = XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight = (len(y_train) - y_train.sum()) / y_train.sum(), 
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss'
)


# Hyperparameter grid for random search
param_dist = {
    'n_estimators': [200, 400, 600],
    'max_depth': [3, 5, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2]
}

# RandomizedSearchCV
xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=25,          # number of random combinations to try
    scoring='f1',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)


In [18]:
# Fit on training data
xgb_search.fit(X_train, y_train)

# Best model from search
xgb_best = xgb_search.best_estimator_

print("Best hyperparameters found:", xgb_search.best_params_)


Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best hyperparameters found: {'subsample': 0.9, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.8}


In [19]:
import numpy as np
from sklearn.metrics import f1_score

y_prob = xgb_best.predict_proba(X_test)[:,1]

best_thr = 0
best_f1 = 0

for thr in np.arange(0.01, 0.90, 0.002):
    y_pred_thr = (y_prob >= thr).astype(int)
    f1 = f1_score(y_test, y_pred_thr)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = thr

print("Best threshold:", round(best_thr, 3))
print("Best F1 at that threshold:", round(best_f1, 4))


Best threshold: 0.596
Best F1 at that threshold: 0.1592


In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt

# Predictions using optimal threshold
y_pred_opt = (y_prob >= best_thr).astype(int)

print("Threshold used:", best_thr)
print("Accuracy:", round(accuracy_score(y_test, y_pred_opt), 4))
print("F1 Score:", round(f1_score(y_test, y_pred_opt), 4))
print("ROC AUC:", round(roc_auc_score(y_test, y_prob), 4))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_opt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_opt))


Threshold used: 0.596
Accuracy: 0.8036
F1 Score: 0.1592
ROC AUC: 0.6571

Confusion Matrix:
 [[36606  7574]
 [ 1584   867]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.83      0.89     44180
           1       0.10      0.35      0.16      2451

    accuracy                           0.80     46631
   macro avg       0.53      0.59      0.52     46631
weighted avg       0.91      0.80      0.85     46631



In [25]:
import joblib
joblib.dump(xgb_best, '../xgb_model.joblib')  # Better filename; saves in repo root

NameError: name 'preprocessor' is not defined

In [24]:
import xgboost
print('Notebook XGBoost version:', xgboost.__version__)

Notebook XGBoost version: 3.1.1
