# Supervised Learning

# 05_supervised_learning_model

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 30/10/2025   | Martin | Created   | Notebook created for supervised learning model | 

# Content

* [Introduction](#introduction)

# Introduction

In [None]:
%load_ext watermark

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [27]:
path = "../data/clean"
df = pd.read_pickle(f"{path}/patient_level.pkl")

In [28]:
df.head()

Unnamed: 0,patient_medicare_number,patient_first_name,patient_last_name,gender,birthdate,number_of_claims,drg_ls,combined_diagnosis_ls,combined_hcpcs_ls,billablePeriod_start_ls,billablePeriod_end_ls,location_of_bill_ls,total_value
1,1S00E00AA10,Brandon214,Roob72,female,1946-01-15,3,[],"[O039, O039, B085, B002, O039, J029]","[G0444, 99241, G0444, G9572]","[2013-04-23, 2016-01-15, 2020-06-02]","[2013-04-23, 2016-01-15, 2020-06-02]",[002],15458.12
3,1S00E00AA23,B.,Hagene,female,,1,[],"[J329, E785, P292]","[G0444, G9572]",[2014-04-13],[2014-04-13],[],840.21
5,1S00E00AA25,Carlota980,Gamez720,female,1947-04-15,2,[],"[E669, D649, K635, O039, M810, J329, E669, D64...","[G0444, 99241]","[2012-07-18, 2021-11-23]","[2012-07-18, 2021-11-23]",[002],85.55
6,1S00E00AA32,Denny560,Watsica258,male,1945-06-09,3,[],"[P292, E669, I2510, B349, J329, I10, E669, I25...","[99241, 99241, 99241]","[2015-05-12, 2021-02-20, 2021-03-20]","[2015-05-12, 2021-02-20, 2021-03-20]","[002, 002, 002]",85.55
10,1S00E00AA54,Lashawnda5,Greenfelder433,female,1950-12-23,11,[],"[E119, R739, E781, E8881, D649, E11319, P292, ...","[G0444, 99241, 99241, 99241, 99241, G0444, 992...","[2012-10-27, 2013-01-26, 2014-06-21, 2014-07-2...","[2012-10-27, 2013-01-26, 2014-06-21, 2014-07-2...","[002, 002, 002, 002, 002, 002, 002, 002, 002, ...",142.58


# Cost List

Get the cost of procedures from claims with single HCPCS based on the existing claims list

In [None]:
def get_price_list_from_hcpcs(df: pd.DataFrame, price_selection: str):
  """Retrieve the price of each HCPCS code from those claims that only have 1 code

  Args:
      df (pd.DataFrame): Patient Level dataset
      price_selection (str): Any aggregation function for group by ("max", "min", "mean"). 
          How to handle when multiple claims have the same singular code

  Returns:
      pd.DataFrame: Cost of each HCPCS
  """
  temp = df.copy()
  temp['hcpcs_len'] = temp['combined_hcpcs_ls'].str.len()

  # Get only those with single HCPC values
  cost = temp[temp['hcpcs_len'] == 1]

  # Only the select only the hcpcs codes and total bill
  cost['hcpcs_code'] = cost['combined_hcpcs_ls'].str[0]
  cost = cost[['hcpcs_code', 'total_value']]

  # Based on cost selection
  cost = cost.groupby('hcpcs_code').agg(price_selection)
  cost['total_value'] = np.round(cost['total_value'], 2)
  cost = cost.reset_index()
  cost = cost.rename({
    'total_value': 'cost'
  }, axis=1)

  return cost

In [46]:
cost = get_price_list_from_hcpcs(df, price_selection='mean')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price['hcpcs_code'] = price['combined_hcpcs_ls'].str[0]


In [47]:
cost.head()

Unnamed: 0,hcpcs_code,cost
0,99221,5666.37
1,99241,2370.06
2,G0107,36515.28
3,G0151,526.33
4,G0152,1041.51


# Supervised Learning

Assumption: All the data is already numerically encoded

## Additional data processing

In [None]:
SEED = 43

In [None]:
# Perform additional steps here
# Might consider feature cross for sequence representation

## Split Data

Create train, validation and test splits in __stratified__ manner

- Train: 70%
- Validation: 15%
- Test: 15%

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Change values here
train_size = 0.7
test_size = 0.5

In [None]:
y = df['target']
train, temp = train_test_split(df, train_size=train_size, stratify=y, random_state=SEED)
val, test = train_test_split(temp, train_size=val_size, stratify=y, random_state=SEED)

In [None]:
strat = pd.concat([train, val], axis=0)
y_strat = strat['target']
X_strat = strat.drop('target', axis=1)

In [None]:
X_train = train['target']
y_train = train.drop('target', axis=1)

X_val = val['target']
y_val = val.drop('target', axis=1)

X_test = test['target']
y_test = test.drop('target', axis=1)

## Metrics

Common classification metrics:

- Accuracy
- Precision
- Recall
- F1 Score (weighted or micro)
- ROC AUC Score (OVR + weighted)

Plots:

- Confusion Matrix
- ROC AUC

In [None]:
def metrics_score(y_true, y_pred):
  acc = round(accuracy_score(y_true, y_pred), 4)
  prec = round(precision_score(y_true, y_pred), 4)
  recall = round(recall_score(y_true, y_pred), 4)
  f1 = round(f1_score(y_true, y_pred, average="weighted"), 4)
  roc_auc = round(roc_auc_score(y_true, y_pred, average="weighted", multi_class="ovr"), 4)

  return acc, prec, recall, f1, roc_auc

In [None]:
def plot_confusion_matrix(y_true, y_pred, clf):
  cm = confusion_matrix(y_true, y_pred)
  disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=clf.classes_
  )

  disp.plot(cmap=plt.cm.Blues)
  plt.title("Confusion Matrix")
  plt.show()

In [None]:
def plot_ROC_AUC():
  # TODO: Determine which method of comparison should be used
  return

## Baseline: Dummy Classifier

Uses sklearns dummy classifier to test various baseline strategies

In [None]:
# Change here
strategy = "most_frequent"

In [None]:
dummy_clf = DummyClassifier(strategy=strategy)
dummy_clf.fit(X_train, y_train)

preds = dummy_clf.predict(X_val)
metrics_score(y_val, preds)

In [None]:
plot_confusion_matrix(y_val, preds, dummy_clf)

## Model 1: Logistic Regression

Simple logistic regression model for an improved baseline

In [None]:
# For model tuning

In [None]:
lr_model = OneVsRestClassifier(LogisticRegression(random_state=SEED))
# lr_model = OneVsOneClassifier(LogisticRegression(random_state=SEED))

lr_model.fit(X_train, y_train)
preds = lr_model.predict(X_val)
metrics_score(y_val, preds)

In [None]:
plot_confusion_matrix(y_val, preds, lr_model)

In [None]:
# For K-Fold shuffled stratified cross validation
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=SEED)
lr_model = OneVsRestClassifier(LogisticRegression(random_state=SEED))

for split_idx, (train_idx, test_idx) in enumerate(sss.split(X, y)):
  print(f"\n========== Split {split_idx} ==========")

  X_strat_train, X_strat_val = X_strat[train_idx], X_strat[train_idx]
  y_strat_train, y_strat_val = y_strat[train_idx], y_strat[train_idx]

  lr_model.fit(X_strat_train, y_strat_train)

  preds = lr_model.predict(X_strat_val)
  metrics_score(pred, y_strat_val)

## Model 2: Decision Tree / Random Forest

Similar reason for decision tree except more interpretable. Random forest for boosting improvements

In [None]:
# For model tuning

## Model 3: XGBoost

Larger improvement over standard decision tree and random forest

## Model 4: FF Neural Network

Trying this out if we have time

# Feature & Model Evaluation

Have yet to decision visualisations and feature evaluation methods

In [None]:
%watermark