In [None]:
# Oncology Risk Prediction — Interactive Notebook
This notebook walks through EDA, training Logistic Regression and Random Forest, evaluating, and logging to MLflow.

In [14]:
# Add project root to path so `utils` can be imported from notebooks/
import sys, os
project_root = os.path.abspath("..")   # notebook lives in notebooks/
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# utils (must exist under ml_training_class/utils/)
from utils.preprocess import scale_features
from utils.evaluator import classification_metrics
from utils.logger_mlflow import log_classification_run

import matplotlib.pyplot as plt
plt.style.use("seaborn-v0_8")

In [19]:
DATA_PATH = os.path.abspath(os.path.join("..", "data", "oncology_data.csv"))
os.makedirs(os.path.dirname(DATA_PATH), exist_ok=True)

if not os.path.exists(DATA_PATH):
    # generate simple sample CSV (20 rows)
    df_sample = pd.DataFrame({
        "age":[45,61,52,37,50,68,55,70,42,59,65,72,58,49,62,80,74,47,53,66],
        "tumor_size_mm":[22,35,48,18,40,50,25,60,18,33,28,55,30,21,45,70,38,19,27,31],
        "lymph_nodes":[0,1,3,0,5,7,2,6,1,4,2,5,1,0,3,8,2,1,2,3],
        "stage":[1,2,3,1,2,3,2,4,1,3,2,4,2,1,3,4,2,1,2,3],
        "chemo_given":[0,1,1,0,0,1,0,1,1,0,1,1,0,0,1,1,1,0,0,1],
        "high_risk":[0,1,1,0,1,1,0,1,0,1,1,1,0,0,1,1,1,0,0,1]
    })
    df_sample.to_csv(DATA_PATH, index=False)
    print("Sample oncology_data.csv created at:", DATA_PATH)

df = pd.read_csv(DATA_PATH)
print("Loaded data:", DATA_PATH)
df.head()

Loaded data: D:\ml-lab\lab-session-4\data\oncology_data.csv


Unnamed: 0,age\ttumor_size_mm\tlymph_nodes\tstage\tchemo_given\thigh_risk (label)
0,45\t22\t0\t1\t0\t0
1,61\t35\t1\t2\t1\t1
2,52\t48\t3\t3\t1\t1
3,37\t18\t0\t1\t0\t0


In [22]:
import pandas as pd
import os

DATA_PATH = os.path.abspath(os.path.join("..", "data", "oncology_data.csv"))

df_sample = pd.DataFrame({
    "age":[45,61,52,37,50,68,55,70,42,59,65,72,58,49,62,80,74,47,53,66],
    "tumor_size_mm":[22,35,48,18,40,50,25,60,18,33,28,55,30,21,45,70,38,19,27,31],
    "lymph_nodes":[0,1,3,0,5,7,2,6,1,4,2,5,1,0,3,8,2,1,2,3],
    "stage":[1,2,3,1,2,3,2,4,1,3,2,4,2,1,3,4,2,1,2,3],
    "chemo_given":[0,1,1,0,0,1,0,1,1,0,1,1,0,0,1,1,1,0,0,1],
    "high_risk":[0,1,1,0,1,1,0,1,0,1,1,1,0,0,1,1,1,0,0,1]
})

df_sample.to_csv(DATA_PATH, index=False)
print("✔️ File recreated:", DATA_PATH)

✔️ File recreated: D:\ml-lab\lab-session-4\data\oncology_data.csv


In [23]:
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,tumor_size_mm,lymph_nodes,stage,chemo_given,high_risk
0,45,22,0,1,0,0
1,61,35,1,2,1,1
2,52,48,3,3,1,1
3,37,18,0,1,0,0
4,50,40,5,2,0,1
