#### Imports 

In [3]:
import joblib
from pathlib import Path
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    roc_auc_score, 
    average_precision_score, 
    f1_score, 
    precision_score, 
    recall_score
)

#### Paths & Folders

In [4]:
# Input artifacts (read-only here) and output location for fitted models/metrics.
# Both paths are relative to the notebook's working directory.
ARTIFACTS_DIR = Path("artifacts")
MODELS_DIR = Path("models")

# parents=True keeps this cell working if MODELS_DIR is later changed to a
# nested path (e.g. "outputs/models"); exist_ok=True makes re-runs idempotent.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

#### Load Preprocessed Data

In [5]:
# Restore the train/validation/test splits from disk (presumably written by an
# earlier preprocessing notebook — confirm the producer).
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts
# that were produced by this project.
data_splits = joblib.load(ARTIFACTS_DIR.joinpath("data_splits.pkl"))

# Unpack the six arrays in one statement; the key order matches the target
# names on the left-hand side.
X_train, y_train, X_val, y_val, X_test, y_test = (
    data_splits[split_key]
    for split_key in ("X_train", "y_train", "X_val", "y_val", "X_test", "y_test")
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

Train: (31919, 57), Val: (6096, 57), Test: (6096, 57)


#### Data Integrity Checks

In [7]:
# Per-split alignment check: each feature matrix must have exactly one label
# per row. Comparing only the grand totals would let offsetting errors
# (e.g. X_train ten rows too long and X_val ten rows too short) pass silently.
for split_name, X_split, y_split in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    assert X_split.shape[0] == len(y_split), (
        f"Sample count mismatch in {split_name}: "
        f"{X_split.shape[0]} rows vs {len(y_split)} labels"
    )

# Total samples check (kept so `total_samples` remains defined for later cells)
total_samples = X_train.shape[0] + X_val.shape[0] + X_test.shape[0]
assert total_samples == len(y_train) + len(y_val) + len(y_test), "Sample count mismatch"

# Feature-width check: all splits must expose the same number of columns,
# otherwise a model fitted on X_train cannot score X_val / X_test.
assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], "Feature count mismatch"

# NaN check
assert not np.isnan(X_train).any(), "NaNs in X_train"
assert not np.isnan(X_val).any(), "NaNs in X_val"
assert not np.isnan(X_test).any(), "NaNs in X_test"

print("✅ Data integrity checks passed.")

✅ Data integrity checks passed.


## Logistic Regression

In [9]:
# Baseline 1: logistic regression. class_weight="balanced" asks scikit-learn
# to reweight classes inversely to their frequency in y_train.
logreg = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Hard labels feed the threshold-based metrics; column 1 of predict_proba
# (the second entry of logreg.classes_) feeds the ranking metrics.
y_val_pred = logreg.predict(X_val)
y_val_proba = logreg.predict_proba(X_val)[:, 1]

# All metrics are computed on the validation split only.
metrics_logreg = {
    metric_name: scorer(y_val, estimate)
    for metric_name, scorer, estimate in (
        ("roc_auc", roc_auc_score, y_val_proba),
        ("pr_auc", average_precision_score, y_val_proba),
        ("f1", f1_score, y_val_pred),
        ("precision", precision_score, y_val_pred),
        ("recall", recall_score, y_val_pred),
    )
}

# Persist the fitted model and its validation metrics side by side.
joblib.dump(logreg, MODELS_DIR.joinpath("logreg.pkl"))
joblib.dump(metrics_logreg, MODELS_DIR.joinpath("metrics_logreg.pkl"))
print("Logistic Regression metrics:", metrics_logreg)


Logistic Regression metrics: {'roc_auc': 0.9362623256501986, 'pr_auc': 0.6209451971604625, 'f1': 0.6080781180648025, 'precision': 0.462525320729237, 'recall': 0.8873056994818653}


## Support Vector Machine

In [10]:
# Baseline 2: support vector classifier with default kernel settings.
# probability=True is required for predict_proba; per the scikit-learn docs it
# adds an internal cross-validated calibration step, so fitting is slower and
# predict_proba may occasionally disagree with predict.
svc = SVC(
    class_weight="balanced",
    probability=True,
    random_state=42,
).fit(X_train, y_train)

# Validation-set outputs: hard labels and the probability of the second class
# in svc.classes_.
y_val_pred_svc = svc.predict(X_val)
y_val_proba_svc = svc.predict_proba(X_val)[:, 1]

# Ranking metrics use the probabilities; threshold metrics use the hard labels.
metrics_svc = dict(
    roc_auc=roc_auc_score(y_val, y_val_proba_svc),
    pr_auc=average_precision_score(y_val, y_val_proba_svc),
    f1=f1_score(y_val, y_val_pred_svc),
    precision=precision_score(y_val, y_val_pred_svc),
    recall=recall_score(y_val, y_val_pred_svc),
)

# Persist the fitted model and its validation metrics side by side.
joblib.dump(svc, MODELS_DIR.joinpath("svc.pkl"))
joblib.dump(metrics_svc, MODELS_DIR.joinpath("metrics_svc.pkl"))
print("SVM metrics:", metrics_svc)

SVM metrics: {'roc_auc': 0.9327664491227523, 'pr_auc': 0.6050893243103598, 'f1': 0.6082191780821918, 'precision': 0.4696755994358251, 'recall': 0.8626943005181347}


In [11]:
# Report the actual output location from the config cell rather than a
# hard-coded folder name, so the message stays true if MODELS_DIR changes.
print(f"✅ Baseline models trained and metrics saved to '{MODELS_DIR}/' folder.")

✅ Baseline models trained and metrics saved to 'models/' folder.


## ✅ Features of this Notebook

1. Uses only Logistic Regression and SVM — no CatBoost dependency.

2. Calculates all key metrics:
   - ROC-AUC (roc_auc)
   - PR-AUC (pr_auc)
   - F1-score (f1)
   - Precision (precision)
   - Recall (recall)

3. Performs unit-test style checks:
    - NaNs
    - Sample count consistency

4. Saves models and metrics for downstream evaluation.

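5. Reproducible baselines: both models use a fixed `random_state=42`, and all metrics are computed on the validation split, leaving the test split untouched for final evaluation. The saved `.pkl` files can be reloaded with `joblib.load` for downstream use.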