In [None]:
import mlflow
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score, confusion_matrix

from mlflow.models import infer_signature # To normalize input and output schema

### 1. Loading dataset.

In [2]:
df = pd.read_parquet("./cleaned_data/grade_summary.parquet")

In [3]:
df.isna().sum().sum()

np.int64(0)

In [4]:
df.band.value_counts()

band
LOW          265
GOOD         140
EXCELLENT     91
MEDIUM        70
Name: count, dtype: int64

The classes in this dataset are imbalanced, being largest in `LOW`.

## 2. Baseline models.

In [7]:
# Splitting data
X = df.drop(columns=['band'])
y = df['band'].to_numpy().ravel()

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [90]:
params_lr = [
    # For `liblinear` solver, which handles both L1 and L2 penalties.
    {
        'penalty' : ['l1', 'l2'],
        'C' : [0.01, 0.1, 1, 10, 100],
        'solver' : ['liblinear']
    },
    # Solver that support Elastic-Net (saga).
    {
        'penalty': ['elasticnet'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75]       
    },
    # Solver that only support L2 penalty.
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'newton-cg', 'sag']        
    }
]

params_rf = {
        'n_estimators' : [50, 100, 200, 300, 500],
        'criterion' : ['gini', 'log_loss', 'entropy'],
        'class_weight' : ['balanced', 'balanced_subsample']
    }

In [74]:
# Fundamental Logistic Regression.
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [75]:
print(classification_report(y_test, y_pred_lr, labels=['LOW', 'MEDIUM', 'GOOD', 'EXCELLENT']))

              precision    recall  f1-score   support

         LOW       0.91      0.95      0.93        87
      MEDIUM       0.00      0.00      0.00        16
        GOOD       0.48      0.64      0.55        33
   EXCELLENT       0.82      0.79      0.81        34

    accuracy                           0.77       170
   macro avg       0.55      0.60      0.57       170
weighted avg       0.72      0.77      0.74       170



In [76]:
cm_lr = confusion_matrix(y_test, y_pred_lr)
print(cm_lr)

[[27  7  0  0]
 [ 6 21  4  2]
 [ 0  4 83  0]
 [ 0 12  4  0]]


In [77]:
# Fundamental Random Forest.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [78]:
print(f"Accuracy score: {accuracy_score(y_test, y_pred_rf):.2f}")
cm_rf = confusion_matrix(y_test, y_pred_rf)
print(cm_rf)

Accuracy score: 0.74
[[25  8  0  1]
 [ 7 17  7  2]
 [ 0  4 80  3]
 [ 0  9  4  3]]


In [79]:
print(classification_report(y_test, y_pred_rf, labels=['LOW', 'MEDIUM', 'GOOD', 'EXCELLENT']))

              precision    recall  f1-score   support

         LOW       0.88      0.92      0.90        87
      MEDIUM       0.33      0.19      0.24        16
        GOOD       0.45      0.52      0.48        33
   EXCELLENT       0.78      0.74      0.76        34

    accuracy                           0.74       170
   macro avg       0.61      0.59      0.59       170
weighted avg       0.72      0.74      0.73       170



### 2.1 GridSearchCV

In [80]:
lr_gcv = GridSearchCV(
    estimator=lr,
    param_grid=params_lr
)

In [91]:
rf_gcv = GridSearchCV(
    estimator=rf,
    param_grid=params_rf
)

In [85]:
lr_gcv.fit(X_train, y_train)



In [86]:
lr_gcv.best_estimator_

In [87]:
lr_gcv.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}

In [88]:
lr_gcv.best_score_

np.float64(0.7271835443037975)

In [92]:
rf_gcv.fit(X_train, y_train)

In [93]:
rf_gcv.best_estimator_

In [94]:
rf_gcv.best_params_

{'class_weight': 'balanced_subsample', 'criterion': 'gini', 'n_estimators': 50}

In [95]:
rf_gcv.best_score_

np.float64(0.7171202531645571)