Importing libraries

In [2]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the processed dataset; adjust this relative path if the notebook is run from another directory.
data_df = pd.read_csv("../../data/processed/TCGA_GBM_LGG_Mutations_clean_v2.csv")

# "Grade" is the prediction target; every other column is kept as a feature.
y = data_df["Grade"]
X = data_df.drop(columns=["Grade"])


In [5]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so the test data never influences the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Select the MLflow experiment that the runs started below are recorded under.
mlflow.set_experiment(experiment_name="Tumor_Classification")

<Experiment: artifact_location='file:///c:/Users/Usuario/Desktop/Maestria/4.-Trimestre/MNA/MLops-Equipo/notebooks/4.-MLFlow/mlruns/236535041957126280', creation_time=1729450834963, experiment_id='236535041957126280', last_update_time=1729450834963, lifecycle_stage='active', name='Tumor_Classification', tags={}>

In [8]:
def train_and_log_model(model, model_name, X_train, X_test, y_train, y_test, params):
    """Fit ``model``, score it on the test split, and record everything in MLflow.

    All work happens inside a single MLflow run named ``model_name``.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    model_name
        Name given to the MLflow run.
    X_train, y_train
        Features and labels used to fit the model.
    X_test, y_test
        Features and labels used to compute the logged metrics.
    params
        Dictionary of hyperparameters logged to the run as-is.

    Returns
    -------
    None
        Results are only written to MLflow: the parameters, the test-set
        accuracy plus weighted precision and recall, and the fitted model
        under the artifact path ``"models"``.
    """
    with mlflow.start_run(run_name=model_name):
        # Fit on the training split and predict the held-out split.
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Precision and recall are averaged across classes weighted by support.
        test_metrics = {
            "accuracy": accuracy_score(y_test, predictions),
            "precision": precision_score(y_test, predictions, average="weighted"),
            "recall": recall_score(y_test, predictions, average="weighted"),
        }

        # Record hyperparameters, metrics, and finally the fitted estimator.
        mlflow.log_params(params)
        mlflow.log_metrics(test_metrics)
        mlflow.sklearn.log_model(model, artifact_path="models")

Logistic Regression

In [9]:
# Keep the hyperparameters in one dict so the exact values used to build the
# model are the ones passed on for logging.
params_lr = dict(C=1.0, solver="liblinear", random_state=42)
model_lr = LogisticRegression(**params_lr)

In [10]:
# Logistic regression is trained and evaluated on the standardized features.
train_and_log_model(
    model_lr,
    "Logistic_Regression",
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    params=params_lr,
)



Decision Tree Classifier

In [11]:
# Keep the hyperparameters in one dict so the exact values used to build the
# model are the ones passed on for logging.
params_dt = dict(max_depth=5, criterion="entropy", random_state=42)
model_dt = DecisionTreeClassifier(**params_dt)

In [12]:
# The decision tree is trained and evaluated on the original, unscaled features.
train_and_log_model(
    model_dt,
    "Decision_Tree",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    params=params_dt,
)



Random Forest Classifier

In [13]:
# Keep the hyperparameters in one dict so the exact values used to build the
# model are the ones passed on for logging.
params_rf = dict(n_estimators=100, max_depth=5, random_state=42)
model_rf = RandomForestClassifier(**params_rf)

In [14]:
# The random forest is trained and evaluated on the original, unscaled features.
train_and_log_model(
    model_rf,
    "Random_Forest",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    params=params_rf,
)

