Student 1: name:Dmitri_Antonov , i.d.:317270510 , github:https://github.com/dimantonov
Student 2: name:Nisim_Noam_Sharabi , i.d.:201606514 , github:https://github.com/nisimsh44-cloud

1. Load breast cancer dataset (**structured data**)

For more details about the data: https://scikit-learn.org/1.5/modules/generated/sklearn.datasets.load_breast_cancer.html

In [None]:
from sklearn.datasets import load_breast_cancer
# Load the dataset once; the split step below reads its .data and .target attributes.
my_data = load_breast_cancer()

2. Split **my_data** into train and test sets:

- Define X_train, X_test, Y_train, Y_test
- Choose **test_size** for splitting **my_data**
- Use **train_test_split** (for details: https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.train_test_split.html)

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical every time the cell is executed.
X_train, X_test, Y_train, Y_test = train_test_split(
    my_data.data,
    my_data.target,
    test_size=0.2,
    random_state=42,
)

3. Libraries

In [None]:
!pip install mlflow
!pip install mlflow scikit-learn

import mlflow
import mlflow.sklearn
from mlflow import log_param, log_metric

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import itertools
import pandas as pd




4. Define MLFlow experiment

In [None]:
# Name of the MLflow experiment used for the runs logged in this notebook.
EXPERIMENT_NAME = "trees_hyperparam"
# Select that experiment for tracking; the call returns the Experiment object
# that is echoed as the cell output.
mlflow.set_experiment(EXPERIMENT_NAME)
# MLFlow details: https://mlflow.org/docs/latest/ml/tracking/

<Experiment: artifact_location='/content/mlruns/1', creation_time=1765698759922, experiment_id='1', last_update_time=1765698759922, lifecycle_stage='active', name='trees_hyperparam', tags={}>

5. Train **model_decision_tree**

- Library: sklearn.tree.DecisionTreeClassifier
- Data: X_train, Y_train
- **Essential**: explore and optimize DecisionTreeClassifier options   

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter grid: every combination of the values below is trained and
# logged as its own MLflow run (2 * 4 * 4 = 32 runs).
criterion_list = ["gini", "entropy"]
max_depth_list = [None, 3, 5, 10]
min_samples_split_list = [2, 4, 6, 8]

param_grid = list(itertools.product(criterion_list, max_depth_list, min_samples_split_list))

# NOTE(review): the metrics below are computed on X_test, so picking the
# "best" configuration from them uses the test split for model selection.
# Consider cross-validation on X_train if an unbiased final score is needed.
for criterion, max_depth, min_samples_split in param_grid:
    with mlflow.start_run():

        # Log parameters (one call instead of one call per parameter)
        mlflow.log_params(
            {
                "model_type": "DecisionTree",
                "criterion": criterion,
                "max_depth": max_depth,
                "min_samples_split": min_samples_split,
            }
        )

        # Train the model.
        # A fixed random_state (same seed as the train/test split) makes each
        # configuration reproducible; without it, repeated fits of the same
        # configuration may yield different trees and therefore different
        # logged metrics.
        d_tree = DecisionTreeClassifier(
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42,
        )
        d_tree.fit(X_train, Y_train)

        y_pred = d_tree.predict(X_test)

        # run test prediction and calculate metrics:
        acc = accuracy_score(Y_test, y_pred)
        pre = precision_score(Y_test, y_pred)
        rec = recall_score(Y_test, y_pred)
        f1 = f1_score(Y_test, y_pred)

        # Log metrics (same metric names as before, logged in one call)
        mlflow.log_metrics(
            {
                "accuracy": acc,
                "precision_score": pre,
                "recall_score": rec,
                "f1_score": f1,
            }
        )



6. Train **model_random_forest**

- Library: sklearn.ensemble.RandomForestClassifier
- Data: X_train, Y_train
- **Essential**: explore and optimize RandomForestClassifier options

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter grid: every combination of the values below is trained and
# logged as its own MLflow run (4 * 4 * 2 = 32 runs).
n_estimators_list = [50, 100, 200, 300]
max_depth_list = [None, 5, 10, 15]
max_features_list = ["sqrt", "log2"]

param_grid = list(itertools.product(n_estimators_list, max_depth_list, max_features_list))

# NOTE(review): the metrics below are computed on X_test, so picking the
# "best" configuration from them uses the test split for model selection.
# Consider cross-validation on X_train if an unbiased final score is needed.
for n_estimators, max_depth, max_features in param_grid:
    with mlflow.start_run():

        # Log parameters (one call instead of one call per parameter)
        mlflow.log_params(
            {
                "model_type": "RandomForest",
                "n_estimators": n_estimators,
                "max_depth": max_depth,
                "max_features": max_features,
            }
        )

        # Train the model.
        # A random forest is stochastic (bootstrap samples and per-split
        # feature subsets), so a fixed random_state is required for the logged
        # metrics to be reproducible and for differences between
        # configurations not to be confounded with seed noise.
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
        )
        rf.fit(X_train, Y_train)

        # run test prediction and calculate metrics:
        y_pred = rf.predict(X_test)
        acc = accuracy_score(Y_test, y_pred)
        pre = precision_score(Y_test, y_pred)
        rec = recall_score(Y_test, y_pred)
        f1 = f1_score(Y_test, y_pred)

        # Log metrics (same metric names as before, logged in one call)
        mlflow.log_metrics(
            {
                "accuracy": acc,
                "precision_score": pre,
                "recall_score": rec,
                "f1_score": f1,
            }
        )




7. Train **model_adaboost**

- Library: sklearn.ensemble.AdaBoostClassifier
- Data: X_train, Y_train
- **Essential**: explore and optimize AdaBoostClassifier options

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Hyperparameter grid: every combination of the values below is trained and
# logged as its own MLflow run (4 * 3 * 1 = 12 runs).
n_estimators_list = [50, 100, 200, 300]
learning_rate_list = [0.01, 0.1, 1]
# NOTE(review): recent scikit-learn releases appear to deprecate the
# `algorithm` argument -- confirm against the installed version before
# upgrading.
algorithm_list = ["SAMME"]

param_grid = list(itertools.product(n_estimators_list, learning_rate_list, algorithm_list))

# NOTE(review): the metrics below are computed on X_test, so picking the
# "best" configuration from them uses the test split for model selection.
# Consider cross-validation on X_train if an unbiased final score is needed.
for n_estimators, learning_rate, algorithm in param_grid:
    with mlflow.start_run():

        # Log parameters (one call instead of one call per parameter)
        mlflow.log_params(
            {
                "model_type": "AdaBoost",
                "n_estimators": n_estimators,
                "learning_rate": learning_rate,
                "algorithm": algorithm,
            }
        )

        # Train the model.
        # A fixed random_state (same seed as the train/test split) keeps the
        # boosted ensemble reproducible across executions of this cell.
        ada = AdaBoostClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            algorithm=algorithm,
            random_state=42,
        )
        ada.fit(X_train, Y_train)

        # run test prediction and calculate metrics:
        y_pred = ada.predict(X_test)
        acc = accuracy_score(Y_test, y_pred)
        pre = precision_score(Y_test, y_pred)
        rec = recall_score(Y_test, y_pred)
        f1 = f1_score(Y_test, y_pred)

        # Log metrics (same metric names as before, logged in one call)
        mlflow.log_metrics(
            {
                "accuracy": acc,
                "precision_score": pre,
                "recall_score": rec,
                "f1_score": f1,
            }
        )







8. Store the result

In [None]:
from google.colab import files

student_name = "Dmitri_Antonov_&_Nisim_Noam_Sharabi"
# Build the output filename once so the export and the download use the same path.
results_path = f"{student_name}_results.xlsx"

# Fetch the logged runs as a table.
df = mlflow.search_runs()

# Remove every column whose name contains "time" (case-insensitive) before exporting.
time_columns = [name for name in df.columns if "time" in name.lower()]
df = df.drop(columns=time_columns, errors="ignore")

# Write the spreadsheet without the index column, then hand it to the Colab
# download helper.
df.to_excel(results_path, index=False)
files.download(results_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>