In [None]:
# !pip install tpot pandas -q

### PROMPT:

**Context**: I want you to act as an automatic machine learning (AutoML) bot using TPOT for me. I am working on a model that predicts sales. The code will be executed in Colab, so we must take care of the compatible versions between TPOT and scikit-learn.

**Input**: Use the Iris dataset available in sklearn

**Output**: Please write python code to find the best classification  model with the highest AUC score on the test set.


### **ChatGPT solution, fixed by Gemini**

In [1]:
import sklearn
import tpot

In [3]:
# chatGPT Answer
#
# NOTE: this cell is kept as the original, failing answer. Its recorded run stops
# with `ValueError: multi_class must be in ('ovo', 'ovr')` before any AUC score
# is printed. The notebook attributes the failure to the `scoring='roc_auc'`
# setting below, which is replaced by a multiclass scorer in the repaired cell.

# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from tpot import TPOTClassifier
import numpy as np
#from sklearn.preprocessing import LabelBinarizer # This is not needed

# Load the Iris dataset: feature matrix X and class labels y
iris = load_iris()
X = iris.data
y = iris.target

# No need to binarize the output, TPOTClassifier can handle multiclass classification
#lb = LabelBinarizer()
#y_binarized = lb.fit_transform(y)

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize TPOTClassifier: 5 generations of 20 pipelines each, scored by 'roc_auc'.
# NOTE(review): this rebinds the name `tpot`, which an earlier cell imported as a module.
tpot = TPOTClassifier(verbosity=2, scoring='roc_auc', generations=5, population_size=20, random_state=42)

# Fit the TPOTClassifier (the recorded ValueError appears after "Generation 1")
tpot.fit(X_train, y_train)

# Class-membership probabilities for the test rows, one column per class
y_test_pred = tpot.predict_proba(X_test)

# One AUC per class: `y_test == i` is the binary indicator for class i and is
# scored against column i of the predicted probabilities
auc_scores = [roc_auc_score(y_test == i, y_test_pred[:, i]) for i in range(y_test_pred.shape[1])]
print(f"AUC Scores for each class: {auc_scores}")

# Export the best pipeline found by TPOT as a Python script
tpot.export('best_pipeline.py')

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -inf


ValueError: multi_class must be in ('ovo', 'ovr')

The previous code fails with `ValueError: multi_class must be in ('ovo', 'ovr')`. The Gemini assistant fixes this by changing the scoring metric to `roc_auc_ovo` and adding the parameter `multi_class='ovo'` when calculating the AUC scores.

In [None]:
# chatGPT Answer - Repaired by Gemini
#
# Searches for a classification pipeline on the Iris data with TPOT, optimising
# the multiclass one-vs-one ROC AUC, then reports AUC on the held-out test split
# and exports the winning pipeline as a Python script.

# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from tpot import TPOTClassifier
import numpy as np

# Load the Iris dataset (the labels are used as-is; no binarization is needed)
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize TPOTClassifier
# The plain 'roc_auc' scorer fails on a multiclass target with
# "multi_class must be in ('ovo', 'ovr')", so the one-vs-one multiclass
# scorer 'roc_auc_ovo' is used instead.
tpot = TPOTClassifier(verbosity=2, scoring='roc_auc_ovo', generations=5, population_size=20, random_state=42)

# Fit the TPOTClassifier
tpot.fit(X_train, y_train)

# Class-membership probabilities for the test rows, one column per class
y_test_pred = tpot.predict_proba(X_test)

# Per-class AUC: `y_test == i` is a binary indicator for class i, scored against
# column i of the probabilities. Each call is a binary problem, so `multi_class`
# is not passed (it only applies to multiclass targets). This relies on the
# labels being 0..n_classes-1 so that column i corresponds to class i.
auc_scores = [roc_auc_score(y_test == i, y_test_pred[:, i]) for i in range(y_test_pred.shape[1])]
print(f"AUC Scores for each class: {auc_scores}")

# Overall test-set AUC with the same one-vs-one metric that TPOT optimised;
# here the full label vector and probability matrix are passed, so
# `multi_class='ovo'` is required and actually takes effect.
test_auc_ovo = roc_auc_score(y_test, y_test_pred, multi_class='ovo')
print(f"Test AUC (one-vs-one): {test_auc_ovo:.4f}")

# Export the best pipeline found by TPOT as a Python script
tpot.export('best_pipeline.py')

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 1.0

Generation 2 - Current best internal CV score: 1.0

Generation 3 - Current best internal CV score: 1.0

Generation 4 - Current best internal CV score: 1.0

Generation 5 - Current best internal CV score: 1.0

Best pipeline: MLPClassifier(input_matrix, alpha=0.0001, learning_rate_init=0.001)
AUC Scores for each class: [1.0, 1.0, 1.0]


We can now inspect the exported file with the best model (`best_pipeline.py`) and use it to predict values for new data.

In [None]:
# best_pipeline.py -- listing of the script written by tpot.export()

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# The data file must hold the outcome in a column named 'target'.
# 'PATH/TO/DATA/FILE' and 'COLUMN_SEPARATOR' are template placeholders that
# have to be replaced before this script can run.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop(columns='target')
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    tpot_data['target'],
    random_state=42,
)

# Average CV score on the training set was: 1.0
exported_pipeline = MLPClassifier(alpha=0.0001, learning_rate_init=0.001)
# Pin the estimator's seed whenever it exposes a `random_state` attribute
if hasattr(exported_pipeline, 'random_state'):
    exported_pipeline.random_state = 42

# Train on the training split, then predict labels for the held-out rows
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

### **Claude AI Solution**

Perfect on the first try!

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tpot import TPOTClassifier
import numpy as np

# Iris features and class labels
iris = load_iris()
X = iris.data
y = iris.target

# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TPOT search: 5 generations of 20 pipelines, 5-fold CV, scored by the
# one-vs-rest multiclass ROC AUC, using all available cores
tpot = TPOTClassifier(
    scoring='roc_auc_ovr',
    cv=5,
    generations=5,
    population_size=20,
    n_jobs=-1,
    random_state=42,
    verbosity=2,
)

# Run the pipeline search on the training split
tpot.fit(X_train, y_train)

# The winning pipeline, fitted by TPOT
best_model = tpot.fitted_pipeline_

# Class probabilities for the held-out rows
y_pred_proba = best_model.predict_proba(X_test)

# One-vs-rest multiclass AUC on the test split (same metric as the search)
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_proba, multi_class='ovr')

print(f"Best pipeline: {best_model}")
print(f"AUC score on test set: {auc_score:.4f}")

# Write the winning pipeline out as a standalone Python script
tpot.export('tpot_iris_pipeline.py')

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 1.0

Generation 2 - Current best internal CV score: 1.0

Generation 3 - Current best internal CV score: 1.0

Generation 4 - Current best internal CV score: 1.0

Generation 5 - Current best internal CV score: 1.0

Best pipeline: MLPClassifier(input_matrix, alpha=0.0001, learning_rate_init=0.001)
Best pipeline: Pipeline(steps=[('mlpclassifier', MLPClassifier(random_state=42))])
AUC score on test set: 1.0000
