In [12]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool
from modelop.monitors.performance import ModelEvaluator

In [3]:
# Load the Iris dataset from a CSV file located next to the notebook.
data = pd.read_csv("./Iris.csv")

In [4]:
# Preview the first rows to check the column names and value formats.
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# One-hot encode the species label: one indicator column per distinct value.
species_column = data.loc[:, "Species"]
species_dummy = pd.get_dummies(species_column)
species_dummy.head()

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [6]:
# Append the one-hot species columns to the raw frame, column-wise.
# pd.concat builds a new DataFrame, so `data` is left untouched and no
# defensive copy is needed beforehand.
assigned_data = pd.concat([data, species_dummy], axis=1)
assigned_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Iris-setosa,Iris-versicolor,Iris-virginica
0,1,5.1,3.5,1.4,0.2,Iris-setosa,1,0,0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,1,0,0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,1,0,0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,1,0,0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,1,0,0


In [7]:
# Label column(s): the single string-valued "Species" column is used. The
# one-hot columns ("Iris-setosa", "Iris-versicolor", "Iris-virginica") are
# an alternative that is currently not selected.
target = ["Species"]

# Measurement columns used as model inputs.
features = [
    "SepalLengthCm",
    "SepalWidthCm",
    "PetalLengthCm",
    "PetalWidthCm",
]

In [8]:
# Separate the model inputs from the labels. Both are independent copies, so
# later edits to X or y do not write back into assigned_data.
# `target` is a list, so y is a one-column DataFrame rather than a Series.
X = assigned_data.loc[:, features].copy()
y = assigned_data.loc[:, target].copy()

In [37]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Small decision tree: capped at three leaves, seeded for repeatable fits.
tree_params = {"max_leaf_nodes": 3, "random_state": 42}
iris_classifier = DecisionTreeClassifier(**tree_params)
iris_classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=3,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [10]:
# Accuracy of the decision tree on the held-out test split.
test_predictions = iris_classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=test_predictions)

0.9666666666666667

In [11]:
# Persist the fitted decision tree to disk as a pickle file.
with open("iris_tree_classifier.pkl", mode="wb") as model_file:
    pickle.dump(iris_classifier, model_file)

In [12]:
# Baseline set: training features plus the true label and the decision tree's
# prediction. `assign` returns a new frame, so X_train itself is not modified.
baseline_data = X_train.assign(
    Species=y_train,
    predicted_species=iris_classifier.predict(X_train),
)

# JSON Lines output: one record per row.
baseline_data.to_json("baseline_data.json", orient="records", lines=True)

In [13]:
# Sample set: test features plus the true label and the decision tree's
# prediction. `assign` returns a new frame, so X_test itself is not modified.
sample_data = X_test.assign(
    Species=y_test,
    predicted_species=iris_classifier.predict(X_test),
)

# JSON Lines output: one record per row.
sample_data.to_json("sample_data.json", orient="records", lines=True)

# CatBoost

In [10]:
# Intentionally small CatBoost multi-class model: two boosting iterations of
# depth-2 trees with a learning rate of 1. verbose=True prints the per-iteration
# training loss when the model is fitted.
model = CatBoostClassifier(
    iterations=2,
    depth=2,
    learning_rate=1,
    loss_function='MultiClass',
    verbose=True,
)

In [11]:
# Train the CatBoost model on the training split.
model.fit(X_train, y_train)

0:	learn: 0.2937162	total: 50.6ms	remaining: 50.6ms
1:	learn: 0.2127711	total: 51.4ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x13014e1f0>

In [27]:
# Spot-check: predict the class for the single test row whose index label is 73.
single_row = X_test.loc[73, :]
model.predict(single_row)[0]

'Iris-versicolor'

In [38]:
# Store the CatBoost class prediction for every row of both splits.
#
# Predictions are made in one batch per frame and only on the `features`
# columns. The previous row-by-row loop passed the whole row to the model, so
# once the "CatBoostPred" column had been created it was fed back in as an
# extra input, and the cell could not be re-run safely.
for frame in (X_test, X_train):
    # The batch prediction is flattened with ravel so it can be stored as a
    # single column (this assumes one predicted label per row).
    frame.loc[:, "CatBoostPred"] = np.ravel(model.predict(frame[features]))

In [39]:
# Attach the true labels next to the predictions in each split.
# NOTE: this mutates X_test / X_train in place.
for frame, labels in ((X_test, y_test), (X_train, y_train)):
    frame.loc[:, "Species"] = labels

In [40]:
# Build a ModelEvaluator that compares the "CatBoostPred" column against the
# "Species" column of the given frame.
# NOTE(review): the frame passed here is X_train, so any metrics are computed
# on the training split; pass X_test for held-out performance — confirm intent.
performance_monitor = ModelEvaluator(
    dataframe=X_train,
    score_column="CatBoostPred",
    label_column="Species"
)

In [41]:
# Run the evaluator's predefined "classification_metrics" test and display the result.
performance_monitor.evaluate_performance(pre_defined_metrics = "classification_metrics")

{'accuracy': 0.95,
 'precision': 0.9506,
 'recall': 0.95,
 'f1_score': 0.9499,
 'confusion_matrix': [{'Iris-setosa': 0.3333,
   'Iris-versicolor': 0.0,
   'Iris-virginica': 0.0},
  {'Iris-setosa': 0.0, 'Iris-versicolor': 0.325, 'Iris-virginica': 0.0167},
  {'Iris-setosa': 0.0, 'Iris-versicolor': 0.0333, 'Iris-virginica': 0.2917}],
 'performance': [{'test_name': 'Classification Metrics',
   'test_category': 'performance',
   'test_type': 'classification_metrics',
   'test_id': 'performance_classification_metrics',
   'values': {'accuracy': 0.95,
    'precision': 0.9506,
    'recall': 0.95,
    'f1_score': 0.9499,
    'confusion_matrix': [{'Iris-setosa': 0.3333,
      'Iris-versicolor': 0.0,
      'Iris-virginica': 0.0},
     {'Iris-setosa': 0.0, 'Iris-versicolor': 0.325, 'Iris-virginica': 0.0167},
     {'Iris-setosa': 0.0,
      'Iris-versicolor': 0.0333,
      'Iris-virginica': 0.2917}]}}]}