In [1]:
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import os

### Data source: https://archive.ics.uci.edu/ml/datasets/Car+Evaluation

In [2]:

# Column names for the UCI Car Evaluation data (the raw file has no header row).
columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'evaluation']

# Read the comma-separated file directly into named columns.
# (Passing sep='\n' and splitting each line by hand is rejected by current
# pandas versions, which raise a ValueError for a newline separator.)
# dtype=str and keep_default_na=False keep every field as the raw string,
# exactly as a manual str.split(',') of each line would.
df = pd.read_csv(
    'data/car.data',
    header=None,
    names=columns,
    dtype=str,
    keep_default_na=False,
)

df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,evaluation
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Separate the independent variables (features) from the dependent variable (target).
feature_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
target_column = 'evaluation'

X = df[feature_columns]
y = df[target_column]

In [4]:
# Hold out a test set. No test_size is passed, so scikit-learn's default
# split proportions apply; the fixed random_state makes the partition repeatable.
split_parts = train_test_split(X, y, random_state=0)
train_X, test_X, train_y, test_y = split_parts

In [5]:
# Name of the MLflow experiment that groups the runs of this notebook.
experiment_name = 'SimpleClassification'

# Resolve the experiment ID: reuse the experiment if it already exists,
# otherwise create it. Looking it up first (instead of catching every
# exception raised by create_experiment) means unrelated failures, such as an
# unreachable tracking server, surface with their real error instead of being
# masked by an AttributeError on a None lookup result.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    exp_id = mlflow.create_experiment(name=experiment_name)
else:
    exp_id = existing_experiment.experiment_id

In [7]:
# Make sure the folder that receives the encoded datasets exists
# (no error if it is already there).
os.makedirs('data_processed', exist_ok=True)

# starting an mlflow run, and tracking it under the experiment defined above
with mlflow.start_run(experiment_id=exp_id, run_name='First_Classification_Model'):

    # adding tags to the run
    mlflow.set_tag('Description', 'Simple Classification Model')
    mlflow.set_tags({'ProblemType': 'Classification', 'ModelType': 'DecisionTree', 'ModelLibrary': 'Scikit-Learn'})

    # One-hot encode the categorical features. The encoder is fitted on the
    # training split only; categories unseen during fitting are ignored
    # (encoded as all zeros) when transforming the test split.
    encoder = OneHotEncoder(handle_unknown='ignore')

    X_encoded_train = encoder.fit_transform(train_X)
    train_x_encoded = pd.DataFrame(X_encoded_train.toarray())

    X_encoded_test = encoder.transform(test_X)
    test_x_encoded = pd.DataFrame(X_encoded_test.toarray())

    # saving a copy of the encoded data
    train_x_encoded.to_csv('data_processed/encoded_train.csv', sep='|', index=False)
    test_x_encoded.to_csv('data_processed/encoded_test.csv', sep='|', index=False)

    # logging artifacts -> uploads the contents of the folder to the run so
    # the exact encoded data can be retrieved later
    mlflow.log_artifacts('data_processed')

    # Decision tree hyperparameters. They are defined once and used both to
    # build the model and to log the run, so the tracked values always match
    # the model that was trained. (Previously the model was built with
    # max_features=19 while 15 was logged; 19 is kept so the trained model
    # is unchanged.)
    max_depth, max_features = 5, 19

    # initiating a decision tree model
    clf = DecisionTreeClassifier(
        random_state=0,
        max_depth=max_depth,
        max_features=max_features,
    )

    # fitting the model with the train dataset
    clf.fit(train_x_encoded, train_y)

    # Optional: a SHAP explanation of the model can be logged with
    # mlflow.shap.log_explanation(clf.predict_proba, train_x_encoded)

    # Evaluate on the held-out test set. score() runs the predictions itself,
    # so no separate predict() call is needed.
    accuracy = clf.score(test_x_encoded, test_y)

    # logging the hyperparameters used to build the model
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_features", max_features)

    # logging performance of the model
    mlflow.log_metric("accuracy", accuracy)

    # storing the fitted model as a run artifact
    mlflow.sklearn.log_model(clf, 'SimpleClassification_Model')

In [8]:
# Inspect the class labels stored on the fitted classifier.
clf.classes_

array(['acc', 'good', 'unacc', 'vgood'], dtype=object)