# **Experiment Tracking using MLFlow**

## GENDER PREDICTION

The gender dataset consists of 5001 rows (see the `df.info()` output below), with 7 features and one class label.

* long_hair : 1 if the person has long hair, 0 otherwise
* forehead_width_cm : forehead width in cm
* forehead_height_cm : forehead height in cm
* nose_long : 1 if the person has a long nose, 0 otherwise
* nose_wide : 1 if the person has a wide nose, 0 otherwise
* lips_thin : 1 if the person has thin lips, 0 otherwise
* distance_nose_to_lip_long : 1 if the distance between the nose and the lips is long, 0 otherwise
* gender (target column) : the class label (`Male` or `Female`). We will use the other 7 features of the dataset to make inferences and predictions regarding the gender of any given individual.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,plot_confusion_matrix

In [2]:
# Load the gender-classification CSV from the local data folder
csv_path = r"data//gender_classification_v7.csv"
df = pd.read_csv(csv_path)

# Preview the first rows
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB


In [4]:
# Count how many rows belong to each class of the target column
df.gender.value_counts()

Female    2501
Male      2500
Name: gender, dtype: int64

In [5]:
# Pairwise feature plots, coloured by the target class
sns.pairplot(data=df, hue="gender")

In [None]:
# Separate the target label from the predictor columns
y = df['gender']
x = df.drop(columns='gender')

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Standardise the feature columns, fitting the scaler on the training split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# The scaler hands back a NumPy ndarray, so the column names and row index
# are re-attached by wrapping the result in a DataFrame again
x_train_scaled = scaler.fit_transform(x_train)
X_train = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)

X_train.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
235,0.389541,1.743269,-1.005723,1.006521,0.982159,1.013086,0.9995
2697,0.389541,-1.249662,1.018605,-0.993521,-1.018165,-0.987083,-1.0005
1219,0.389541,0.020066,-0.453634,-0.993521,0.982159,1.013086,0.9995
135,0.389541,-1.158967,-1.189753,-0.993521,-1.018165,-0.987083,-1.0005
3613,0.389541,-0.433408,-0.085574,1.006521,0.982159,1.013086,0.9995


In [None]:
# Scale the test split with the existing scaler (transform only, no refit),
# then restore the column names and row index
x_test_scaled = scaler.transform(x_test)
X_test = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)

X_test.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
294,-2.567125,0.65493,0.282486,1.006521,0.982159,1.013086,0.9995
1379,0.389541,-1.158967,0.650546,-0.993521,0.982159,-0.987083,-1.0005
3667,0.389541,0.382846,0.834575,1.006521,-1.018165,-0.987083,-1.0005
504,-2.567125,1.924658,-0.453634,1.006521,0.982159,1.013086,0.9995
1359,-2.567125,-1.158967,0.466516,-0.993521,-1.018165,-0.987083,-1.0005


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn import metrics
from pickle import dump



In [None]:
# KNN baseline with default hyperparameters.
# KNeighborsClassifier and metrics come from the imports cell above,
# so they are not re-imported here.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)

y_test_pred = knn_classifier.predict(X_test)

# Accuracy of the class predictions on the test split.
# Built-in round() is used rather than the .round() method so the cell works
# whether accuracy_score returns a NumPy scalar or a plain Python float.
print("Accuracy :", round(metrics.accuracy_score(y_test, y_test_pred), 4))

Accuracy : 0.963


In [None]:
# Logistic Regression baseline with default hyperparameters.
# LogisticRegression and metrics come from the imports cell above,
# so they are not re-imported here.
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)

y_test_pred = lr_classifier.predict(X_test)

# Accuracy of the class predictions on the test split.
# Built-in round() is used rather than the .round() method so the cell works
# whether accuracy_score returns a NumPy scalar or a plain Python float.
print("Accuracy :", round(metrics.accuracy_score(y_test, y_test_pred), 4))

Accuracy : 0.965


In [None]:
# Gaussian Naive Bayes baseline with default hyperparameters.
# GaussianNB and metrics come from the imports cell above,
# so they are not re-imported here.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

y_test_pred = nb_classifier.predict(X_test)

# Accuracy of the class predictions on the test split.
# Built-in round() is used rather than the .round() method so the cell works
# whether accuracy_score returns a NumPy scalar or a plain Python float.
print("Accuracy :", round(metrics.accuracy_score(y_test, y_test_pred), 4))

Accuracy : 0.968


In [None]:
# Decision Tree baseline, limited to a depth of 3.
# DecisionTreeClassifier and metrics come from the imports cell above,
# so they are not re-imported here.
dt_classifier = DecisionTreeClassifier(max_depth=3)
dt_classifier.fit(X_train, y_train)

y_test_pred = dt_classifier.predict(X_test)

# Accuracy of the class predictions on the test split.
# Built-in round() is used rather than the .round() method so the cell works
# whether accuracy_score returns a NumPy scalar or a plain Python float.
print("Accuracy :", round(metrics.accuracy_score(y_test, y_test_pred), 4))

Accuracy : 0.961


In [None]:
# Support Vector Classifier baseline with default hyperparameters.
# SVC and metrics come from the imports cell above,
# so they are not re-imported here.
sv_classifier = SVC()
sv_classifier.fit(X_train, y_train)

y_test_pred = sv_classifier.predict(X_test)

# Accuracy of the class predictions on the test split.
# Built-in round() is used rather than the .round() method so the cell works
# whether accuracy_score returns a NumPy scalar or a plain Python float.
print("Accuracy :", round(metrics.accuracy_score(y_test, y_test_pred), 4))

Accuracy : 0.969


## **Saving the Model (Serialization)**

In [None]:

import os
from pickle import dump

# Create the output folder up front so the writes below cannot fail on a
# fresh checkout where it does not exist yet
os.makedirs("models", exist_ok=True)

# Fitted objects to serialise, keyed by destination path
fitted_objects = {
    'models/standard_scaler.pkl': scaler,
    'models/knn_model.pkl': knn_classifier,
    'models/lr_model.pkl': lr_classifier,
    'models/nb_model.pkl': nb_classifier,
    'models/dt_model.pkl': dt_classifier,
    'models/sv_model.pkl': sv_classifier,
}

# A context manager guarantees each file handle is flushed and closed,
# even if pickling one of the objects raises
for pkl_path, fitted in fitted_objects.items():
    with open(pkl_path, 'wb') as pkl_file:
        dump(fitted, pkl_file)

In [None]:
import mlflow

# Send all runs to the MLflow tracking server on localhost:5000.
# Only one tracking URI can be active at a time: an earlier
# set_tracking_uri("sqlite:///mlflow.db") call was dropped because this call
# replaced it immediately, so it never had any effect.
# NOTE(review): presumably the server is launched with the SQLite file as its
# backend store - confirm against how the server is started.
mlflow.set_tracking_uri("http://localhost:5000")

# Select the experiment to log under (the captured cell output shows it being
# created on first use)
mlflow.set_experiment("Gender Prediction")

2023/05/10 11:41:44 INFO mlflow.tracking.fluent: Experiment with name 'Gender Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/385426064880880178', creation_time=1683699104176, experiment_id='385426064880880178', last_update_time=1683699104176, lifecycle_stage='active', name='Gender Prediction', tags={}>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn import metrics

In [None]:
import os
from pickle import dump

# Make sure the output folder exists before writing into it
os.makedirs("models", exist_ok=True)

# Persist the fitted scaler; the context manager closes the file handle
# deterministically instead of leaving it to garbage collection
with open('models/standard_scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

## **EXPERIMENT TRACKING**

In [None]:
# KNN model
with mlflow.start_run(run_name="KNN"):
    # Tags identifying the developer and the algorithm for this run
    mlflow.set_tag("dev", "APM")
    mlflow.set_tag("algo", "KNN")

    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path", "data/gender_classification_v7.csv")
    k = 53
    mlflow.log_param("n_neighbors", k)

    # Build the model from the same k that was just logged, so the tracked
    # hyperparameter always describes the model stored with the run
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(X_train, y_train)

    # Score on the test split and record the result
    y_test_pred = knn_classifier.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)

    # Store the fitted model and the pickled scaler as run artifacts
    mlflow.sklearn.log_model(knn_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")

In [None]:
# Logistic Regression: one tracked MLflow run
with mlflow.start_run(run_name="LogisticRegression"):
    # Tags identifying the developer and the algorithm
    mlflow.set_tags({"dev": "APM", "algo": "Logit_Regn"})

    # Record the dataset path and the C hyperparameter together
    C = 0.1
    mlflow.log_params({"data-path": "data/gender_classification_v7.csv", "C": C})

    # Train, predict on the test split, and log the accuracy
    lr_classifier = LogisticRegression(C=C).fit(X_train, y_train)
    y_test_pred = lr_classifier.predict(X_test)
    acc = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)
    mlflow.log_metric("accuracy", acc)

    # Attach the fitted model and the pickled scaler to the run
    mlflow.sklearn.log_model(lr_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")

In [None]:
# Gaussian Naive Bayes: one tracked MLflow run
with mlflow.start_run(run_name="NaiveBayes"):
    # Tags identifying the developer and the algorithm
    mlflow.set_tags({"dev": "APM", "algo": "GaussianNB"})

    # This run has no tuned hyperparameter; only the dataset path is recorded
    mlflow.log_param("data-path", "data/gender_classification_v7.csv")

    # Train, predict on the test split, and log the accuracy
    nb_classifier = GaussianNB().fit(X_train, y_train)
    y_test_pred = nb_classifier.predict(X_test)
    acc = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)
    mlflow.log_metric("accuracy", acc)

    # Attach the fitted model and the pickled scaler to the run
    mlflow.sklearn.log_model(nb_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")

In [None]:
# Decision Tree: one tracked MLflow run
with mlflow.start_run(run_name="DecisionTree"):
    # Tags identifying the developer and the algorithm
    mlflow.set_tags({"dev": "APM", "algo": "DecisionTree"})

    # Record the dataset path and the tree depth limit together
    depth = 3
    mlflow.log_params({"data-path": "data/gender_classification_v7.csv", "max_depth": depth})

    # Train, predict on the test split, and log the accuracy
    dt_classifier = DecisionTreeClassifier(max_depth=depth).fit(X_train, y_train)
    y_test_pred = dt_classifier.predict(X_test)
    acc = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)
    mlflow.log_metric("accuracy", acc)

    # Attach the fitted model and the pickled scaler to the run
    mlflow.sklearn.log_model(dt_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")

In [None]:
# Support Vector Classifier: one tracked MLflow run
with mlflow.start_run(run_name="SVM"):
    # Tags identifying the developer and the algorithm
    mlflow.set_tags({"dev": "APM", "algo": "SVM"})

    # Record the dataset path and the C hyperparameter together
    C = 0.1
    mlflow.log_params({"data-path": "data/gender_classification_v7.csv", "C": C})

    # Train, predict on the test split, and log the accuracy
    sv_classifier = SVC(C=C).fit(X_train, y_train)
    y_test_pred = sv_classifier.predict(X_test)
    acc = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)
    mlflow.log_metric("accuracy", acc)

    # Attach the fitted model and the pickled scaler to the run
    mlflow.sklearn.log_model(sv_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")