In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,recall_score,accuracy_score,precision_score,confusion_matrix,classification_report

# Import the classifiers used below (Random Forest and k-nearest neighbours)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Import label encoder
from sklearn import preprocessing

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns
import pandas as pd
from urllib.parse import urlparse
import mlflow



In [2]:
# Load the vertebral-column data file: space-separated values with no header
# row, so pandas assigns integer column labels (0, 1, 2, ...).
df = pd.read_csv(
    'vertebral_column_data/column_3C.dat',
    sep=' ',
    header=None,
)

In [3]:
# Display the loaded DataFrame (bare last expression -> notebook rich repr).
df

Unnamed: 0,0,1,2,3,4,5,6
0,63.03,22.55,39.61,40.48,98.67,-0.25,DH
1,39.06,10.06,25.02,29.00,114.41,4.56,DH
2,68.83,22.22,50.09,46.61,105.99,-3.53,DH
3,69.30,24.65,44.31,44.64,101.87,11.21,DH
4,49.71,9.65,28.32,40.06,108.17,7.92,DH
...,...,...,...,...,...,...,...
305,47.90,13.62,36.00,34.29,117.45,-4.25,NO
306,53.94,20.72,29.22,33.22,114.37,-0.42,NO
307,61.45,22.69,46.17,38.75,125.67,-2.71,NO
308,45.25,8.69,41.58,36.56,118.55,0.21,NO


In [4]:
# Give the seven positional columns descriptive names: six numeric
# measurements followed by the class label ('target').
df.columns = [
    'pelvic_incidence',
    'pelvic_tilt',
    'lumbar_lordosis_angle',
    'sacral_slope',
    'pelvic_radius',
    'grade_of_spondylolisthesis',
    'target',
]

In [6]:
# Inspect the class-label column.
df['target']

0      DH
1      DH
2      DH
3      DH
4      DH
       ..
305    NO
306    NO
307    NO
308    NO
309    NO
Name: target, Length: 310, dtype: object

In [8]:
# Build an encoder that converts the string class labels into integer codes.
label_encoder = preprocessing.LabelEncoder()

# Replace the 'target' column with its integer-encoded version.
encoded_target = label_encoder.fit_transform(df['target'])
df['target'] = encoded_target

# Show which integer codes occur, in order of first appearance.
df['target'].unique()

array([0, 2, 1])

In [11]:
 # Preview the first six columns (positions 0-5), i.e. everything except the last column.
 df[df.columns[0:6]]

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,grade_of_spondylolisthesis
0,63.03,22.55,39.61,40.48,98.67,-0.25
1,39.06,10.06,25.02,29.00,114.41,4.56
2,68.83,22.22,50.09,46.61,105.99,-3.53
3,69.30,24.65,44.31,44.64,101.87,11.21
4,49.71,9.65,28.32,40.06,108.17,7.92
...,...,...,...,...,...,...
305,47.90,13.62,36.00,34.29,117.45,-4.25
306,53.94,20.72,29.22,33.22,114.37,-0.42
307,61.45,22.69,46.17,38.75,125.67,-2.71
308,45.25,8.69,41.58,36.56,118.55,0.21


In [12]:
# Feature matrix: the first six columns; label vector: the seventh column.
feature_columns = df.columns[0:6]
X = df[feature_columns]
y = df[df.columns[6]]

print(X.shape)
print(y.shape)

# Report how many samples carry each encoded class value.
for class_id in (0, 2, 1):
    class_count = int((y == class_id).sum())
    print("classe : %s, nb exemplaires: %s" % (class_id, class_count))

(310, 6)
(310,)
classe : 0, nb exemplaires: 60
classe : 2, nb exemplaires: 150
classe : 1, nb exemplaires: 100


In [13]:
# Split the dataset: 70% training, 30% test.
# - random_state pins the shuffle so that "Restart Kernel -> Run All" yields
#   the same split (and therefore comparable metrics) on every run.
# - stratify=y keeps the class proportions of the full dataset in both
#   subsets; the classes are imbalanced (60 / 100 / 150 samples).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [14]:
# Create a random forest classifier made of 100 trees.
# random_state fixes the forest's internal randomness so the fitted model
# (and the accuracy reported below) is reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training set.
clf.fit(X_train, y_train)

# Predict the class of every sample in the held-out test set.
y_pred = clf.predict(X_test)

In [15]:
# Accuracy = fraction of test samples whose predicted class equals the true class.
rf_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", rf_accuracy)

Accuracy: 0.8064516129032258


In [None]:
def accuracymeasures(y_test, predictions, avg_method, target_names=None):
    """Print an evaluation report and return the headline metrics.

    Prints the scikit-learn classification report, the confusion matrix and
    the four summary scores, then returns those scores.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    predictions : array-like
        Predicted class labels, aligned with ``y_test``.
    avg_method : str
        Averaging strategy forwarded as ``average=`` to ``precision_score``,
        ``recall_score`` and ``f1_score`` (e.g. ``'weighted'``).
    target_names : sequence, optional
        Display names for the classes in the classification report, given in
        the order of the sorted class labels. When omitted, the report uses
        the class labels themselves as names.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1score)``.
    """
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average=avg_method)
    recall = recall_score(y_test, predictions, average=avg_method)
    f1score = f1_score(y_test, predictions, average=avg_method)

    # classification_report pairs target_names positionally with the sorted
    # labels and expects string display names. The previous hard-coded
    # [0, 2, 1] therefore swapped the names of classes 1 and 2 and supplied
    # ints where strings are required. Default to the report's own label
    # names, and coerce any caller-supplied names to str.
    if target_names is not None:
        target_names = [str(name) for name in target_names]

    print("Classification report")
    print("---------------------","\n")
    print(classification_report(y_test, predictions,target_names=target_names),"\n")
    print("Confusion Matrix")
    print("---------------------","\n")
    print(confusion_matrix(y_test, predictions),"\n")

    print("Accuracy Measures")
    print("---------------------","\n")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1score)

    return accuracy,precision,recall,f1score

In [None]:
# Hyperparameters for the tracked run. Only n_neighbors is consumed by the
# k-NN model; max_depth and n_estimators are kept for the random-forest variant.
n_neighbors = 5
max_depth = 5
n_estimators = 5

# Point MLflow at the tracking server and select the experiment to log into.
remote_server_uri = 'http://localhost:1234'
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment('TP_MLOps')

# Train a k-NN classifier inside an MLflow run, then log its hyperparameter,
# its evaluation metrics and the fitted model.
with mlflow.start_run(run_name='knn') as mlops_run:
    #model = RandomForestClassifier(max_depth=max_depth,n_estimators=n_estimators)
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy, precision, recall, f1score = accuracymeasures(y_test, y_pred, 'weighted')

    #mlflow.log_param("max_depth",max_depth)
    #mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("n_neighbors", n_neighbors)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1score)

    # Whether the model can be registered depends on the *tracking* store,
    # so inspect the tracking URI (not the artifact URI).
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_url_type_store != "file":
        # Non-file tracking store: log the model and register it under a name.
        mlflow.sklearn.log_model(
            model,
            "model",
            registered_model_name='knn_model')
    else:
        # File-based store: still log the model, but skip registration.
        # (Previously this branch called load_model with an estimator
        # instead of a model URI, so nothing was logged.)
        mlflow.sklearn.log_model(model, "model")