In [1]:
#Install prerequisite
!pip install pandas numpy sklearn matplotlib Minio

In [2]:
import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
%matplotlib inline 
import matplotlib.pyplot as plt

In [None]:
# Define S3 connection settings.
# The endpoint and credentials are read from the environment so that real
# secrets never have to be saved inside the notebook. The fallbacks are the
# values this notebook previously hard-coded; override them outside of a
# throw-away demo deployment.
import os

bucket_name = 'training-data'
data_path = 'metrics.csv'
S3_ENDPOINT = os.environ.get('S3_ENDPOINT', 'minio-service.kubeflow:9000')
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', 'minio')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', 'minio123')

In [None]:
# Get data from S3 Object Storage
from minio import Minio
import re

# The Minio client takes a bare "host:port", so any URL scheme is stripped
# from the endpoint. The scheme still decides whether TLS is used: an
# "https://" endpoint must not be silently downgraded to plain HTTP.
# An endpoint without a scheme keeps the previous behaviour (no TLS).
url = re.compile(r"https?://")
use_tls = S3_ENDPOINT.startswith('https://')
cos = Minio(url.sub('', S3_ENDPOINT),
            access_key=AWS_ACCESS_KEY_ID,
            secret_key=AWS_SECRET_ACCESS_KEY,
            secure=use_tls)

# Download the object to ./metrics.csv, which the next cell reads.
cos.fget_object(bucket_name, data_path, 'metrics.csv')

In [None]:
# Load the downloaded CSV (the generated simulator data) into a DataFrame
# and preview its first rows.
data = pd.read_csv('./metrics.csv', sep=",")
data.head(n=5)

In [None]:
# Convert dataset into array for the independent variables (features).
# 'Maintenance_Required' is deliberately NOT listed here: it is the label that
# the model is trained to predict (see the cell that builds `y`), so using it
# as an input would leak the answer into the features and make every
# evaluation metric below meaningless.
# NOTE(review): the saved model now expects these 9 columns, in this order;
# confirm any inference code feeds the same columns.
feature_columns = [
    'Temperature(celsius)',
    'Target_Temperature(celsius)',
    'Power',
    'PowerConsumption',
    'ContentType',
    'O2',
    'CO2',
    'Time_Door_Open',
    'Defrost_Cycle',
]
X = np.asarray(data[feature_columns])
X[0:5]

In [None]:
# Build the dependent (objective) variable as a NumPy array.
# NOTE(review): an earlier revision of this cell read the column under the
# spelling 'Maintainence_Required' - confirm the header used by the CSV.
y = np.asarray(data.loc[:, 'Maintenance_Required'])
y[:5]

In [None]:
# Normalize/standardize (mean = 0 and standard deviation = 1) 
# your features before applying machine learning techniques.
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

In [None]:
## split the dataset into train and test to estiamte model accuracy 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

In [None]:
## The target is binary, so a Logistic Regression classifier is used.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Configure the estimator first, then fit it on the training split.
LR = LogisticRegression(solver='liblinear', C=0.01)
LR.fit(X_train, y_train)
LR

In [None]:
## Class predictions of the trained LR model for the held-out test split
yhat = LR.predict(X=X_test)
yhat

In [None]:
# Per-class probability estimates for the test split (used for log loss below)
yhat_prob = LR.predict_proba(X=X_test)
yhat_prob

In [None]:
# Jaccard similarity between the true and the predicted test labels
from sklearn.metrics import jaccard_score
jaccard_score(y_true=y_test, y_pred=yhat)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix; rows are plotted on the 'True label' axis and
        columns on the 'Predicted label' axis.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        When True, every row is divided by its row sum before printing and
        plotting. Rows that sum to zero are shown as all zeros.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    The function only uses its arguments; it does not read any notebook
    globals, so the printed matrix always matches the plotted one.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guard against rows without any samples: a plain division would
        # produce NaN and a RuntimeWarning for them.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate every cell; switch the text colour at half of the maximum so
    # the number stays readable on both light and dark cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are included in the fit.
    plt.tight_layout()

In [None]:
# Confusion matrix of the test predictions, with label 1 listed first
label_order = [1, 0]
cnf_matrix = confusion_matrix(y_test, yhat, labels=label_order)
np.set_printoptions(precision=2)

# Draw the raw (non-normalized) counts on a fresh figure
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['MaintFlag=1', 'MaintFlag=0'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Text summary of the per-class metrics for the test predictions
report_text = classification_report(y_true=y_test, y_pred=yhat)
print(report_text)


In [None]:

# Log loss of the predicted class probabilities against the true test labels
from sklearn.metrics import log_loss
log_loss(y_true=y_test, y_pred=yhat_prob)


In [None]:
import pickle

# Serialize the trained model to a file called model_logistic_regression.pkl.
# The context manager guarantees the file is flushed and closed as soon as the
# dump finishes, instead of relying on garbage collection of an unnamed handle.
# NOTE(review): only the classifier is saved here; the standardization applied
# to the features is not stored with it, so inference code has to reproduce
# the same scaling - TODO confirm how the serving side handles this.
with open("model_logistic_regression.pkl", "wb") as model_file:
    pickle.dump(LR, model_file)

In [None]:
# Upload the pickled model to the bucket under the same object name
cos.fput_object(
    bucket_name=bucket_name,
    object_name='model_logistic_regression.pkl',
    file_path='model_logistic_regression.pkl',
)