## Task 2 - Train models and record the best one

In [None]:
#DEPENDENCIES----------------------------------------
#Scikit-learn machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
#Scikit-learn data split
from sklearn.model_selection import train_test_split
from sklearn import metrics
#To serialize models
import pickle as pkl

#Context
#Workspace handle; it is passed as `workspace=ws` when the model is registered further down
from azureml.core import Workspace
ws = Workspace.from_config()
from azureml.core.model import Model

#Get the data variable stored at the end of the data preparation file
#NOTE: %store is an IPython magic, so this cell only runs inside IPython/Jupyter, not as a plain Python script
%store -r data
#Preview of the first 5 rows, as a quick check that the variable was restored
data.head(5)
#----------------------------------------------------

In [None]:
#TRAINING AND EVALUATING SOME MODELS-----------------
#Only rows with a known death status (0 or 1) are kept: 'Unknown' and 'Missing' are left out
#Ethnicity is not used as a feature: it adds a lot of complexity for only 80k lines
labelled = data.query('death_yn in [0, 1]')
X = labelled[['current_status', 'sex', 'age_group', 'hosp_yn','icu_yn','medcond_yn']]
y = labelled['death_yn']

#80% of the rows for training, 20% for testing; the split itself is seeded
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

#Dimensions sanity check
print('> X_train dimensions:', X_train.shape, '\r\n> X_test dimensions', X_test.shape, '\r\n> y_train dimensions', y_train.shape, '\r\n> y_test dimensions', y_test.shape)

#Algorithms classes instantiation
#The variable names are kept as they are because the following cells refer to them
#lbfgs is the default solver but is pinned explicitly so it stays the same;
#max_iter is raised for convergence purposes
log_reg = LogisticRegression(solver='lbfgs', max_iter=1500)
svc = SVC()
lin_svc = LinearSVC()
rfc = RandomForestClassifier()
knn = KNeighborsClassifier()
gnb = GaussianNB()
perceptron = Perceptron()
sdg = SGDClassifier()
dtc = DecisionTreeClassifier()

#Report header paired with its classifier, in the order they are fitted and scored
reports = [
    ('> Classification report Logistic regression\r\n', log_reg),
    ('> Classification report Support Vector Machine (optim 1)\r\n', svc),
    ('> Classification report Support Vector Machine (optim 2)\r\n', lin_svc),
    ('> Classification report Random Forest\r\n', rfc),
    ('> Classification report k-Nearest Neighbours\r\n', knn),
    ('> Classification report Gaussian Naïve Bayes\r\n', gnb),
    ('> Classification report Perceptron\r\n', perceptron),
    ('> Classification report Mixed Linear Models\r\n', sdg),
    ('> Classification report Decision Tree\r\n', dtc),
]

#Algorithms fitting/learning: every model is trained before any of them is scored
for _, classifier in reports:
    classifier.fit(X_train, y_train)

#Make predictions on the test dataset and score each model
#Note: for a real case scenario, the .predict_proba() method is planned instead,
#in order to get an intuition about the level of risk for a given patient
for header, classifier in reports:
    y_pred = classifier.predict(X_test)
    print(header, metrics.classification_report(y_test, y_pred), '\r\n')

#----------------------------------------------------

In [None]:
#FUNCTIONAL CONSIDERATIONS---------------------------
#Verify our data are balanced
#death_yn is restricted to the values 0 and 1 here, so its mean is the share of rows equal to 1
print('> Data balance', data.query('death_yn in [0, 1]')['death_yn'].mean(), '\r\n')

#Plot the ROC curve
#metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
#RocCurveDisplay.from_estimator is its replacement. Use it when available and
#fall back to the legacy function on older scikit-learn versions.
plot_roc = getattr(metrics.RocCurveDisplay, 'from_estimator', None)
if plot_roc is None:
    plot_roc = metrics.plot_roc_curve

#One figure per fitted model, in the same order as the classification reports
for fitted_model in (log_reg, svc, lin_svc, rfc, knn, gnb, perceptron, sdg, dtc):
    plot_roc(fitted_model, X_test, y_test)

#Precision: proportion of the selected elements that are relevant
#Recall: proportion of the relevant elements that have been selected
#----------------------------------------------------

In [None]:
#SAVE THE BEST MODEL---------------------------------
#Save a serialized version of the selected model (the decision tree)
#The with-statement closes the file handle as soon as the dump is done, so the
#pickle is fully written to disk before anything else reads the file
with open(r'./outputs/models/DecisionTreeClassifier.pkl', 'wb') as model_file:
    pkl.dump(dtc, model_file)
#----------------------------------------------------

### Register your model

[From Microsoft documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py): Registering a model creates a logical container for the one or more files that make up your model. In addition to the content of the model file itself, a registered model also stores model metadata, including model description, tags, and framework information, that is useful when managing and deploying the model in your workspace. For example, with tags you can categorize your models and apply filters when listing models in your workspace. After registration, you can then download or deploy the registered model and receive all the files and metadata that were registered.

In [None]:
#REGISTER YOUR MODEL---------------------------------
#Everything handed to Model.register: the pickled file, its name and metadata, and the target workspace
registration = {
    'model_path': r'./outputs/models/DecisionTreeClassifier.pkl',
    'model_name': "DecisionTreeClassifier",
    'tags': {'area': 'covid19', 'type': 'classification'},
    'description': 'Survival to Covid-19 classification and estimation',
    'workspace': ws,
}
model = Model.register(**registration)
#---------------------------------------------------