# Training a Model Using a Notebook

#### Parameters
Set any parameters you want to pass from the CLI (or using AML Pipelines) in a [cell tagged "parameters"](https://papermill.readthedocs.io/en/latest/usage-parameterize.html). Here, we're making the variable `DATA_PATH` settable at runtime, with a default of `../../../sample-data/`.

In [None]:
# Seed passed to the train/test split so repeated runs produce the same partition
RANDOM_SEED = 42
# Directory that contains german_credit_data.csv
DATA_PATH = '../../../sample-data/'

#### Imports

In [None]:
import joblib
import os

import pandas as pd
from azureml.core import Run

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

#### Get Azure ML Run Context

In [None]:
# Resolve the Azure ML run this notebook is executing under; the metrics
# computed further down are logged against this object.
run = Run.get_context()

#### Get Data and extract information for featurization

In [None]:
# Load the credit data set and discard the "Sno" column in one chained
# expression, so the frame is never mutated in place.
csv_path = os.path.join(DATA_PATH, 'german_credit_data.csv')
credit_data_df = pd.read_csv(csv_path).drop(columns=["Sno"])

# Separate the target column ("Risk") from the feature columns
X_raw = credit_data_df.drop(columns=['Risk'])
y_raw = credit_data_df['Risk']

In [None]:
# Find categorical and numeric features.
# Object-typed columns are treated as categorical.
categorical_features = X_raw.select_dtypes(include=['object']).columns
# 'number' selects every numeric dtype rather than only int64 and float.
# A numeric column stored with another width (for example int32) would
# otherwise match neither list and be silently discarded by the
# ColumnTransformer below, which is configured with remainder="drop".
numeric_features = X_raw.select_dtypes(include='number').columns

#### Create SKLearn Pipeline Steps

In [None]:
# Create categorical transformer pipeline steps.
# The keyword that requests a dense one-hot matrix was renamed from `sparse`
# to `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4,
# so use whichever name the installed OneHotEncoder actually accepts.
dense_output_kwarg = 'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse'

categorical_transformer = Pipeline(steps=[
    # Missing categorical values become their own explicit "missing" category
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as all zeros instead of raising when the test split or new data is
    # transformed
    ('onehotencoder', OneHotEncoder(categories='auto', handle_unknown='ignore',
                                    **{dense_output_kwarg: False}))])

# Create numeric scaler
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Combine pipeline steps into a feature engineering step.
# Columns that are in neither feature list are dropped (remainder="drop").
feature_engineering_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ], remainder="drop")

In [None]:
# Convert the raw target labels into integer class ids
le = LabelEncoder()
encoded_y = le.fit_transform(y_raw)

# Chain feature engineering and the classifier into a single estimator so
# both are fitted and applied together
lr_clf = Pipeline(steps=[
    ('preprocessor', feature_engineering_pipeline),
    ('classifier', LogisticRegression(solver="lbfgs")),
])

#### Split Data and Train Model

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# encoded label and seeded with RANDOM_SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw,
    encoded_y,
    test_size=0.20,
    stratify=encoded_y,
    random_state=RANDOM_SEED,
)

# Fit the preprocessing steps and the classifier on the training partition
lr_clf.fit(X_train, y_train)

#### Calculate and log metrics to Azure ML

In [None]:
# Score the fitted pipeline on both partitions
train_acc = lr_clf.score(X_train, y_train)
test_acc = lr_clf.score(X_test, y_test)

# Print a readable summary in the cell output
for template, accuracy in (("Training accuracy: %.3f", train_acc),
                           ("Test data accuracy: %.3f", test_acc)):
    print(template % accuracy)

# Record the same values on the Azure ML run
for metric_name, accuracy in (('Train accuracy', train_acc),
                              ('Test accuracy', test_acc)):
    run.log(metric_name, accuracy)

#### Save Model to `./outputs` folder

In [None]:
# Files written to the "outputs" directory are uploaded to Azure ML automatically
output_dir = './outputs/'
os.makedirs(output_dir, exist_ok=True)

# Serialize the whole fitted pipeline (preprocessing + classifier) as one file
model_path = os.path.join(output_dir, 'model.pkl')
joblib.dump(lr_clf, model_path)
