In [None]:
import pandas as pd
import logging
import sys
import os
import time

import azure.ai.ml
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml import command, Input
from azure.ai.ml.entities import (
    AzureBlobDatastore,
    AzureFileDatastore,
    AzureDataLakeGen1Datastore,
    AzureDataLakeGen2Datastore,
)
from azureml.core import Environment
import io
from tqdm import tqdm
import pandas as pd
import numpy as np
import _pickle as cPickle
import matplotlib.pyplot as plt
import os
import time
import keras
import mlflow
import joblib
import json
from sys import path

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from scikeras.wrappers import KerasRegressor

import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from mlflow.models.signature import infer_signature

In [None]:
# Pull the Azure ML workspace coordinates out of the local JSON config file.
with open("config.json", "r") as f:
    config = json.load(f)

# One lookup per required key, unpacked in the same order as the key tuple.
subscription_id, resource_group, workspace = (
    config[key]
    for key in ("azure_ml_subscription_ID", "resource_group", "workspace")
)

In [None]:
# Get a handle to the workspace: authenticate with DefaultAzureCredential and
# bind the client to the subscription / resource group / workspace set above.
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)


In [None]:
# Load the training CSV into `df`, the module-level frame that main() reads.
df = pd.read_csv(filepath_or_buffer="Data/DL_data_next_1_station_2016_dm.csv")

In [None]:
# Keep the column labels of `df` as a plain Python list, in their original order.
column_headers = [header for header in df.columns]

In [None]:
def buildPreprocessorPipeline(df):
    """Assemble the (unfitted) column preprocessor used in front of the regressor.

    Numeric columns go through ``StandardScaler``; categorical columns are
    one-hot encoded with ``handle_unknown="ignore"``. Any column that is not
    listed below is dropped (``remainder="drop"``).

    Args:
        df: Accepted for interface compatibility only; it is not read here.
            Columns are selected purely by the names listed below.

    Returns:
        A ``ColumnTransformer`` with two branches, ``"num"`` and ``"cat"``.

    Column reference kept from the original author's notes
    (position in the CSV, position in the feature frame -> meaning):

        63,  0 -> cumulative arr delay
        62,  1 -> cumulative dep delay
        18,  2 -> day
        58,  3 -> deviation from arrival
        59,  4 -> deviation from departure
        34,  5 -> direction
        35,  6 -> dwell time curr station
        39,  7 -> dwell time next station average
        31,  8 -> (is destination?)
        30,  9 -> (is origin?)
        19, 10 -> month
        27, 11 -> number of stops
        33, 12 -> order of departure
        32, 13 -> order of journey
        28, 14 -> origin departure period
        56, 15 -> travel time next station average
        46, 16 -> travel time prev station
    """
    # Continuous inputs: standardised.
    scaled_columns = [
        'cumulative_arrival_delay',
        'cumulative_departure_delay',
        'deviation_from_arrival',
        'deviation_from_departure',
        'dwell_time_curr_station',
        'dwell_time_next_1_station_average',
        'number_stops',
        'order_of_departure',
        'order_of_journey',
        'origin_departure_period',
        'travel_time_next_1_station_average',
        'travel_time_prev_station',
    ]

    # Discrete inputs: one-hot encoded.
    one_hot_columns = [
        'day',
        'month',
        'direction',
        'is_destination',
        'is_origin',
    ]

    # The encoder lives in its own single-step pipeline, registered as "encoder".
    one_hot_branch = Pipeline(
        steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
    )

    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), scaled_columns),
            ("cat", one_hot_branch, one_hot_columns),
        ],
        remainder="drop",
    )

In [None]:
def create_model():
    """Build and compile the feed-forward network used as the regressor.

    Architecture: three fully connected ReLU layers of 28 units each (the
    first declares an input width of 35), followed by one linear output unit.
    The model is compiled with MSE loss and the Adam optimizer, reporting MSE
    and MAE as metrics.

    NOTE(review): the input width of 35 presumably has to equal the number of
    columns emitted by the preprocessing step -- confirm against the data.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    layer_stack = (
        Dense(28, input_dim=35, activation='relu'),
        Dense(28, activation='relu'),
        Dense(28, activation='relu'),
        Dense(1, activation='linear'),
    )
    for layer in layer_stack:
        network.add(layer)
    network.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])
    return network

In [None]:
def model_train(df, random_state=None):
    """Fit the preprocessing + Keras regression pipeline and return it.

    The frame is split 80/20 into train and test sets, the pipeline is fitted
    on the training part, and RMSE and R2 on the test part are printed and
    logged to MLflow. A predicted-vs-actual scatter plot is shown at the end.

    Args:
        df: Source frame. Features and target are taken by column *position*
            (see ``feature_positions`` / ``target_position`` below), so the
            column order of the input must match the file this was written for.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. The default ``None`` keeps the previous
            unseeded behaviour. It does not seed the network's weights.

    Returns:
        The fitted ``Pipeline`` (steps ``'preprocessor'`` and ``'regressor'``).
        Previously nothing was returned, so callers received ``None``.
    """
    # Positional indices of the 17 input columns and of the target column.
    feature_positions = [63, 62, 18, 58, 59, 34, 35, 39, 31, 30, 19, 27, 33, 32, 28, 56, 46]
    target_position = 60

    X_raw = df.iloc[:, feature_positions]
    Y_raw = df.iloc[:, target_position]
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_raw, Y_raw, test_size=0.2, random_state=random_state
    )
    preprocessor = buildPreprocessorPipeline(X_train)

    # estimator instance; a further 20% of the training data is held out by
    # Keras for validation during fitting.
    regressor = KerasRegressor(
        build_fn=create_model,
        epochs=100,
        batch_size=64,
        validation_split=0.2,
        verbose=1,
    )
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', regressor)])
    model = clf.fit(X_train, Y_train)

    # Calculate RMSE
    y_pred = model.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    print("RMSE: %f" % (rmse))
    mlflow.log_metric("rmse", rmse)

    # Calculate R2
    r2 = r2_score(Y_test, y_pred)
    print("R2: %f" % (r2))
    mlflow.log_metric("r2", r2)

    # Plot the predictions vs actual
    plt.figure(figsize=(10, 10))
    plt.scatter(Y_test, y_pred)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title('Predictions vs Actual')
    plt.show()

    # Hand the fitted pipeline back so the caller can persist or reuse it.
    return model

In [None]:
def main():
    """Run one MLflow-tracked training session and pickle the result.

    Selects the experiment, opens a run, enables sklearn autologging, logs the
    size of the module-level ``df`` (rows as a metric, columns as a param),
    trains via ``model_train(df)``, and writes whatever ``model_train`` returns
    to ``outputs/dnn_model.pkl`` with joblib.

    The run is opened with a ``with`` block so that it is always closed, even
    when training raises. Otherwise the run would stay active and a re-run of
    this cell would fail because a run is already in progress.
    """
    experiment_name = "data_next_1_station_local_compute"
    mlflow.set_experiment(experiment_name)

    # start logging; leaving the block stops logging, including on errors
    with mlflow.start_run():
        mlflow.sklearn.autolog()
        mlflow.log_metric("num_samples", df.shape[0])
        mlflow.log_param("num_features", df.shape[1])
        model = model_train(df)

        os.makedirs('outputs', exist_ok=True)
        model_file = os.path.join('outputs', 'dnn_model.pkl')
        joblib.dump(value=model, filename=model_file)

        # TODO: register the model (not implemented yet).


if __name__ == '__main__':
    main()