In [None]:
import azure.ai.ml
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml import command, Input
from azure.ai.ml.entities import (
    AzureBlobDatastore,
    AzureFileDatastore,
    AzureDataLakeGen1Datastore,
    AzureDataLakeGen2Datastore,
)
from azure.ai.ml.entities import Environment
import io
import json
from tqdm import tqdm

import pandas as pd
import logging
import sys
from sys import path
import os
import time

In [None]:
# Read the notebook settings from config.json (expected in the working directory).
with open("config.json", "r") as config_file:
    config = json.load(config_file)

# Identifiers of the Azure ML workspace used by the cells below.
subscription_id, resource_group, workspace = (
    config[key]
    for key in ("azure_ml_subscription_ID", "resource_group", "workspace")
)

In [None]:
# Dataset selection: route direction (from_loc -> to_loc) and data year.
# These values name the datastore and pick the matching source URI from config.
from_loc = "DID"
to_loc = "PAD"
data_year = "2017"
datastore_name = f"hist_info_{from_loc}_{to_loc}_{data_year}"

# Look the URI up by the full (from, to, year) triple so that an unsupported or
# inconsistent selection fails loudly instead of silently falling back to the
# PAD -> DID 2017 dataset while the outputs are labelled with other values.
uri_key = f"uri_{from_loc}_{to_loc}_{data_year}"
if uri_key not in config:
    raise ValueError(
        f"No dataset URI configured for from_loc={from_loc!r}, "
        f"to_loc={to_loc!r}, data_year={data_year!r} "
        f"(expected key {uri_key!r} in config.json)"
    )
uri = config[uri_key]


In [None]:
# Build the Azure ML client for the workspace named in config.json,
# authenticating through DefaultAzureCredential.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

In [None]:
# Load the selected historical dataset from its URI and preview the first rows.
df = pd.read_csv(uri)
df.head(5)

In [None]:
# Make the helper modules in the DataPreprocessing folder importable.
# `__file__` is not defined inside a notebook kernel, so fall back to the
# current working directory there; when this code runs as a plain script the
# script's own directory is used instead.
try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_path = os.getcwd()

# NOTE: the folder name must not start with "/" here -- os.path.join discards
# every preceding component when it meets an absolute path, which would point
# at "/DataPreprocessing" in the filesystem root.
# Assumes DataPreprocessing sits next to this notebook -- TODO confirm.
dir_path = os.path.join(current_path, "DataPreprocessing")
if dir_path not in sys.path:
    sys.path.insert(0, dir_path)

from data_clean_02 import data_cleaning
from data_structure_next_n_stations_03 import data_structure_next_n_stations
from delay_mechanism_04 import delay_mechanism

In [None]:
# Each cell of "5.schedule_detail" holds CSV text. Parse every cell into its
# own all-string DataFrame and discard that frame's first column.
df_schedule_detail_list = []
for raw_detail in tqdm(df["5.schedule_detail"]):
    parsed_detail = pd.read_csv(io.StringIO(raw_detail), sep=",", dtype=str)
    first_column = parsed_detail.columns[0]
    df_schedule_detail_list.append(parsed_detail.drop(columns=first_column))

# Swap the raw text column for the parsed frames: remove it in place, then
# re-add it under the same name (it ends up as the last column).
del df["5.schedule_detail"]
df["5.schedule_detail"] = df_schedule_detail_list

In [None]:
# Run the preprocessing helpers in sequence:
# data_cleaning -> data_structure_next_n_stations -> delay_mechanism.
(
    historical_information,
    station_dwell_time_unique,
    OD_pairs_unique,
) = data_cleaning(df)

# The trailing 1 is presumably the number of stations to look ahead
# (matching the "next_1_station" name) -- TODO confirm against the helper.
data_next_1_station = data_structure_next_n_stations(
    historical_information,
    station_dwell_time_unique,
    OD_pairs_unique,
    1,
)
data_next_1_station = delay_mechanism(data_next_1_station)

In [None]:
# Write the processed table to Data/, named after the selected route and year.
# to_csv does not create missing directories, so make sure the folder exists
# first; otherwise the run would fail only after all preprocessing is done.
output_dir = "Data"
os.makedirs(output_dir, exist_ok=True)
data_next_1_station.to_csv(
    os.path.join(
        output_dir,
        f"data_next_1_station_{from_loc}_{to_loc}_{data_year}.csv",
    ),
    index=False,
)