# Load the library

## Install the package

In [4]:
# Pinned versions, keyed by PyPI distribution name, that the pip cell installs.
_pinned_versions = {
    'pandas': '1.5.3',
    'numpy': '1.23.5',
    'tqdm': '4.64.1',
    'requests': '2.28.2',
    'xgboost': '1.7.5',
    'scipy': '1.10.1',
    'lightgbm': '3.3.5',
    'scikit-learn': '1.2.2',
    'matplotlib': '3.7.1',
    'ephem': '4.1.4',
    'optuna': '3.1.1',
    'gdown': '4.7.1',
    'OSMPythonTools': '0.3.5',
}
packages = [f'{name}=={version}' for name, version in _pinned_versions.items()]

# One "name==version" specifier per line, without a trailing newline.
requirements_text = "\n".join(packages)
with open('requirements.txt', 'w') as f:
    f.write(requirements_text)

In [None]:
!pip install -r requirements.txt 

## Import the library

In [None]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import json
import xml.etree.ElementTree as ET
import requests

import xgboost
import scipy.stats as stats

import lightgbm as lgb

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score


from matplotlib import pyplot as plt

from datetime import datetime
import ephem
import optuna

from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

import gdown

from OSMPythonTools.api import Api


# Constant Value

In [None]:
# Random seed passed to train_test_split.
SEED = 42

# Local file names: the raw competition files and their API-enriched copies.
TRAIN_NAME, TRAIN_NAME_NEW = 'train.csv', 'new_train.csv'
TEST_NAME, TEST_NAME_NEW = 'test.csv', 'new_test.csv'

# Google Drive share links; only the file id differs between them.
_DRIVE_LINK = 'https://drive.google.com/file/d/{}/view?usp=drive_link'
TRAIN_URL = _DRIVE_LINK.format('1iT838oJWhZp_ss0ZoazLzy-SbTIHb1Is')
TEST_URL = _DRIVE_LINK.format('1MEeMiEyGRcuvzaONA42tAqYCCVEn1dMU')
NEW_TRAIN_URL = _DRIVE_LINK.format('1v4R5D7qE_v8Tfz0ClWj2WL0IfgR3cnX9')
NEW_TEST_URL = _DRIVE_LINK.format('1y0l0TipyrXq5DU635UUSjGhgyIO-3cwc')

# Preparing the data

In [None]:
# Download the raw training file from its Drive share link and save it as train.csv.
gdown.download(url=TRAIN_URL, output=TRAIN_NAME, quiet=False, fuzzy=True) 
# Download the raw test file from its Drive share link and save it as test.csv.
gdown.download(url=TEST_URL, output=TEST_NAME, quiet=False, fuzzy=True) 


`Uncomment and run the code below if you want to skip the API extraction.`

In [None]:

# # new_train.csv
# gdown.download(url=NEW_TRAIN_URL, output=TRAIN_NAME_NEW, quiet=False, fuzzy=True) 
# # new_test.csv
# gdown.download(url=NEW_TEST_URL, output=TEST_NAME_NEW, quiet=False, fuzzy=True) 

# Load the data

In [None]:
# Raw training data (TRAIN_NAME = 'train.csv'), as saved by the download cell.
df_train = pd.read_csv(TRAIN_NAME)
df_train.head()

In [None]:
# Read the RAW test file here: this frame feeds the API-extraction cells, and
# new_test.csv (TEST_NAME_NEW) is only written later by add_data(df_test, TEST_NAME_NEW).
df_test = pd.read_csv(TEST_NAME)
df_test.head()

# Extract API Data from id

In [None]:
# Distinct way ids per split, then their sorted union, so each way is queried once.
unique_jalan_train, unique_jalan_test = (
    frame['id_jalan'].unique() for frame in (df_train, df_test)
)
unique_jalan = np.unique(np.concatenate([unique_jalan_train, unique_jalan_test]))

In [None]:
# Distinct start-node (mulai) and end-node (akhir) ids per split.
unique_mulai_train, unique_mulai_test = (
    frame['id_titik_mulai'].unique() for frame in (df_train, df_test)
)
unique_akhir_train, unique_akhir_test = (
    frame['id_titik_akhir'].unique() for frame in (df_train, df_test)
)

# Sorted union across both splits, one array per endpoint kind.
unique_mulai = np.unique(np.concatenate([unique_mulai_train, unique_mulai_test]))
unique_akhir = np.unique(np.concatenate([unique_akhir_train, unique_akhir_test]))

In [None]:
# Distinct (start node, end node) rows per split; axis=0 de-duplicates whole rows.
unique_mulai_akhir_train = np.unique(
    df_train[['id_titik_mulai', 'id_titik_akhir']].to_numpy(), axis=0
)
unique_mulai_akhir_test = np.unique(
    df_test[['id_titik_mulai', 'id_titik_akhir']].to_numpy(), axis=0
)

# Row-wise union of the two splits.
unique_mulai_akhir = np.unique(
    np.concatenate([unique_mulai_akhir_train, unique_mulai_akhir_test]), axis=0
)

## OSM API

In [None]:
# Smoke test of the OSM client: fetch one way and one node and print what comes back.
api = Api()
way = api.query('way/66924592')
print(way.tags())
node = api.query('node/21390008') # idx=0, id_titik mulai
print(node.lat())
print(node.lon())

In [None]:
# Repeats the node lookup from the previous cell; relies on `api` created there.
node = api.query('node/21390008') # idx=0, id_titik mulai
print(node.lat())
print(node.lon())

In [None]:
# OSM way tags to collect for every road; a tag's value is whatever way.tag() returns.
new_col_jalan = ['cycleway', 'highway','lanes', 'lanes:forward', 'lit', 'maxspeed', 'name', 'operator', 'ref', 'sidewalk', 'surface', 'turn:lanes:forward']
api = Api()

# result_jalan['api_result'][k] holds the tags of result_jalan['id_jalan'][k].
result_jalan = {'id_jalan': unique_jalan, 'api_result': []}
for id_jalan in unique_jalan:
    way = api.query(f'way/{id_jalan}')
    api_result = {col: way.tag(col) for col in new_col_jalan}
    result_jalan['api_result'].append(api_result)


In [None]:
def manual_extract(node):
    """Look up a node's coordinates from its OpenStreetMap edit history.

    Downloads ``/api/0.6/node/<node>/history`` and walks the ``<node>``
    elements for that id from last to first, returning the coordinates of the
    first one that carries both ``lat`` and ``lon``.
    # NOTE(review): this assumes the history lists versions oldest-first, so
    # the last element is the newest -- confirm against the OSM API docs.

    Args:
        node: OSM node id (anything that formats into the URL).

    Returns:
        ``(lat, lon)`` as the raw XML attribute strings, or ``(None, None)``
        when no version of the node has coordinates.

    Raises:
        requests.HTTPError: if the history endpoint answers with an error status.
        xml.etree.ElementTree.ParseError: if the body is not valid XML.
    """
    url = f'https://www.openstreetmap.org/api/0.6/node/{node}/history'
    # A timeout keeps one unresponsive request from hanging the whole extraction loop.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    root = ET.fromstring(r.text)

    # Versions without coordinates are skipped in favour of an earlier one.
    for version in reversed(root.findall(f'.//node[@id="{node}"]')):
        lat, lon = version.get('lat'), version.get('lon')
        if lat and lon:
            return lat, lon
    return None, None

    

In [None]:
# result_mulai['api_result'][k] holds the coordinates of result_mulai['id_titik_mulai'][k].
result_mulai = {'id_titik_mulai': unique_mulai,
                'api_result': []}
api = Api()
new_col_jalan = ['lat', 'lon']
for id_titik in unique_mulai:
    try:
        node = api.query(f'node/{id_titik}')
        lat, lon = node.lat(), node.lon()
    except Exception:
        # Fall back to the node's edit history when the regular lookup fails.
        # The fallback result must be what gets stored: previously the stale
        # `node` from the preceding iteration was recorded instead.
        lat, lon = manual_extract(id_titik)
        # manual_extract returns XML attribute strings (or None); store numbers.
        lat = float(lat) if lat is not None else None
        lon = float(lon) if lon is not None else None

    api_result = {'lat': lat,
                  'lon': lon}
    result_mulai['api_result'].append(api_result)


In [None]:
# result_akhir['api_result'][k] holds the coordinates of result_akhir['id_titik_akhir'][k].
result_akhir = {'id_titik_akhir': unique_akhir,
                'api_result': []}
api = Api()
new_col_jalan = ['lat', 'lon']
for id_titik in unique_akhir:
    try:
        node = api.query(f'node/{id_titik}')
        lat, lon = node.lat(), node.lon()
    except Exception:
        # Fall back to the node's edit history when the regular lookup fails.
        # The fallback result must be what gets stored: previously the stale
        # `node` from the preceding iteration was recorded instead.
        lat, lon = manual_extract(id_titik)
        # manual_extract returns XML attribute strings (or None); store numbers.
        lat = float(lat) if lat is not None else None
        lon = float(lon) if lon is not None else None

    api_result = {'lat': lat,
                  'lon': lon}
    result_akhir['api_result'].append(api_result)


## MapQuest API

In [None]:
def distance(start_lat, start_lon, end_lat, end_lon):
    """Ask the MapQuest Directions API for the route distance between two points.

    Args:
        start_lat, start_lon: coordinates of the origin.
        end_lat, end_lon: coordinates of the destination.

    Returns:
        The ``route.distance`` value from the JSON response, unchanged.
        # NOTE(review): the unit is whatever MapQuest returns by default -- confirm.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        KeyError: if the response carries no ``route.distance`` field.
    """
    import os  # local import so this cell does not depend on the import cell

    # Prefer a key from the environment. The literal is kept only as a fallback
    # so existing runs keep working.
    # FIXME: a committed key should be rotated and removed from the notebook.
    key = os.environ.get('MAPQUEST_API_KEY', 'FVgErOkhYoJmjdsUrldVi9nkrCrGKuWm')
    r = requests.get(
        'https://www.mapquestapi.com/directions/v2/route',
        # Let requests build and escape the query string.
        params={
            'key': key,
            'from': f'{start_lat},{start_lon}',
            'to': f'{end_lat},{end_lon}',
        },
        timeout=30,
    )
    r.raise_for_status()
    data = r.json()
    try:
        return data['route']['distance']
    except KeyError as exc:
        # Surface whatever diagnostic block the response carries instead of a bare key name.
        raise KeyError(f"MapQuest response has no route distance: {data.get('info')}") from exc

In [None]:
# result_distance['api_result'][k] holds the route distance for row k of unique_mulai_akhir.
result_distance = {'id_titik_mulai_akhir': unique_mulai_akhir, 'api_result': []}
api = Api()
new_col_jalan = ['lat', 'lon']
for id_titik in unique_mulai_akhir:
    # Each id occurs once in its lookup array, so summing the matching
    # positions yields that single position.
    idx_mulai = np.where(result_mulai['id_titik_mulai'] == id_titik[0])[0].sum()
    idx_akhir = np.where(result_akhir['id_titik_akhir'] == id_titik[1])[0].sum()

    start_point = result_mulai['api_result'][idx_mulai]
    end_point = result_akhir['api_result'][idx_akhir]
    dist = distance(start_point['lat'], start_point['lon'],
                    end_point['lat'], end_point['lon'])

    api_result = {'distance': dist}
    result_distance['api_result'].append(api_result)


In [None]:
def find_idx_dist(id_titik, unique_val):
    """Return the position of the first entry of ``unique_val`` equal to ``id_titik``.

    Entries are compared with ``np.array_equal``. ``None`` is returned when
    nothing matches.
    """
    matches = (
        position
        for position, candidate in enumerate(unique_val)
        if np.array_equal(candidate, id_titik)
    )
    return next(matches, None)
            

In [None]:
def add_data(df, filename):
    """Attach the collected API results to ``df`` in place and save it as CSV.

    For every row, the way tags, the start/end node coordinates and the
    start-to-end distance are looked up in the module-level ``result_jalan``,
    ``result_mulai``, ``result_akhir`` and ``result_distance`` tables. The new
    columns are added to ``df`` itself, and the frame is written to
    ``filename`` without its index.
    """
    jalan_cols = ['cycleway', 'highway','lanes', 'lanes:forward', 'lit', 'maxspeed', 'name', 'operator', 'ref', 'sidewalk', 'surface', 'turn:lanes:forward']
    point_cols = ['mulai_lat', 'mulai_lon', 'akhir_lat', 'akhir_lon']
    new_col = jalan_cols + point_cols + ['distance']
    additional_data = {col: [] for col in new_col}

    for _, row in tqdm(df.iterrows()):
        # Way tags: the matching position is recovered by summing the match
        # indices, which equals the single match position.
        idx_jalan = np.where(result_jalan['id_jalan'] == row['id_jalan'])[0].sum()
        jalan_tags = result_jalan['api_result'][idx_jalan]
        for col in jalan_cols:
            additional_data[col].append(jalan_tags[col])

        # Coordinates of the segment's start and end nodes.
        id_titik_mulai = row['id_titik_mulai']
        id_titik_akhir = row['id_titik_akhir']
        idx_mulai = np.where(result_mulai['id_titik_mulai'] == id_titik_mulai)[0].sum()
        idx_akhir = np.where(result_akhir['id_titik_akhir'] == id_titik_akhir)[0].sum()
        start_point = result_mulai['api_result'][idx_mulai]
        end_point = result_akhir['api_result'][idx_akhir]
        additional_data['mulai_lat'].append(start_point['lat'])
        additional_data['mulai_lon'].append(start_point['lon'])
        additional_data['akhir_lat'].append(end_point['lat'])
        additional_data['akhir_lon'].append(end_point['lon'])

        # Distance is keyed by the (start, end) pair.
        pair = np.array([id_titik_mulai, id_titik_akhir])
        idx_distance = find_idx_dist(pair, unique_mulai_akhir)
        additional_data['distance'].append(result_distance['api_result'][idx_distance]['distance'])

    for col in new_col:
        df[col] = additional_data[col]
    df.to_csv(filename, index=False)

In [None]:
# Enrich both frames in place and save them as new_train.csv / new_test.csv.
add_data(df_train, TRAIN_NAME_NEW)
add_data(df_test, TEST_NAME_NEW)

# Load new data

In [None]:
# Work from the enriched training file (written by add_data, or fetched by the optional download cell).
df = pd.read_csv(TRAIN_NAME_NEW)
df.head()

# Preparing the data

In [None]:
def _maxspeed_to_kph(value):
    """Convert one raw ``maxspeed`` value to km/h, or NaN when it cannot be parsed.

    Strings ending in ``mph`` (for example ``'30 mph'``) are converted with
    1 mph = 1.609344 km/h and rounded to 4 decimals, which reproduces the
    48.2803 / 32.1869 constants previously hard-coded for 30 and 20 mph.
    Any other numeric value is returned as a float unchanged.
    # NOTE(review): unit-less values are assumed to already be km/h -- confirm against the data.
    """
    if isinstance(value, str):
        text = value.strip().lower()
        is_mph = text.endswith('mph')
        number = text[:-3].strip() if is_mph else text
        try:
            speed = float(number)
        except ValueError:
            return float('nan')
        return round(speed * 1.609344, 4) if is_mph else speed
    try:
        return float(value)  # numeric input; NaN (missing tag) stays NaN
    except (TypeError, ValueError):
        return float('nan')


def prepare_data(df):
    """Build the model-ready frame from an API-enriched train/test frame.

    Steps:
      * drop the ``lanes``, ``lanes:forward`` and ``lit`` columns;
      * convert ``maxspeed`` to km/h, one value per row (unparseable -> NaN);
      * parse ``waktu_setempat`` (format ``%Y-%m-%d %H:%M:%S%z``) and derive
        ``is_weekend``, ``hour``, ``date``, ``is_night`` and ``is_rush_hour``;
      * drop ``waktu_setempat``.

    Args:
        df: frame containing at least the columns named above.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    df = df.drop(['lanes', 'lanes:forward', 'lit'], axis=1)

    # Exactly one output per input row. The previous loop appended only for
    # '30 mph' and '20 mph', so any other value broke the column assignment.
    df['maxspeed'] = [
        _maxspeed_to_kph(raw)
        for raw in tqdm(df['maxspeed'].to_numpy(), desc='Convert to kph')
    ]

    uk_observer = ephem.Observer()
    uk_observer.lat = '51.5074'  # Latitude of London
    uk_observer.lon = '-0.1278'  # Longitude of London
    sun = ephem.Sun()

    def _clock(text):
        """Parse an ``HH:MM:SS`` string into a ``datetime.time``."""
        return datetime.strptime(text, '%H:%M:%S').time()

    # Inclusive time-of-day windows flagged as is_rush_hour, built once.
    # NOTE(review): these are the original windows (10-16h, 20-24h, 0-6h); they
    # look more like off-peak than rush hours -- confirm the intent.
    rush_windows = [
        (_clock('10:00:00'), _clock('16:00:00')),
        (_clock('20:00:00'), _clock('23:59:59')),
        (_clock('00:00:00'), _clock('06:00:00')),
    ]

    is_weekend, is_night, is_rush_hour, date, hour = [], [], [], [], []
    for data in tqdm(df['waktu_setempat'].to_numpy(), desc='time categorization'):
        datetime_obj = datetime.strptime(data, '%Y-%m-%d %H:%M:%S%z')

        # weekday(): Monday is 0, so 5 and 6 are Saturday and Sunday.
        is_weekend.append(1 if datetime_obj.weekday() >= 5 else 0)
        date.append(datetime_obj.strftime('%Y-%m-%d'))
        hour.append(datetime_obj.hour)

        # NOTE(review): confirm that the pinned ephem version converts this
        # timezone-aware datetime to UTC when it is assigned.
        uk_observer.date = datetime_obj
        # The sun is up when its latest rising is more recent than its latest
        # setting. The old test `previous_rising < date < next_setting` was
        # true for every timestamp, so is_night was always 0.
        sun_is_up = uk_observer.previous_rising(sun) > uk_observer.previous_setting(sun)
        is_night.append(0 if sun_is_up else 1)

        # Wall-clock time of the record, compared against the fixed windows.
        input_time = datetime_obj.time()
        in_window = any(start <= input_time <= end for start, end in rush_windows)
        is_rush_hour.append(1 if in_window else 0)

    df['is_weekend'] = is_weekend
    df['hour'] = hour
    df['date'] = date
    df['is_night'] = is_night
    df['is_rush_hour'] = is_rush_hour
    df = df.drop(['waktu_setempat'], axis=1)

    return df
    

In [None]:
# Rebinds df to the prepared frame. Re-running this cell fails because
# prepare_data drops columns it expects; reload new_train.csv first.
df = prepare_data(df)
df.head()

In [None]:
# Feature columns fed to every model; the same list selects the test features later.
col_used = ['maxspeed','mulai_lat','mulai_lon','akhir_lat','akhir_lon','distance','is_weekend','hour','is_night','is_rush_hour']
X = df[col_used]
# Regression target as a NumPy array (rerata_kecepatan: presumably the average speed -- confirm).
y = df['rerata_kecepatan'].to_numpy()

In [None]:
# Shuffled 80/20 hold-out split, seeded with SEED; x_test/y_test act as the validation set below.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, shuffle=True)

# Modelling

## Metrics

In [None]:
def smape(actual, predicted) -> float:
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(|predicted - actual| / ((|predicted| + |actual|) / 2)) * 100``
    rounded to 6 decimals.

    Args:
        actual: true values (array-like or ndarray).
        predicted: predicted values, broadcastable against ``actual``.

    Returns:
        The SMAPE as a Python float. A pair where both values are 0 is a
        perfect prediction and contributes 0 instead of NaN.
    """
    # asarray accepts lists, Series and ndarrays alike. The previous conversion
    # had a stray trailing comma and raised on any non-ndarray input.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = (np.abs(predicted) + np.abs(actual)) / 2
    # Divide only where the denominator is non-zero; other entries stay 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)

    return round(float(np.mean(ratio) * 100), 6)

In [None]:
def plot_eval(pred, true):
    """Show a 2x3 grid of diagnostic plots for a set of predictions.

    Panels: real-vs-predicted scatter, residuals vs predictions, residual
    histogram, probability (QQ) plot of the residuals, real-vs-predicted
    scatter with the identity line, and a bar chart of R-squared and MSE.
    Residuals are ``true - pred``, and ``.flatten()`` is called on them, so
    the inputs are expected to be NumPy arrays.
    """
    residuals = true - pred

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    (ax_scatter, ax_resid, ax_hist), (ax_qq, ax_identity, ax_scores) = axes

    # Real vs. predicted values.
    ax_scatter.scatter(true, pred)
    ax_scatter.set(xlabel='Real Values', ylabel='Predicted Values', title='Scatter Plot')

    # Residuals against predictions, with a zero reference line.
    ax_resid.scatter(pred, residuals)
    ax_resid.axhline(y=0, color='r', linestyle='--')
    ax_resid.set(xlabel='Predicted Values', ylabel='Residuals', title='Residual Plot')

    # Distribution of the residuals.
    ax_hist.hist(residuals, bins=20)
    ax_hist.set(xlabel='Residuals', ylabel='Frequency', title='Histogram of Residuals')

    # Probability plot; the title is set after probplot has drawn onto the axis.
    stats.probplot(residuals.flatten(), plot=ax_qq)
    ax_qq.set_title('QQ Plot')

    # Same scatter with the y = x line for reference.
    ax_identity.scatter(true, pred)
    ax_identity.plot(true, true, color='r', linestyle='--')
    ax_identity.set(xlabel='Real Values', ylabel='Predicted Values', title='Regression Line Plot')

    # Summary scores drawn as two bars on a shared axis.
    scores = [r2_score(true, pred), mean_squared_error(true, pred)]
    ax_scores.bar(['R-squared', 'MSE'], scores)
    ax_scores.set_title('R-squared and MSE')

    plt.tight_layout()
    plt.show()

## Baseline Model

In [None]:
# Untuned XGBoost baseline, scored on the hold-out split.
# Uses the shared SEED constant instead of repeating the literal 42.
model = xgboost.XGBRegressor(n_jobs=-1, random_state=SEED)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(f'test_data: {smape(y_test, pred)}')
plot_eval(pred, y_test)

## Optuna: Hyperparameter Tuning 

`Tuning usually takes more than an hour per model, so it is better to reuse the best parameters already recorded below.`

### XGBoost Regressor Tuning

In [None]:
def objective(trial):
    """Optuna objective for XGBoost: hold-out MSE of one sampled configuration.

    Samples the hyperparameters from ``trial``, fits an ``XGBRegressor`` on
    ``x_train``/``y_train`` and returns the mean squared error of its
    predictions on ``x_test``/``y_test``.
    """
    n_estimators = trial.suggest_int("n_estimators", 1000, 5000)
    max_depth = trial.suggest_int("max_depth", 1, 10)
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-1, log=True)
    gamma = trial.suggest_float("gamma", 0.1, 1.0, step=0.1)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 7, step=2)
    subsample = trial.suggest_float("subsample", 0.5, 1.0, step=0.1)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0, step=0.1)
    reg_alpha = trial.suggest_float("reg_alpha", 0., 1.0, step=0.1)
    reg_lambda = trial.suggest_float("reg_lambda", 0., 1.0, step=0.1)
    seed = trial.suggest_int("random_state", 20, 50, step=2)

    # `metric` is not an XGBRegressor parameter, so it is no longer passed.
    # NOTE(review): eval_metric is kept as-is; with no eval_set given to fit()
    # it presumably has no effect -- confirm.
    model = xgboost.XGBRegressor(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 learning_rate=learning_rate,
                                 gamma=gamma,
                                 min_child_weight=min_child_weight,
                                 colsample_bytree=colsample_bytree,
                                 subsample=subsample,
                                 reg_alpha=reg_alpha,
                                 reg_lambda=reg_lambda,
                                 n_jobs=-1,
                                 eval_metric=mean_squared_error,
                                 random_state=seed
                                 )

    model.fit(x_train, y_train)

    y_hat = model.predict(x_test)

    # Plain MSE is the default, so the redundant `squared=True` is dropped.
    return mean_squared_error(y_test, y_hat)

# Minimise the objective's hold-out MSE over 50 trials (direction spelled out explicitly).
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

`Below are the best parameters for XGBoost.`

param_1 = {'n_estimators': 2552, 'max_depth': 9, 'learning_rate': 0.010419707354527082, 'gamma': 0.30000000000000004, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.9, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.5, 'random_state': 28}

### LGBM Regressor Tuning

In [None]:
def objective(trial):
    """Optuna objective for LightGBM: hold-out MSE of one sampled configuration.

    Samples the hyperparameters from ``trial``, fits an ``LGBMRegressor`` on
    ``x_train``/``y_train`` and returns the mean squared error of its
    predictions on ``x_test``/``y_test``.
    """
    # Dict entries are evaluated top to bottom, so the suggest_* calls run in
    # the same order as before.
    sampled = {
        'subsample_for_bin': trial.suggest_int("subsample_for_bin", 100000, 300000),
        'n_estimators': trial.suggest_int("n_estimators", 1000, 5000),
        'max_depth': trial.suggest_int("max_depth", 1, 10),
        'learning_rate': trial.suggest_float("learning_rate", 1e-6, 1e-1, log=True),
        'num_leaves': trial.suggest_int("num_leaves", 10, 50),
        'min_child_weight': trial.suggest_int("min_child_weight", 1, 7, step=2),
        'subsample': trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0, step=0.1),
        'reg_alpha': trial.suggest_float("reg_alpha", 0., 1.0, step=0.1),
        'reg_lambda': trial.suggest_float("reg_lambda", 0., 1.0, step=0.1),
        'random_state': trial.suggest_int("random_state", 20, 50, step=2),
        'min_child_samples': trial.suggest_int("min_child_samples", 10, 50),
    }

    model = lgb.LGBMRegressor(n_jobs=-1, **sampled)
    model.fit(x_train, y_train)

    y_hat = model.predict(x_test)
    return mean_squared_error(y_test, y_hat, squared=True)

# Minimise the objective defined in this cell (LightGBM hold-out MSE) over 200 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)

`Below are the best parameters for LGBM.`

param_1 = {'subsample_for_bin': 131673, 'n_estimators': 4510, 'max_depth': 6, 'learning_rate': 0.054834260577325884, 'num_leaves': 47, 'min_child_weight': 1, 'subsample': 1.0, 'colsample_bytree': 1.0, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.30000000000000004, 'random_state': 34, 'min_child_samples': 16}

param_2 = {'subsample_for_bin': 155546, 'n_estimators': 3768, 'max_depth': 7, 'learning_rate': 0.04067420520019853, 'num_leaves': 50, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 1.0, 'reg_alpha': 0.5, 'reg_lambda': 0.30000000000000004, 'random_state': 34, 'min_child_samples': 16}
 

## Model Creation

In [None]:
def xgb_1(no):
    """Return an unfitted XGBRegressor built from tuned parameter set ``no``.

    Only one set exists, so ``no`` must be 0 (or -1); any other index raises
    ``IndexError``.
    """
    tuned = [
        dict(
            n_estimators=2552,
            max_depth=9,
            learning_rate=0.010419707354527082,
            gamma=0.30000000000000004,
            min_child_weight=5,
            subsample=0.8,
            colsample_bytree=0.9,
            reg_alpha=0.30000000000000004,
            reg_lambda=0.5,
            random_state=28,
        ),
    ]
    return xgboost.XGBRegressor(**tuned[no])

def lgbm_1(no):
    """Return an unfitted LGBMRegressor built from tuned parameter set ``no``.

    Two sets are available (indices 0 and 1); other indices follow normal
    list indexing and raise ``IndexError`` when out of range.
    """
    tuned = [
        dict(
            subsample_for_bin=131673,
            n_estimators=4510,
            max_depth=6,
            learning_rate=0.054834260577325884,
            num_leaves=47,
            min_child_weight=1,
            subsample=1.0,
            colsample_bytree=1.0,
            reg_alpha=0.7000000000000001,
            reg_lambda=0.30000000000000004,
            random_state=34,
            min_child_samples=16,
        ),
        dict(
            subsample_for_bin=155546,
            n_estimators=3768,
            max_depth=7,
            learning_rate=0.04067420520019853,
            num_leaves=50,
            min_child_weight=1,
            subsample=0.9,
            colsample_bytree=1.0,
            reg_alpha=0.5,
            reg_lambda=0.30000000000000004,
            random_state=34,
            min_child_samples=16,
        ),
    ]
    return lgb.LGBMRegressor(**tuned[no])

### Baseline finetuned model

In [None]:
# (label, regressor) pairs: one tuned XGBoost model and two tuned LightGBM models.
estimators1 = [(f'xgb_1{i}', xgb_1(i)) for i in range(1)]
estimators2 = [(f'lgbm_1{j}', lgbm_1(j)) for j in range(2)]
estimators = [*estimators1, *estimators2]


In [None]:
# Fit each tuned model on the training split and report SMAPE on both splits.
for est in estimators:
    name, reg = est
    print(f'Evaluating {name} model')
    reg.fit(x_train, y_train)
    # Hold-out (validation) score first ...
    pred = reg.predict(x_test)
    print(f'val_data: {smape(y_test, pred)}')
    # ... then the training score, to show the gap between the two.
    pred = reg.predict(x_train)
    print(f'train_data: {smape(y_train, pred)}')
    print('\n')

### Model Stacking

In [None]:
# Stack the tuned regressors under a LinearRegression final estimator, using 3-fold cv.
# Note: this rebinds `model`, replacing the baseline XGBoost model from earlier.
model = StackingRegressor(estimators=estimators, 
                          cv=3, 
                          final_estimator=LinearRegression(n_jobs=-1), 
                          n_jobs = -1, 
                          verbose=1)

model.fit(x_train, y_train)

### Stacked Model Evaluation

#### Validation data

In [None]:
# Stacked model on the hold-out split: SMAPE plus the diagnostic plots.
pred = model.predict(x_test)
print(f'val_data: {smape(y_test, pred)}')
plot_eval(pred, y_test)

#### Train Data

In [None]:
# Stacked model on the data it was fitted on, for comparison with the hold-out score.
pred = model.predict(x_train)
print(f'train_data: {smape(y_train, pred)}')
plot_eval(pred, y_train)

# Applying on test dataset

# Reload the enriched test file (new_test.csv); this rebinds df_test.
df_test = pd.read_csv(TEST_NAME_NEW)
df_test.head()

In [None]:
# Apply the same feature engineering that was applied to the training frame.
df_test = prepare_data(df_test)
df_test.head()

In [None]:
# Select the same feature columns, in the same order, that the model was trained on.
data_test = df_test[col_used]


# Predict with the stacked model.
pred_test = model.predict(data_test)

In [None]:
# Attach the predictions and keep only the id and the predicted target.
df_test['rerata_kecepatan'] = pred_test
subm = df_test[['id', 'rerata_kecepatan']]
# index=False keeps the pandas row index out of the file (as add_data does for
# its CSVs), so the submission contains exactly the two columns above.
subm.to_csv('submission.csv', index=False)