# Import Packages

In [1]:
# Silence all warnings for the session. The filter is installed before the
# remaining imports, so warnings raised while importing them are hidden as well.
import warnings 
warnings.simplefilter(action='ignore')
import pandas as pd
import os
import sys
# Add the directory above the current working directory to sys.path
# (presumably where the `utility` and `model` packages imported below live —
# confirm against the repo layout). The membership test keeps re-runs of this
# cell from appending the same entry repeatedly.
parent_path = os.path.join(os.getcwd(), '..')
if parent_path not in sys.path:
    sys.path.append(parent_path)
import pickle
# pypinyin romanises the address keys so they can be used as plot titles/labels.
from pypinyin import pinyin, Style

import numpy as np
# Project-local helpers; these only resolve after the sys.path tweak above.
from utility.utils import select_model, autoregressive_predicting
from utility.data import data_imputer, data_slicing, \
    transformer_slice, get_dict_from_pd, train_test_split
from utility.visuals import plot_time_series, yield_visuals
from model.gpt import GPT_fit, GPT_predict, get_desired_sequence
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, \
AdaBoostRegressor
from IPython.display import clear_output
from tqdm import tqdm


# Retrieve Data

In [2]:

# Load the pickled source table.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# this path must only ever point at a file produced by this project.
with open('../Data/source_data.pkl', 'rb') as f:
    df = pickle.load(f)
# data_imputer returns the processed data plus a date range; the date values
# are reused below as the x-ticks of every time-series plot.
# TODO confirm what imputation data_imputer actually performs.
df_list, date_range = data_imputer(df)
date_num = date_range.values
# Mapping whose keys are the addresses iterated over in the training loop;
# presumably one series per 'address1' value taken from the '受傷' column —
# verify against utility.data.get_dict_from_pd.
sequences = get_dict_from_pd(df_list, 'address1', '受傷')

# Data splitting / slicing, model fitting, and evaluation

In [None]:
import torch

# Window length handed to both slicing helpers, and the number of steps
# requested from the GPT model. Named here so they can be tuned in one place.
WINDOW_SIZE = 150
GPT_HORIZON = 300

# Baseline regressors scored through yield_visuals, keyed by the name under
# which their DTW loss is stored in prediction_map (insertion order matters:
# it is the order of the keys in every prediction_map entry).
BASELINE_MODELS = {
    'GradientBoost': GradientBoostingRegressor,
    'RandomForest': RandomForestRegressor,
    'Adaboost': AdaBoostRegressor,
}

# romanised address -> {model name -> DTW loss on the held-out segment}
prediction_map = {}
device = torch.device('cpu')
for curr_address in tqdm(sequences.keys()):
    series = sequences[curr_address]
    train, test = train_test_split(series, series.shape[0])

    # Supervised (x, y) windows for the sklearn baselines.
    train_x, train_y, _slicing_segment = data_slicing(train, WINDOW_SIZE)
    # NOTE(review): the segment that seeds the baselines' forecasts comes from
    # transformer_slice, not from data_slicing (this is what the notebook has
    # always done, because the second call rebinds the name). Confirm that the
    # two helpers return the same final segment, or switch to _slicing_segment.
    _gpt_x, _gpt_y, final_segment = transformer_slice(train, WINDOW_SIZE)

    # Romanise the address so it can be used in plot titles and as the
    # checkpoint directory name.
    pinyin_result = pinyin(curr_address, style=Style.TONE3)
    roman_representation = "_".join([item[0] for item in pinyin_result])

    # Naive reference forecast: the training mean repeated for every test step.
    plot_time_series(train, test, xticks=date_num, title=roman_representation \
                 + "/avg\n", pred=np.mean(train) * np.ones_like(test), saving=True)

    # Fit, forecast and plot each baseline; keep only its DTW score.
    losses = {}
    for model_name, model_cls in BASELINE_MODELS.items():
        baseline_loss = yield_visuals(model_cls, final_segment, {},
                    train_x, train_y, train, test, date_num, roman_representation, saving=True)
        losses[model_name] = baseline_loss['dtw']

    # GPT forecaster. GPT_fit returns its own final segment, kept under a
    # separate name so it cannot be confused with the baselines' segment.
    # (trained=True presumably reuses the checkpoint in checkpoint_dir — verify
    # against model.gpt.GPT_fit.)
    _trainer, model, gpt_segment = GPT_fit(train, checkpoint_dir='../output/result/' + roman_representation, trained=True, device=device)
    pred = GPT_predict(model, gpt_segment, GPT_HORIZON, device=device)
    pred_list = get_desired_sequence(pred, gpt_segment, test)
    gpt_loss = plot_time_series(train, test, xticks=date_num, pred=pred_list, title=roman_representation + "_GPT", saving=True)
    losses['GPT'] = gpt_loss['dtw']

    prediction_map[roman_representation] = losses


    # clear_output(wait=True)



In [8]:
# Cache the per-city DTW losses so the plotting cell below can reload them
# without re-running the slow training loop. The write is skipped when a cache
# already exists, so re-running this cell never clobbers earlier results;
# delete the file to force a refresh.
loss_path = '../output/loss.pkl'
if not os.path.exists(loss_path):
    os.makedirs(os.path.dirname(loss_path), exist_ok=True)
    with open(loss_path, 'wb') as f:
        pickle.dump(prediction_map, f)


In [None]:
import matplotlib.pyplot as plt

# Reload the cached losses (romanised city -> {model name -> DTW loss}).
# NOTE(review): pickle.load can execute arbitrary code; only load caches that
# this project wrote itself.
with open('../output/loss.pkl', 'rb') as f:
    prediction_map = pickle.load(f)

# Labels and bar count are taken from the loaded map itself rather than from
# `sequences`: the keys are already the romanised names, and deriving both the
# heights and the labels from one mapping guarantees they stay aligned even if
# the cache was produced from a different set (or order) of cities.
xticks = np.array(list(prediction_map.keys()))
gradient_list = [losses['GradientBoost'] for losses in prediction_map.values()]
random_list = [losses['RandomForest'] for losses in prediction_map.values()]
ada_list = [losses['Adaboost'] for losses in prediction_map.values()]
GPT_list = [losses['GPT'] for losses in prediction_map.values()]

num_sequences = len(prediction_map)
bar_width = .2
indices = np.arange(num_sequences)

# Four bars per city, centred on the city's integer index.
positions_gradient = indices - 1.5 * bar_width
positions_random = indices - 0.5 * bar_width
positions_ada = indices + 0.5 * bar_width
positions_GPT = indices + 1.5 * bar_width

fig, ax = plt.subplots(figsize=(12, 6))
# Drawn left-to-right so the legend order matches the bar order in each group.
ax.bar(positions_gradient, gradient_list, width=bar_width, label='Gradient_boost')
ax.bar(positions_random, random_list, width=bar_width, label='Random forest')
ax.bar(positions_ada, ada_list, width=bar_width, label='Adaboost')
ax.bar(positions_GPT, GPT_list, width=bar_width, label='GPT')
ax.set_xticks(indices)
ax.set_xticklabels(xticks, rotation=60)
ax.set_title("DTW(dynamic time warping) scores by different models")
ax.set_xlabel("city")
ax.set_ylabel('dtw')
ax.legend()
plt.show()

In [None]:
# Mean DTW score of each model across all cities, labelled so the numbers can
# be read without cross-referencing the expression that produced them.
pd.Series({
    'GradientBoost': np.mean(gradient_list),
    'GPT': np.mean(GPT_list),
    'RandomForest': np.mean(random_list),
    'Adaboost': np.mean(ada_list),
}, name='mean_dtw')

In [None]:
# Display the per-city DTW scores of the GPT model (one entry per key of
# prediction_map, in the map's iteration order).
GPT_list

In [None]:
# Display the x-positions of the AdaBoost bars (indices + 0.5 * bar_width);
# inspection only, nothing downstream reads this output.
positions_ada