In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

## Load data

In [2]:
class LoadData:
    """Load PM and rain files from disk into dictionaries of DataFrames.

    Files are picked by filename prefix. Each loaded frame is stored under the
    second underscore-separated token of its filename (``name.split('_')[1]``),
    so a matching filename must contain at least one underscore.
    """

    def __init__(self, PM_path='./PM_Data/', Rain_path='./Rain_Data/',
                 PM_startswith='FillByAll', Rain_startswith='Meteo'):
        """Store the directories and filename prefixes used by the loaders.

        Parameters
        ----------
        PM_path : str
            Directory scanned by ``load_PM_data``.
        Rain_path : str
            Directory scanned by ``load_Rain_data``.
        PM_startswith : str
            Only files whose name starts with this prefix are read as PM files.
        Rain_startswith : str
            Only files whose name starts with this prefix are read as rain files.
        """
        self.PM_path = PM_path
        self.Rain_path = Rain_path
        self.PM_startswith = PM_startswith
        self.Rain_startswith = Rain_startswith

    def _load_by_prefix(self, directory, prefix, reader):
        """Read every file in ``directory`` whose name starts with ``prefix``.

        ``reader`` is called with the full file path and its result is stored
        under ``name.split('_')[1]``.
        """
        # os.listdir returns names in arbitrary order; sorting makes the dict
        # order (and which file wins on a duplicate key) deterministic.
        return {
            name.split('_')[1]: reader(os.path.join(directory, name))
            for name in sorted(os.listdir(directory))
            if name.startswith(prefix)
        }

    def load_PM_data(self):
        """Return ``{key: DataFrame}`` for the CSV files in ``self.PM_path``."""
        return self._load_by_prefix(self.PM_path, self.PM_startswith, pd.read_csv)

    def load_Rain_data(self):
        """Return ``{key: DataFrame}`` for the Excel files in ``self.Rain_path``."""
        return self._load_by_prefix(self.Rain_path, self.Rain_startswith, pd.read_excel)

## Preprocessing

In [3]:
def station_data_TS(PM_data, staion_id, scope_year=None): # scope_year = [2017, 2018, 2019]
    """Build one date-indexed time series for a single station.

    Parameters
    ----------
    PM_data : dict
        Mapping of year key -> DataFrame. Each frame must have a ``'Date'``
        column formatted as ``%Y_%m_%d``; frames that lack the ``staion_id``
        column are skipped.
    staion_id : str
        Name of the station column to extract.
    scope_year : list of int, optional
        When given (and non-empty), only entries whose key satisfies
        ``int(key) in scope_year`` are used. Otherwise every entry is used.

    Returns
    -------
    pandas.DataFrame
        Single column ``'PM_' + staion_id`` indexed by ``Date``, sorted
        chronologically.

    Raises
    ------
    ValueError
        If no frame in scope contains ``staion_id``.
    """
    if scope_year:
        selected = {year: frame for year, frame in PM_data.items() if int(year) in scope_year}
    else:
        selected = PM_data

    pieces = []
    for frame in selected.values():
        if staion_id not in frame.columns:
            continue
        piece = frame[['Date', staion_id]].copy()
        piece['Date'] = pd.to_datetime(piece['Date'], format='%Y_%m_%d')
        pieces.append(piece.set_index('Date').rename(columns={staion_id: 'PM_' + staion_id}))

    if not pieces:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f'No data found for station {staion_id!r} in the selected years')

    # The dict's iteration order is not guaranteed to be chronological, so
    # sort by date; mergesort is stable and keeps duplicate dates in order.
    return pd.concat(pieces, axis=0).sort_index(kind='mergesort')

def window_input(window_length: int, data: pd.DataFrame, column_name: str):
    """Append forward-shifted copies of ``column_name`` to a copy of ``data``.

    For every step ``k`` in ``1 .. window_length - 1`` a column named
    ``f'{column_name}_{k}'`` is added holding ``data[column_name].shift(-k)``,
    i.e. the value found ``k`` rows further down. Every NaN in the resulting
    frame (the shifted-off tail as well as any NaN already present in other
    columns) is then replaced by 0. ``data`` itself is not modified.

    Returns
    -------
    tuple
        ``(windowed_frame, window_columns)`` where ``window_columns`` is
        ``[column_name, f'{column_name}_1', ...]`` in shift order.
    """
    steps = range(1, window_length)
    shifted_names = [f'{column_name}_{step}' for step in steps]
    shifted_columns = {
        name: data[column_name].shift(-step)
        for step, name in zip(steps, shifted_names)
    }
    # assign() works on a copy; fillna(0) replaces every NaN with zero
    windowed = data.assign(**shifted_columns).fillna(0)
    return windowed, [column_name, *shifted_names]


## Training

In [4]:
def univariate_piplines(window_length, data, column_name, target_name, portion, model):
    """Window one input column, scale it, split chronologically and fit ``model``.

    Parameters
    ----------
    window_length : int
        Window size passed to ``window_input``.
    data : pandas.DataFrame
        Source frame; it is not modified.
    column_name : str
        Column that is windowed into the feature matrix.
    target_name : str
        Column used as the regression target.
    portion : float
        Fraction of rows (taken from the start) used for training; the
        remaining rows form the test set.
    model : estimator
        Object exposing ``fit(X, y)``; whatever ``fit`` returns is handed back
        as the trained model.

    Returns
    -------
    tuple
        ``(trained_model, minmax_model, X_test, y_test)`` where ``X_test`` is
        already scaled with ``minmax_model`` and ``y_test`` is flattened.
    """
    # preparing data
    new_data, add_columns = window_input(window_length, data, column_name)
    X = new_data[add_columns].to_numpy()
    y = new_data[target_name].to_numpy()

    split_at = int(X.shape[0] * portion)
    X_train_raw, X_test_raw = X[:split_at], X[split_at:]
    y_train, y_test = y[:split_at], y[split_at:]

    # Fit the scaler on the training rows only: fitting on the full matrix
    # would leak the test set's min/max into preprocessing.
    minmax_model = MinMaxScaler().fit(X_train_raw)
    X_train = minmax_model.transform(X_train_raw)
    if X_test_raw.shape[0]:
        X_test = minmax_model.transform(X_test_raw)
    else:
        # transform() rejects empty input; keep an empty test set usable
        X_test = X_test_raw.astype(float)

    # train model
    trained_model = model.fit(X_train, y_train.ravel())
    return trained_model, minmax_model, X_test, y_test.ravel()

In [5]:
def multivariate_piplines(window_length, data, column_lst, target_name, portion, model):
    """Window several input columns, scale them, split chronologically and fit ``model``.

    Parameters
    ----------
    window_length : int
        Window size passed to ``window_input`` for every column.
    data : pandas.DataFrame
        Source frame; it is not modified.
    column_lst : list of str
        Columns that are windowed, in order, into the feature matrix.
    target_name : str
        Column used as the regression target.
    portion : float
        Fraction of rows (taken from the start) used for training; the
        remaining rows form the test set.
    model : estimator
        Object exposing ``fit(X, y)``; whatever ``fit`` returns is handed back
        as the trained model.

    Returns
    -------
    tuple
        ``(trained_model, minmax_model, X_test, y_test)`` where ``X_test`` is
        already scaled with ``minmax_model`` and ``y_test`` is flattened.
    """
    # preparing data
    new_data = data.copy()
    add_columns = []
    for column_name in column_lst:
        new_data, new_columns = window_input(window_length, new_data, column_name)
        add_columns.extend(new_columns)
    X = new_data[add_columns].to_numpy()
    y = new_data[target_name].to_numpy()

    split_at = int(X.shape[0] * portion)
    X_train_raw, X_test_raw = X[:split_at], X[split_at:]
    y_train, y_test = y[:split_at], y[split_at:]

    # Fit the scaler on the training rows only: fitting on the full matrix
    # would leak the test set's min/max into preprocessing.
    minmax_model = MinMaxScaler().fit(X_train_raw)
    X_train = minmax_model.transform(X_train_raw)
    if X_test_raw.shape[0]:
        X_test = minmax_model.transform(X_test_raw)
    else:
        # transform() rejects empty input; keep an empty test set usable
        X_test = X_test_raw.astype(float)

    # train model
    trained_model = model.fit(X_train, y_train.ravel())
    return trained_model, minmax_model, X_test, y_test.ravel()

## Result

In [6]:
def _feature_title(regressor, index):
    """Return the plot title for input column ``index``."""
    domain = getattr(regressor, 'domain', None)
    if domain is None:
        # Models without a `domain` attribute (e.g. scikit-learn estimators)
        # cannot name their inputs; fall back to the column position.
        return f'Input: feature {index}'
    return f'Input: {domain[index].name}'


def _plot_prediction_vs_actual(X, y_true, y_pred, regressor):
    """Scatter predicted and actual targets against each column of ``X``.

    With more than one column each plot is shown briefly via ``plt.pause`` and
    then cleared; with a single column the plot is left on the current figure.
    """
    n_features = X.shape[1]
    for i in range(n_features):
        plt.plot(X[:, i], y_pred, '.', label='Prediction')
        plt.plot(X[:, i], y_true, '.', label='Actual')
        plt.ylabel('PM 2.5')
        plt.xlabel('Rain volume')
        plt.title(_feature_title(regressor, i))
        plt.legend()
        plt.grid()
        if n_features > 1:
            plt.pause(0.0001)
            plt.clf()


def plot_regression(X_train, y_train, X_test, y_test, model_path):
    """Load a pickled regressor and plot its fit on the train and test sets.

    Parameters
    ----------
    X_train, X_test : 2-D arrays
        Feature matrices; every column is plotted on the x-axis in turn.
    y_train, y_test : 1-D arrays
        Actual target values for the corresponding feature matrix.
    model_path : str
        Path of a pickle file holding an object with a ``predict`` method.
    """
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only pass model files that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        regrssor = pickle.load(model_file)
    y_pred_test = regrssor.predict(X_test)
    y_pred_train = regrssor.predict(X_train)

    # Train fitting
    _plot_prediction_vs_actual(X_train, y_train, y_pred_train, regrssor)

    # Test fitting goes on a fresh figure
    plt.figure()
    _plot_prediction_vs_actual(X_test, y_test, y_pred_test, regrssor)

    plt.show()


In [7]:
def Evaluation_Model(model_dict, X_test, y_test):
    """Print regression metrics on the test set for every model in ``model_dict``.

    For each model the function prints the residual and total sums of squares,
    the sample and feature counts, the mean squared error, scikit-learn's
    R2 score, a manually computed R2 and the adjusted R2.

    Parameters
    ----------
    model_dict : dict
        Mapping of display name -> fitted model exposing ``predict(X)``.
    X_test : 2-D array
        Test features; ``X_test.shape[1]`` is used as the number of predictors.
    y_test : array-like
        Actual target values for ``X_test``.
    """
    # Flatten so a column vector cannot broadcast against 1-D predictions
    y_true = np.asarray(y_test).ravel()
    n_samples = len(y_true)
    n_features = X_test.shape[1]
    for model_name, model in model_dict.items():
        print(f'Model {model_name}')
        y_pred = np.asarray(model.predict(X_test)).ravel()
        SS_Residual = np.sum((y_true - y_pred) ** 2)
        # The total sum of squares is taken around the mean of the OBSERVED
        # values; using the mean of the predictions gives a wrong R2.
        SS_Total = np.sum((y_true - np.mean(y_true)) ** 2)
        r_squared = 1 - (float(SS_Residual)) / SS_Total
        print(SS_Residual, SS_Total)
        print(n_samples, n_features)
        dof = n_samples - n_features - 1
        if dof > 0:
            adjusted_r_squared = 1 - (1 - r_squared) * (n_samples - 1) / dof
        else:
            # adjusted R2 is undefined without positive residual degrees of freedom
            adjusted_r_squared = float('nan')
        # The mean squared error
        print("Mean squared error: %.2f" % mean_squared_error(y_true, y_pred))
        # The coefficient of determination: 1 is perfect prediction
        print("Coefficient of determination or R2-score: %.2f" % r2_score(y_true, y_pred))
        print('R square:', r_squared)
        print('Adjusted R square:', adjusted_r_squared)
        print('-------------------------------------')