In [31]:
import os 

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Plots
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 18, 7

import warnings
warnings.filterwarnings('ignore')

from tpot import TPOTRegressor

import sidetable as stb
import plotly.express as px

In [20]:
from datetime import datetime
def dateparse(d,t):
    dt = d + " " + t
    return pd.datetime.strptime(dt, '%d/%m/%Y %H.%M.%S')

df = pd.read_csv('data/AirQualityUCI_combined.csv',sep=';',decimal=',', parse_dates={'datetime': ['Date', 'Time']})
df.drop(['Unnamed: 15'],axis=1, inplace=True)

# From the dataset description we know that unknown/missing values are marked as -200.0.
# we fill those values using ffill: propagate last valid observation forward
df = df.replace({-200.0:np.nan})
df = df.fillna(method='ffill')
df = df.dropna(axis=0)
df = df.set_index('datetime')
ground_truth = ['CO(GT)','C6H6(GT)','NMHC(GT)','NOx(GT)','NO2(GT)']
df = df.drop(ground_truth,axis=1)
df = df.loc['2004-04-01 00:00:00':'2005-01-07 00:00:00']
df = df.sort_index()
df = ( df - df.min() ) / (df.max() - df.min()) # Min-Max normalize the dataset
df_orig = df.copy()

In [21]:
label_col = 'PT08.S2(NMHC)'
len_forecast = 500
window_size = 100

In [35]:
features = df_orig.drop(label_col,axis=1)
target = df_orig[label_col]

In [36]:
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    train_size=0.75, test_size=0.25)

In [37]:
tpot = TPOTRegressor(generations=4, population_size=100, verbosity=2,)#config_dict="TPOT cuML")
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/500 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.0005614699060210702

Generation 2 - Current best internal CV score: -0.00048372641085816473

Generation 3 - Current best internal CV score: -0.00048372641085816473

Generation 4 - Current best internal CV score: -0.00048372641085816473

Best pipeline: XGBRegressor(input_matrix, learning_rate=0.1, max_depth=10, min_child_weight=2, n_estimators=100, n_jobs=1, objective=reg:squarederror, subsample=0.4, verbosity=0)


TPOTRegressor(generations=4, verbosity=2)

In [38]:
print("TPOT cross-validation MSE")
print(tpot.score(X_test, y_test))

TPOT cross-validation MSE
-0.0004380828096052712


In [49]:

print('MSE:')
print(mean_squared_error(y_test, tpot.predict(X_test)))
print('MAE:')
print(mean_absolute_error(y_test, tpot.predict(X_test)))

MSE:
0.0004380828096052712
MAE:
0.014559382762427077


In [40]:
tpot.export('tpot_optimized_pipeline.py')

In [42]:
df_predicted = pd.DataFrame({'Predicted':tpot.predict(X_test)})
df_predicted.index = y_test.index

In [43]:
df_result = pd.merge(df_orig,df_predicted,how='left',left_index = True, right_index = True)

In [52]:
fig = px.line(df_result.dropna(axis=0), x=df_result.dropna(axis=0).index, y = [label_col,'Predicted'])
fig.show()