In [71]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

# Load the training measurements and convert the `id` column (the observation
# timestamp, stored as text in the CSV) to datetime in the same expression.
data = pd.read_csv('data/imputed_train.csv').assign(
    id=lambda frame: pd.to_datetime(frame['id'])
)
data

Unnamed: 0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25
0,2020-01-01 00:00:00,42.900000,0.718000,15.7,73.1,64.4
1,2020-01-01 01:00:00,33.600000,0.587000,10.1,74.8,66.0
2,2020-01-01 02:00:00,29.300000,0.400655,5.1,51.0,44.9
3,2020-01-01 03:00:00,30.500000,0.246000,7.2,27.7,25.1
4,2020-01-01 04:00:00,29.300000,0.204000,8.3,15.3,13.6
...,...,...,...,...,...,...
40986,2024-09-03 18:00:00,17.713860,0.222000,55.1,12.0,5.3
40987,2024-09-03 19:00:00,21.932757,0.245000,48.2,13.4,7.0
40988,2024-09-03 20:00:00,23.265996,0.234000,44.5,12.4,7.1
40989,2024-09-03 21:00:00,33.122175,0.225000,25.9,10.6,5.4


In [78]:
# Reshape wide -> long: one row per (timestamp, pollutant) pair, with the
# pollutant column name in `Pollutant` and its measurement in `Value`.
pollutant_columns = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']
data_long = data.melt(
    id_vars=['id'],
    value_vars=pollutant_columns,
    var_name='Pollutant',
    value_name='Value',
)

# Each pollutant becomes its own series (item), indexed by the `id` timestamp.
train_data = TimeSeriesDataFrame.from_data_frame(
    data_long, id_column="Pollutant", timestamp_column="id"
)
train_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
item_id,timestamp,Unnamed: 2_level_1
valeur_NO2,2020-01-01 00:00:00,42.9
valeur_NO2,2020-01-01 01:00:00,33.6
valeur_NO2,2020-01-01 02:00:00,29.3
valeur_NO2,2020-01-01 03:00:00,30.5
valeur_NO2,2020-01-01 04:00:00,29.3


In [79]:
# Predictor configuration.
# - prediction_length: 504 steps; with hourly data that is 21 days.
# - path: directory where AutoGluon saves the trained models.
#   NOTE(review): the name looks carried over from an M4-hourly example and
#   does not describe this dataset - consider renaming once nothing else
#   depends on it.
predictor_settings = dict(
    prediction_length=504,
    path="autogluon-m4-hourly",
    target="Value",
    eval_metric="MAE",
)
predictor = TimeSeriesPredictor(**predictor_settings)

# Training budget: the "medium_quality" preset, capped at 600 seconds.
fit_settings = dict(presets="medium_quality", time_limit=600)
predictor.fit(train_data, **fit_settings)

Beginning AutoGluon training... Time limit = 600s
AutoGluon will save models to 'autogluon-m4-hourly'
AutoGluon Version:  1.0.0
Python Version:     3.10.14
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
GPU Count:          0
Memory Avail:       11.00 GB / 31.70 GB (34.7%)
Disk Space Avail:   68.29 GB / 476.00 GB (14.3%)
Setting presets to: medium_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MAE,
 'hyperparameters': 'light',
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 504,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'target': 'Value',
 'time_limit': 600,
 'verbosity': 2}

Inferred time series frequency: 'H'
Provided train_data has 204955 rows, 5 time series. Median time series length is 40991 (min=40991, max=40991). 

Provided dataset contains following columns:
	target:  

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x231d80de110>

In [83]:
# Forecast the horizon that follows the end of each training series, then keep
# only the "mean" column and discard the other forecast columns.
full_forecast = predictor.predict(train_data)
prediction = full_forecast[["mean"]]
prediction

Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


Unnamed: 0_level_0,Unnamed: 1_level_0,mean
item_id,timestamp,Unnamed: 2_level_1
valeur_NO2,2024-09-03 23:00:00,20.151454
valeur_NO2,2024-09-04 00:00:00,19.795188
valeur_NO2,2024-09-04 01:00:00,18.223992
valeur_NO2,2024-09-04 02:00:00,17.992103
valeur_NO2,2024-09-04 03:00:00,19.417732
...,...,...
valeur_PM25,2024-09-24 18:00:00,8.706389
valeur_PM25,2024-09-24 19:00:00,8.764595
valeur_PM25,2024-09-24 20:00:00,9.172983
valeur_PM25,2024-09-24 21:00:00,8.609866


In [92]:
# Export the forecast in the submission layout: one row per timestamp, an `id`
# column holding that timestamp, and one column per pollutant.
SUBMISSION_PATH = Path('submissions/matt_autogluon_raw.csv')
SUBMISSION_COLUMNS = ['id', 'valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']

# Long (item_id, timestamp) -> wide: timestamps become the index, item ids the columns.
prediction_wide = prediction.reset_index().pivot(index='timestamp', columns='item_id', values='mean')
# Copy the timestamp index into a regular `id` column so it is written to the CSV.
prediction_wide["id"] = pd.to_datetime(prediction_wide.index)

# to_csv does not create missing parent directories; without this the export
# fails on a fresh checkout, after the expensive training run has already completed.
SUBMISSION_PATH.parent.mkdir(parents=True, exist_ok=True)
# index=False: the timestamp is already present as `id`, so the index is not written.
prediction_wide[SUBMISSION_COLUMNS].to_csv(SUBMISSION_PATH, index=False)

In [89]:
# Read the sample submission and display its header - presumably the column
# names and order a submission file has to follow.
sample = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')
sample.columns

Index(['id', 'valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10',
       'valeur_PM25'],
      dtype='object')

In [49]:
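# With all features
# TODO: the code for this cell is missing - the table below is output left over from an earlier run.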

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
item_id,timestamp,Unnamed: 2_level_1
valeur_NO2,2024-09-03 23:00:00,
valeur_CO,2024-09-03 23:00:00,
valeur_O3,2024-09-03 23:00:00,
valeur_PM10,2024-09-03 23:00:00,
valeur_PM25,2024-09-03 23:00:00,
...,...,...
valeur_NO2,2024-09-05 22:00:00,
valeur_CO,2024-09-05 22:00:00,
valeur_O3,2024-09-05 22:00:00,
valeur_PM10,2024-09-05 22:00:00,


In [81]:
# Run the project's preprocessing helper on the measurements. The displayed
# result shows additional columns next to the pollutant values (for example
# is_holiday, precipitation, wind_speed, Hour, HourOfDay_sin/cos).
from utils.preprocessing import preprocess_data
# NOTE(review): `data` is rebound to its own preprocessed version, so running
# this cell a second time feeds already-preprocessed data back into
# preprocess_data - confirm the helper tolerates that, or restart the kernel
# before re-running.
data = preprocess_data(data)
data

Unnamed: 0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25,is_holiday,is_jour_ferie,precipitation,wind_speed,...,Year,Month,Weekday,Day,Hour,is_weekend,DayOfYear_sin,DayOfYear_cos,HourOfDay_sin,HourOfDay_cos
0,2020-01-01 00:00:00,42.900000,0.718000,15.7,73.1,64.4,1,0,0.0,1.5,...,2020,1,2,1,0,False,0.017213,0.999852,0.000000,1.000000e+00
1,2020-01-01 01:00:00,33.600000,0.587000,10.1,74.8,66.0,1,0,0.0,2.6,...,2020,1,2,1,1,False,0.017213,0.999852,0.258819,9.659258e-01
2,2020-01-01 02:00:00,29.300000,0.400655,5.1,51.0,44.9,1,0,0.0,1.9,...,2020,1,2,1,2,False,0.017213,0.999852,0.500000,8.660254e-01
3,2020-01-01 03:00:00,30.500000,0.246000,7.2,27.7,25.1,1,0,0.0,1.8,...,2020,1,2,1,3,False,0.017213,0.999852,0.707107,7.071068e-01
4,2020-01-01 04:00:00,29.300000,0.204000,8.3,15.3,13.6,1,0,0.0,2.2,...,2020,1,2,1,4,False,0.017213,0.999852,0.866025,5.000000e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40986,2024-09-03 18:00:00,17.713860,0.222000,55.1,12.0,5.3,0,0,0.0,1.2,...,2024,9,1,3,18,False,-0.895839,-0.444378,-1.000000,-1.836970e-16
40987,2024-09-03 19:00:00,21.932757,0.245000,48.2,13.4,7.0,0,0,0.0,1.7,...,2024,9,1,3,19,False,-0.895839,-0.444378,-0.965926,2.588190e-01
40988,2024-09-03 20:00:00,23.265996,0.234000,44.5,12.4,7.1,0,0,0.0,2.0,...,2024,9,1,3,20,False,-0.895839,-0.444378,-0.866025,5.000000e-01
40989,2024-09-03 21:00:00,33.122175,0.225000,25.9,10.6,5.4,0,0,0.0,1.4,...,2024,9,1,3,21,False,-0.895839,-0.444378,-0.707107,7.071068e-01
