# Data Aggregation Across Data Sources

We have 3 different sources of data:

1. Our sensor data: Indoor Air Quality and Indoor Environmental Data.

2. SINAICA: Outdoor Air Quality Monitoring Data from the Government.

3. OpenWeatherData: Outdoor Environmental Data.

We need to make this data available to the models we plan to train. The following sections detail that process.

In [1]:
import os, gzip, json, re, stan, dplython, asyncio, nest_asyncio
#nest_asyncio.apply()
import warnings
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore", category=DeprecationWarning)
from dplython import (DplyFrame, X, diamonds, select, sift,
  sample_n, sample_frac, head, arrange, mutate, group_by,
  summarize, DelayFunction, dfilter)
import seaborn as sns
from plotnine import *
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import (mean_squared_error, 
                             r2_score,
                             mean_absolute_error)
import pandas as pd
import numpy as np
from IPython.display import display, Markdown, update_display
DEBUG=True
if DEBUG:
  def _announce_pandas_limits(heading):
    """Render `heading` followed by the current pandas display limits."""
    display(Markdown(heading))
    display(Markdown(f"* pandas max_columns={pd.options.display.max_columns}\n" +
                     f"* pandas max_rows={pd.options.display.max_rows}"))

  # Show the limits before and after widening the column limit, so the
  # merged frames further down can be compared against the defaults.
  _announce_pandas_limits("Default Values:")
  pd.options.display.max_columns=35
  #pd.options.display.max_rows=100
  _announce_pandas_limits("New Values:")



Default Values:

* pandas max_columns=20
* pandas max_rows=60

New Values:

* pandas max_columns=35
* pandas max_rows=60

## Indoor Data

In [2]:
# Indoor sensor readings.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
airdata_raw = pd.read_pickle('data/airdata/air.pickle')

# Derive the calendar fields with the vectorised `.dt` accessor instead of one
# Python-level loop per field over every row. `to_datetime` is a no-op for a
# datetime64 column and keeps the cell working if the column is stored as
# objects. The cast keeps the int64 dtype the list-comprehension version had.
airdata_timestamps = pd.to_datetime(airdata_raw["datetime"])
airdata = (
  airdata_raw
  .assign(**{
    field: getattr(airdata_timestamps.dt, field).astype("int64")
    for field in ("year", "month", "day", "hour", "minute", "second")
  })
  .set_index("datetime")
  .sort_index()
)
airdata

Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-12 06:04:09.089621067,21.54,777.41,43.93,151328,37.5,1,2021,2,12,6,4,9
2021-02-12 06:04:12.087778807,21.56,777.41,43.89,152702,35.6,1,2021,2,12,6,4,12
2021-02-12 06:04:15.072475433,21.53,777.41,43.97,151328,37.5,1,2021,2,12,6,4,15
2021-02-12 06:04:18.070170164,21.51,777.41,44.03,151464,38.5,1,2021,2,12,6,4,18
2021-02-12 06:04:21.061994791,21.51,777.41,44.05,152425,36.9,1,2021,2,12,6,4,21
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:20:38.889113188,25.84,782.96,56.64,928867,130.8,1,2021,9,18,1,20,38
2021-09-18 01:20:41.882042885,25.83,782.94,56.66,923130,131.5,1,2021,9,18,1,20,41
2021-09-18 01:20:44.877856970,25.83,782.94,56.63,925034,131.3,1,2021,9,18,1,20,44
2021-09-18 01:20:47.872255564,25.83,782.94,56.62,923130,131.9,1,2021,9,18,1,20,47


## Outdoor Air Quality Data

In [3]:
# Government (SINAICA) outdoor air-quality readings.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
sinaica_raw = pd.read_pickle('data/sinaica2/dsinaica.pickle')

# Keep one station per pollutant under a short name...
sinaica_renamed = sinaica_raw.rename(columns={
  "Merced_CO": "CO",
  "Camarones_NO": "NO",
  "Merced_NO2": "NO2",
  "Merced_NOx": "NOx",
  "Merced_O3": "O3",
  "Merced_PM10": "PM10",
  "Merced_PM2.5": "PM2.5",
  "Merced_SO2": "SO2"
})
# ...and drop every column that still carries a station prefix. This must run
# after the rename, otherwise the selected columns would be dropped as well.
station_columns = [
  col
  for col in sinaica_renamed.columns
  if re.match('^(Camaron|Gustavo|Miguel|Tlalne|FES|Merced|La Pre)', col)
]
sinaica_selected = sinaica_renamed.drop(columns=station_columns)

# Vectorised calendar fields (cast to int64 to match the previous dtype).
sinaica_timestamps = pd.to_datetime(sinaica_selected["Fecha"])
sinaica = (
  sinaica_selected
  .assign(**{
    field: getattr(sinaica_timestamps.dt, field).astype("int64")
    for field in ("year", "month", "day", "hour", "minute")
  })
  .set_index("Fecha")
  .sort_index()
)
sinaica

Unnamed: 0_level_0,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,year,month,day,hour,minute
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-01-01 00:00:00,0.006000,1.000000,0.032000,0.036000,0.006000,31.000000,19.000000,0.003000,2021,1,1,0,0
2021-01-01 01:00:00,0.021000,,,,,,,,2021,1,1,1,0
2021-01-01 02:00:00,0.013000,1.100000,0.032000,0.039000,0.004000,37.000000,24.000000,0.003000,2021,1,1,2,0
2021-01-01 03:00:00,0.031000,1.200000,0.033000,0.043000,0.001000,49.000000,39.000000,0.003000,2021,1,1,3,0
2021-01-01 04:00:00,0.005000,1.200000,0.031000,0.039000,0.002000,80.000000,65.000000,0.003000,2021,1,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-10-04 00:00:00,0.008292,0.545833,0.019083,0.026875,0.015833,11.826087,7.913043,0.000750,2021,10,4,0,0
2021-10-05 00:00:00,0.010000,0.563158,0.019722,0.030500,0.012278,11.090909,6.772727,0.000556,2021,10,5,0,0
2021-10-06 00:00:00,0.007571,0.672222,0.026111,0.035611,0.011000,18.722222,11.833333,0.000111,2021,10,6,0,0
2021-10-07 00:00:00,0.011565,0.713636,0.028636,0.040318,0.017909,26.772727,17.000000,0.001045,2021,10,7,0,0


## Outdoor Weather Data

In [4]:
# OpenWeatherMap observations, indexed and sorted by timestamp.
# Calendar fields (year, month, ...) are intentionally not derived here.
unused_weather_columns = ['clouds_all', "weather_id", 'rain_1h', 'rain_3h',
                          'temp_max', 'temp_min']
weather = (
  pd.read_pickle("data/openweathermap/weather.pickle.gz")
  .rename(columns={'temp': 'temperature'})
  .set_index("dt")
  .sort_index()
  .drop(columns=unused_weather_columns)
)
weather

Unnamed: 0_level_0,temperature,feels_like,pressure,humidity,wind_speed,wind_deg,weather_main
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-02-12 07:00:00,13.87,12.46,1020,44,0.00,0,Clear
2021-02-12 08:00:00,12.81,11.37,1020,47,0.00,0,Clear
2021-02-12 09:00:00,10.83,9.35,1019,53,1.54,60,Clear
2021-02-12 10:00:00,6.40,3.51,1019,61,4.12,40,Clear
2021-02-12 11:00:00,6.23,6.23,1019,57,0.00,0,Clear
...,...,...,...,...,...,...,...
2021-09-27 19:00:00,21.51,20.89,1006,45,0.89,139,Clear
2021-09-27 20:00:00,23.18,22.81,1005,48,0.45,224,Rain
2021-09-27 21:00:00,22.21,21.69,1025,46,6.17,220,Rain
2021-09-27 22:00:00,21.03,20.68,1004,57,0.45,242,Rain


## Merging the 3 Datasets: Indoor Data, Outdoor Air Quality Data, Outdoor Weather Data.

### Merging Air Quality and Weather Data

In [5]:
# Left-join the weather observations onto the pollutant readings by timestamp;
# the calendar helper columns are not needed in the joined frame.
sinaica_pollutants = sinaica.drop(columns=['year', 'month', 'day', 'hour', 'minute'])
outdoor_all = sinaica_pollutants.join(weather, rsuffix='_weather')

# Keep only the outdoor rows that fall inside the indoor recording window.
indoor_start = airdata.index.min()
indoor_end = airdata.index.max()
inside_indoor_window = (outdoor_all.index >= indoor_start) & (outdoor_all.index <= indoor_end)
outdoor = outdoor_all.loc[inside_indoor_window].sort_index()
outdoor

Unnamed: 0,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature,feels_like,pressure,humidity,wind_speed,wind_deg,weather_main
2021-02-12 07:00:00,0.244000,2.500000,0.035000,0.205000,0.002000,57.000000,25.000000,0.005000,13.87,12.46,1020.0,44.0,0.00,0.0,Clear
2021-02-12 08:00:00,0.146000,1.600000,0.030000,0.089000,0.004000,67.000000,33.000000,0.003000,12.81,11.37,1020.0,47.0,0.00,0.0,Clear
2021-02-12 09:00:00,0.099000,1.500000,0.039000,0.072000,0.012000,50.000000,28.000000,0.002000,10.83,9.35,1019.0,53.0,1.54,60.0,Clear
2021-02-12 10:00:00,0.024000,1.200000,0.030000,0.047000,0.025000,40.000000,21.000000,0.002000,6.40,3.51,1019.0,61.0,4.12,40.0,Clear
2021-02-12 11:00:00,0.009000,0.900000,0.016000,0.026000,0.033000,33.000000,19.000000,0.001000,6.23,6.23,1019.0,57.0,0.00,0.0,Clear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-14 00:00:00,0.017000,0.716667,0.015125,0.024625,0.013333,7.047619,4.500000,0.000125,15.85,14.95,1025.0,56.0,4.12,170.0,Rain
2021-09-15 00:00:00,0.027458,0.954167,0.028167,0.048625,0.019375,24.416667,17.333333,0.000875,17.95,17.08,1023.0,49.0,7.72,130.0,Clouds
2021-09-16 00:00:00,0.006875,0.883333,0.028000,0.034458,0.022792,46.833333,40.041667,0.001542,18.45,17.63,1022.0,49.0,6.17,130.0,Smoke
2021-09-17 00:00:00,0.010250,0.947619,0.032700,0.042750,0.026650,29.666667,24.875000,0.003350,18.34,17.69,1024.0,56.0,4.63,300.0,Rain


In [6]:
# Spot-check the last outdoor rows that survive the window filter.
is_after_cutoff = outdoor.index >= "2021-09-15 23:59"
outdoor.loc[is_after_cutoff]

Unnamed: 0,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature,feels_like,pressure,humidity,wind_speed,wind_deg,weather_main
2021-09-16,0.006875,0.883333,0.028,0.034458,0.022792,46.833333,40.041667,0.001542,18.45,17.63,1022.0,49.0,6.17,130.0,Smoke
2021-09-17,0.01025,0.947619,0.0327,0.04275,0.02665,29.666667,24.875,0.00335,18.34,17.69,1024.0,56.0,4.63,300.0,Rain
2021-09-18,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,17.61,17.85,1015.0,93.0,1.37,199.0,Rain


### Merging Indoor and Outdoor (Air Quality and Weather) Data

In [7]:
# Attach to each indoor sample the most recent outdoor record, but only when
# that record is at most 3 seconds old; every other indoor row gets NaN in the
# outdoor columns and is filled in later by interpolation.
asof_options = dict(
  left_index=True,
  right_index=True,
  suffixes=('', '_outdoor'),
  tolerance=pd.Timedelta('3 seconds'),
  direction="backward",
)
data = pd.merge_asof(airdata, outdoor, **asof_options)

# Preview the rows around a boundary where an outdoor record is matched.
around_match = data.index >= "2021-09-16 23:59:40"
data.loc[around_match].head(10)

Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg,weather_main
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2021-09-16 23:59:42.445474625,25.86,780.44,57.0,877365,246.4,1,2021,9,16,23,59,42,,,,,,,,,,,,,,,
2021-09-16 23:59:45.439809322,25.85,780.4,57.04,874512,246.7,1,2021,9,16,23,59,45,,,,,,,,,,,,,,,
2021-09-16 23:59:48.434415102,25.85,780.42,57.07,877365,245.9,1,2021,9,16,23,59,48,,,,,,,,,,,,,,,
2021-09-16 23:59:51.428925753,25.84,780.42,57.07,872810,246.9,1,2021,9,16,23,59,51,,,,,,,,,,,,,,,
2021-09-16 23:59:54.423571348,25.84,780.42,57.07,872244,247.7,1,2021,9,16,23,59,54,,,,,,,,,,,,,,,
2021-09-16 23:59:57.418201685,25.84,780.44,57.06,868302,249.6,1,2021,9,16,23,59,57,,,,,,,,,,,,,,,
2021-09-17 00:00:00.412962675,25.84,780.42,57.02,869987,250.0,1,2021,9,17,0,0,0,0.01025,0.947619,0.0327,0.04275,0.02665,29.666667,24.875,0.00335,18.34,17.69,1024.0,56.0,4.63,300.0,Rain
2021-09-17 00:00:03.407538652,25.84,780.44,57.02,877365,248.1,1,2021,9,17,0,0,3,,,,,,,,,,,,,,,
2021-09-17 00:00:06.402314186,25.83,780.42,57.05,868302,249.8,1,2021,9,17,0,0,6,,,,,,,,,,,,,,,
2021-09-17 00:00:09.396840096,25.84,780.42,57.0,875651,248.6,1,2021,9,17,0,0,9,,,,,,,,,,,,,,,


In [8]:
# Summarise the size of the merged frame.
n_rows, n_columns = data.shape
Markdown("Dataset with Indoor and Outdoor Data:\n* %d Rows\n* %d Columns."%(n_rows, n_columns))

Dataset with Indoor and Outdoor Data:
* 6285103 Rows
* 27 Columns.

In [9]:
# Rows where every column, indoor and outdoor, has a value.
data.dropna(axis=0, how="any")

Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg,weather_main
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2021-02-12 07:00:02.979657173,21.51,777.30,43.74,144561,95.2,1,2021,2,12,7,0,2,0.244000,2.500000,0.035000,0.205000,0.002000,57.000000,25.000000,0.005000,13.87,12.46,1020.0,44.0,0.00,0.0,Clear
2021-02-12 08:00:01.982832432,21.04,776.92,42.35,153539,78.3,1,2021,2,12,8,0,1,0.146000,1.600000,0.030000,0.089000,0.004000,67.000000,33.000000,0.003000,12.81,11.37,1020.0,47.0,0.00,0.0,Clear
2021-02-12 09:00:00.729691744,20.41,776.33,42.56,153820,99.0,1,2021,2,12,9,0,0,0.099000,1.500000,0.039000,0.072000,0.012000,50.000000,28.000000,0.002000,10.83,9.35,1019.0,53.0,1.54,60.0,Clear
2021-02-12 10:00:02.449775934,20.27,776.20,42.21,144066,178.7,1,2021,2,12,10,0,2,0.024000,1.200000,0.030000,0.047000,0.025000,40.000000,21.000000,0.002000,6.40,3.51,1019.0,61.0,4.12,40.0,Clear
2021-02-12 11:00:01.044736862,19.91,776.25,42.26,142117,212.6,1,2021,2,12,11,0,1,0.009000,0.900000,0.016000,0.026000,0.033000,33.000000,19.000000,0.001000,6.23,6.23,1019.0,57.0,0.00,0.0,Clear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-14 00:00:02.855808020,24.31,780.42,55.46,873944,230.3,3,2021,9,14,0,0,2,0.017000,0.716667,0.015125,0.024625,0.013333,7.047619,4.500000,0.000125,15.85,14.95,1025.0,56.0,4.12,170.0,Rain
2021-09-15 00:00:01.543255568,25.52,779.38,54.92,938590,125.5,1,2021,9,15,0,0,1,0.027458,0.954167,0.028167,0.048625,0.019375,24.416667,17.333333,0.000875,17.95,17.08,1023.0,49.0,7.72,130.0,Clouds
2021-09-16 00:00:01.135869265,27.09,778.30,48.41,1222727,148.3,3,2021,9,16,0,0,1,0.006875,0.883333,0.028000,0.034458,0.022792,46.833333,40.041667,0.001542,18.45,17.63,1022.0,49.0,6.17,130.0,Smoke
2021-09-17 00:00:00.412962675,25.84,780.42,57.02,869987,250.0,1,2021,9,17,0,0,0,0.010250,0.947619,0.032700,0.042750,0.026650,29.666667,24.875000,0.003350,18.34,17.69,1024.0,56.0,4.63,300.0,Rain


## Imputations 

We found that the resulting dataframe after merging 2 datasets (Outdoor Data that is sampled every 1 hour and Indoor Data that is sampled every 3 seconds) contains repeated records on the columns of hourly data: SINAICA Gov't Air Quality Monitoring and OpenWeatherData. 

We think that the repeated data can be an issue, because the values jump abruptly between consecutive hourly records (for example, between the record at 10:57 and the one at 11:00). This matters because such data does not represent the real world correctly: temperature, pressure, and natural features in general move slowly from one value to the next. However, we do not have data at that resolution, and it is not easily obtainable.

Therefore, we propose an approach similar to imputation by interpolation with added noise, which could help avoid overfitting when training our machine learning and deep learning models.

In [10]:
%%time
### The first and last indoor rows need outdoor values to anchor the
### interpolation, so we look up the hourly records that bracket the series.

# Start of the hour that contains the first indoor sample.
first_data = data.index[0].replace(minute=0, second=0, microsecond=0, nanosecond=0)
# Start of the hour after the last indoor sample. Timedelta arithmetic is used
# because building the timestamp with `hour + 1` raises ValueError whenever the
# last sample falls in hour 23 (and would also need manual day/month rollover).
last_data = (
  data.index[-1].replace(minute=0, second=0, microsecond=0, nanosecond=0)
  + pd.Timedelta(hours=1)
)

# Raw OpenWeatherMap export; `dt` is given in seconds since the Unix epoch.
weather2 = pd.read_csv("data/openweathermap/2f101ea00e7759ea8723b848ac8b18d0.csv")
weather2["dt"] = pd.to_datetime(weather2["dt"], unit='s')
weather2 = weather2.set_index("dt", drop=True)
weather2 = weather2[["temp", "feels_like", "temp_min", "temp_max",
         "pressure", "humidity", "wind_speed", "wind_deg", "rain_1h", "rain_3h",
         "clouds_all", "weather_id", "weather_main"]]
# Raises KeyError if either bracketing hour is missing from the export.
weather2 = weather2.loc[[first_data, last_data]]
#display(Markdown("Weather data:"))
#display(weather2)

# For SINAICA, use the exact first hour and the latest record available at or
# before `last_data`.
last_sinaica_timestamp = sinaica.loc[sinaica.index <= last_data].iloc[-1].name
sinaica2 = sinaica.loc[[first_data, last_sinaica_timestamp]]
#display(Markdown("SINAICA data:"))
#display(sinaica2)

CPU times: user 999 ms, sys: 176 ms, total: 1.18 s
Wall time: 1.17 s


In [11]:
%%time
### Seed the first and last rows of `data` with the bracketing outdoor records.

# Pollutant columns have the same name in `data` and in `sinaica2`.
sinaica_fields = ["NO", "CO", "NO2", "NOx", "O3", "PM10", "PM2.5", "SO2"]
# Column in `data` -> column in `weather2` that supplies it.
weather_fields = {
  "temperature_outdoor": "temp",
  "feels_like": "feels_like",
  "pressure_outdoor": "pressure",
  "humidity_outdoor": "humidity",
  "wind_speed": "wind_speed",
  "wind_deg": "wind_deg",
  "weather_main": "weather_main",
}

def _seed_outdoor_values(frame, position):
  """Copy the outdoor record at `position` of `sinaica2`/`weather2` into `frame`.

  `position` is 0 for the first row and -1 for the last one. Values are written
  cell by cell through `iloc`, so the assignment lands on `frame` itself
  (no SettingWithCopyWarning) and the dtypes of the other columns are untouched.
  """
  sinaica_row = sinaica2.iloc[position]
  weather_row = weather2.iloc[position]
  for column in sinaica_fields:
    frame.iloc[position, frame.columns.get_loc(column)] = sinaica_row[column]
  for column, source in weather_fields.items():
    frame.iloc[position, frame.columns.get_loc(column)] = weather_row[source]

_seed_outdoor_values(data, 0)    # first data
_seed_outdoor_values(data, -1)   # last data
data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 1.9 s, sys: 1.09 s, total: 2.99 s
Wall time: 2.99 s


Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg,weather_main
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2021-02-12 06:04:09.089621067,21.54,777.41,43.93,151328,37.5,1,2021,2,12,6,4,9,0.205000,2.200000,0.031000,0.207000,0.002000,45.000000,22.000000,0.004000,14.93,13.6,1021.0,43.0,2.57,110.0,Clear
2021-02-12 06:04:12.087778807,21.56,777.41,43.89,152702,35.6,1,2021,2,12,6,4,12,,,,,,,,,,,,,,,
2021-02-12 06:04:15.072475433,21.53,777.41,43.97,151328,37.5,1,2021,2,12,6,4,15,,,,,,,,,,,,,,,
2021-02-12 06:04:18.070170164,21.51,777.41,44.03,151464,38.5,1,2021,2,12,6,4,18,,,,,,,,,,,,,,,
2021-02-12 06:04:21.061994791,21.51,777.41,44.05,152425,36.9,1,2021,2,12,6,4,21,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:20:38.889113188,25.84,782.96,56.64,928867,130.8,1,2021,9,18,1,20,38,,,,,,,,,,,,,,,
2021-09-18 01:20:41.882042885,25.83,782.94,56.66,923130,131.5,1,2021,9,18,1,20,41,,,,,,,,,,,,,,,
2021-09-18 01:20:44.877856970,25.83,782.94,56.63,925034,131.3,1,2021,9,18,1,20,44,,,,,,,,,,,,,,,
2021-09-18 01:20:47.872255564,25.83,782.94,56.62,923130,131.9,1,2021,9,18,1,20,47,,,,,,,,,,,,,,,


Here we can see the first and last data points to create the interpolation for the first and last values:

In [12]:
%%time
# Work on an independent (deep) copy so the interpolation below leaves the
# merged frame `data` untouched.
df2 = data.copy(deep=True)
df2

CPU times: user 340 ms, sys: 293 ms, total: 633 ms
Wall time: 632 ms


Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg,weather_main
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2021-02-12 06:04:09.089621067,21.54,777.41,43.93,151328,37.5,1,2021,2,12,6,4,9,0.205000,2.200000,0.031000,0.207000,0.002000,45.000000,22.000000,0.004000,14.93,13.6,1021.0,43.0,2.57,110.0,Clear
2021-02-12 06:04:12.087778807,21.56,777.41,43.89,152702,35.6,1,2021,2,12,6,4,12,,,,,,,,,,,,,,,
2021-02-12 06:04:15.072475433,21.53,777.41,43.97,151328,37.5,1,2021,2,12,6,4,15,,,,,,,,,,,,,,,
2021-02-12 06:04:18.070170164,21.51,777.41,44.03,151464,38.5,1,2021,2,12,6,4,18,,,,,,,,,,,,,,,
2021-02-12 06:04:21.061994791,21.51,777.41,44.05,152425,36.9,1,2021,2,12,6,4,21,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:20:38.889113188,25.84,782.96,56.64,928867,130.8,1,2021,9,18,1,20,38,,,,,,,,,,,,,,,
2021-09-18 01:20:41.882042885,25.83,782.94,56.66,923130,131.5,1,2021,9,18,1,20,41,,,,,,,,,,,,,,,
2021-09-18 01:20:44.877856970,25.83,782.94,56.63,925034,131.3,1,2021,9,18,1,20,44,,,,,,,,,,,,,,,
2021-09-18 01:20:47.872255564,25.83,782.94,56.62,923130,131.9,1,2021,9,18,1,20,47,,,,,,,,,,,,,,,


In [13]:
%%time
# Numeric outdoor columns: fill the gaps between matched records linearly.
# NOTE(review): method="linear" spaces rows equally and ignores the timestamps;
# if the sensor has recording gaps, method="time" may be more faithful — confirm
# before changing, since it alters the resulting values.
linear_columns = [
  "NO", "CO", "NO2", "NOx", "O3", "PM10", "PM2.5", "SO2",
  "temperature_outdoor", "feels_like", "pressure_outdoor",
  "humidity_outdoor", "wind_speed", "wind_deg",
]
df2[linear_columns] = df2[linear_columns].interpolate(
  method="linear", limit_direction='forward'
)
# Categorical weather label: carry the last known value forward. `ffill` is the
# supported spelling; `interpolate(method="pad")` on an object column is
# deprecated in recent pandas releases.
df2["weather_main"] = df2["weather_main"].ffill()
display(df2.head(3600))
display(df2.tail(3600))

Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg,weather_main
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2021-02-12 06:04:09.089621067,21.54,777.41,43.93,151328,37.5,1,2021,2,12,6,4,9,0.205000,2.200000,0.031000,0.207000,0.002000,45.000000,22.000000,0.004000,14.930000,13.600000,1021.000000,43.000000,2.570000,110.000000,Clear
2021-02-12 06:04:12.087778807,21.56,777.41,43.89,152702,35.6,1,2021,2,12,6,4,12,0.205036,2.200274,0.031004,0.206998,0.002000,45.010949,22.002737,0.004001,14.929033,13.598960,1020.999088,43.000912,2.567655,109.899635,Clear
2021-02-12 06:04:15.072475433,21.53,777.41,43.97,151328,37.5,1,2021,2,12,6,4,15,0.205071,2.200547,0.031007,0.206996,0.002000,45.021898,22.005474,0.004002,14.928066,13.597920,1020.998175,43.001825,2.565310,109.799270,Clear
2021-02-12 06:04:18.070170164,21.51,777.41,44.03,151464,38.5,1,2021,2,12,6,4,18,0.205107,2.200821,0.031011,0.206995,0.002000,45.032847,22.008212,0.004003,14.927099,13.596880,1020.997263,43.002737,2.562965,109.698905,Clear
2021-02-12 06:04:21.061994791,21.51,777.41,44.05,152425,36.9,1,2021,2,12,6,4,21,0.205142,2.201095,0.031015,0.206993,0.002000,45.043796,22.010949,0.004004,14.926131,13.595839,1020.996350,43.003650,2.560620,109.598540,Clear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-12 09:04:38.933268309,20.65,776.29,42.03,152564,107.7,1,2021,2,12,9,4,38,0.093207,1.476827,0.038305,0.070069,0.013004,49.227575,27.459302,0.002000,10.487816,8.898904,1019.000000,53.617940,1.739286,58.455150,Clear
2021-02-12 09:04:41.924828053,20.66,776.29,42.01,152149,107.9,1,2021,2,12,9,4,41,0.093145,1.476578,0.038297,0.070048,0.013015,49.219269,27.453488,0.002000,10.484136,8.894053,1019.000000,53.624585,1.741429,58.438538,Clear
2021-02-12 09:04:44.916538477,20.62,776.29,42.08,151737,109.2,1,2021,2,12,9,4,44,0.093082,1.476329,0.038290,0.070027,0.013026,49.210963,27.447674,0.002000,10.480457,8.889203,1019.000000,53.631229,1.743571,58.421927,Clear
2021-02-12 09:04:47.907913446,20.63,776.29,42.05,151464,110.7,1,2021,2,12,9,4,47,0.093020,1.476080,0.038282,0.070007,0.013037,49.202658,27.441860,0.002000,10.476777,8.884352,1019.000000,53.637874,1.745714,58.405316,Clear


Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg,weather_main
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2021-09-17 22:21:13.529673576,27.02,779.99,49.73,1049961,32.3,1,2021,9,17,22,21,13,0.009248,0.777732,0.028160,0.039298,0.016083,20.946691,16.973366,0.001445,17.660085,17.839022,1015.617485,90.461451,1.593667,205.929554,Rain
2021-09-17 22:21:16.524485349,27.02,780.01,49.74,1050781,31.8,1,2021,9,17,22,21,16,0.009248,0.777726,0.028160,0.039298,0.016082,20.946367,16.973072,0.001445,17.660060,17.839028,1015.617173,90.462733,1.593554,205.926054,Rain
2021-09-17 22:21:19.519613981,27.02,780.01,49.70,1044255,33.4,1,2021,9,17,22,21,19,0.009248,0.777719,0.028160,0.039298,0.016082,20.946043,16.972778,0.001445,17.660034,17.839034,1015.616861,90.464015,1.593441,205.922554,Rain
2021-09-17 22:21:22.514595508,27.02,779.99,49.69,1042636,35.0,1,2021,9,17,22,21,22,0.009248,0.777713,0.028160,0.039297,0.016082,20.945718,16.972484,0.001444,17.660009,17.839039,1015.616549,90.465297,1.593328,205.919055,Rain
2021-09-17 22:21:25.509442091,27.01,780.01,49.68,1047508,34.7,1,2021,9,17,22,21,25,0.009248,0.777707,0.028160,0.039297,0.016081,20.945394,16.972190,0.001444,17.659984,17.839045,1015.616238,90.466579,1.593215,205.915555,Rain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:20:38.889113188,25.84,782.96,56.64,928867,130.8,1,2021,9,18,1,20,38,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.298203,14.109265,1017.992588,89.009883,2.108172,349.626930,Rain
2021-09-18 01:20:41.882042885,25.83,782.94,56.66,923130,131.5,1,2021,9,18,1,20,41,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.296152,14.106949,1017.994441,89.007412,2.108629,349.720198,Rain
2021-09-18 01:20:44.877856970,25.83,782.94,56.63,925034,131.3,1,2021,9,18,1,20,44,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.294101,14.104632,1017.996294,89.004941,2.109086,349.813465,Rain
2021-09-18 01:20:47.872255564,25.83,782.94,56.62,923130,131.9,1,2021,9,18,1,20,47,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.292051,14.102316,1017.998147,89.002471,2.109543,349.906733,Rain


CPU times: user 20.7 s, sys: 11.3 s, total: 32.1 s
Wall time: 32 s


## Resampling

To reduce training time, we propose resampling the data to a coarser time resolution.

In the following subsections we create those resampled-data dataframes.

### 1 Minute Resampling

In [14]:
%%time

# Average every numeric column over 1-minute bins and persist the result.
# `numeric_only=True` makes the drop of the string column `weather_main`
# explicit; without it, pandas 2.x raises TypeError on the non-numeric column.
# NOTE(review): the calendar columns (year ... second) are averaged as well,
# which yields fractional values such as second=30.5; consider recomputing
# them from the resampled index if downstream code relies on them.
df_1min = df2.resample('1min').mean(numeric_only=True)
df_1min.to_pickle('data/data_1min.pickle.gz')
df_1min

CPU times: user 6.83 s, sys: 605 ms, total: 7.44 s
Wall time: 7.43 s


Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2021-02-12 06:04:00,21.530000,777.410000,43.974000,151849.400000,37.200000,1.0,2021.0,2.0,12.0,6.0,4.0,15.000000,0.205071,2.200547,0.031007,0.206996,0.002000,45.021898,22.005474,0.004002,14.928066,13.597920,1020.998175,43.001825,2.565310,109.799270
2021-02-12 06:05:00,21.526250,777.408750,43.840000,152790.000000,32.162500,1.0,2021.0,2.0,12.0,6.0,5.0,45.875000,0.205302,2.202327,0.031031,0.206984,0.002000,45.093066,22.023266,0.004008,14.921779,13.591159,1020.992245,43.007755,2.550068,109.146898
2021-02-12 06:06:00,21.693000,777.409000,43.426000,152220.550000,34.325000,1.0,2021.0,2.0,12.0,6.0,6.0,30.500000,0.205801,2.206159,0.031082,0.206959,0.002000,45.246350,22.061588,0.004021,14.908239,13.576597,1020.979471,43.020529,2.517240,107.741788
2021-02-12 06:07:00,21.759000,777.410500,43.245500,151978.450000,36.190000,1.0,2021.0,2.0,12.0,6.0,7.0,30.500000,0.206512,2.211633,0.031155,0.206922,0.002000,45.465328,22.116332,0.004039,14.888896,13.555794,1020.961223,43.038777,2.470342,105.734489
2021-02-12 06:08:00,21.750500,777.390500,43.056000,150300.400000,46.600000,1.0,2021.0,2.0,12.0,6.0,8.0,30.500000,0.207224,2.217108,0.031228,0.206886,0.002000,45.684307,22.171077,0.004057,14.869553,13.534991,1020.942974,43.057026,2.423444,103.727190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:16:00,25.871000,782.805000,56.607000,921467.050000,133.940000,1.0,2021.0,9.0,18.0,1.0,16.0,28.500000,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.469432,14.302671,1017.837863,89.216183,2.070006,341.839098
2021-09-18 01:17:00,25.861000,782.832000,56.587500,921211.600000,134.815000,1.0,2021.0,9.0,18.0,1.0,17.0,28.500000,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.428419,14.256347,1017.874923,89.166770,2.079148,343.704447
2021-09-18 01:18:00,25.850000,782.866000,56.597500,922348.700000,133.850000,1.0,2021.0,9.0,18.0,1.0,18.0,28.500000,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.387406,14.210022,1017.911983,89.117356,2.088289,345.569796
2021-09-18 01:19:00,25.836190,782.900000,56.683810,921997.095238,134.190476,1.0,2021.0,9.0,18.0,1.0,19.0,29.619048,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.345368,14.162539,1017.949969,89.066708,2.097659,347.481779


### 2 Minute Resampling

In [15]:
%%time

df_2min = (
  df2.
  resample('2min').
  mean()
)
df_2min.to_pickle('data/data_2min.pickle.gz')
df_2min

CPU times: user 3.18 s, sys: 141 ms, total: 3.32 s
Wall time: 3.32 s


Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2021-02-12 06:04:00,21.527692,777.409231,43.891538,152428.230769,34.100000,1.0,2021.0,2.0,12.0,6.0,4.615385,34.000000,0.205214,2.201642,0.031022,0.206989,0.002000,45.065693,22.016423,0.004005,14.924197,13.593759,1020.994526,43.005474,2.555931,109.397810
2021-02-12 06:06:00,21.726000,777.409750,43.335750,152099.500000,35.257500,1.0,2021.0,2.0,12.0,6.0,6.500000,30.500000,0.206156,2.208896,0.031119,0.206941,0.002000,45.355839,22.088960,0.004030,14.898568,13.566195,1020.970347,43.029653,2.493791,106.738139
2021-02-12 06:08:00,21.686250,777.365250,43.291500,147429.200000,72.652500,1.0,2021.0,2.0,12.0,6.0,8.500000,30.500000,0.207580,2.219845,0.031265,0.206868,0.002000,45.793796,22.198449,0.004066,14.859881,13.524589,1020.933850,43.066150,2.399995,102.723540
2021-02-12 06:10:00,21.499500,777.302000,43.106250,149288.475000,69.505000,1.0,2021.0,2.0,12.0,6.0,10.500000,29.900000,0.209003,2.230794,0.031411,0.206795,0.002000,46.231752,22.307938,0.004103,14.821195,13.482984,1020.897354,43.102646,2.306200,98.708942
2021-02-12 06:12:00,21.628250,777.279500,42.830750,149325.975000,71.237500,1.0,2021.0,2.0,12.0,6.0,12.500000,29.500000,0.210427,2.241743,0.031557,0.206722,0.002000,46.669708,22.417427,0.004139,14.782509,13.441378,1020.860858,43.139142,2.212404,94.694343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:12:00,25.913500,782.776500,56.513250,920534.625000,134.397500,1.0,2021.0,9.0,18.0,1.0,12.500000,28.500000,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.612977,14.464809,1017.708153,89.389129,2.038011,335.310377
2021-09-18 01:14:00,25.892750,782.793500,56.555500,921024.825000,134.380000,1.0,2021.0,9.0,18.0,1.0,14.500000,28.500000,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.530951,14.372159,1017.782273,89.290303,2.056294,339.041075
2021-09-18 01:16:00,25.866000,782.818500,56.597250,921339.325000,134.377500,1.0,2021.0,9.0,18.0,1.0,16.500000,28.500000,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.448925,14.279509,1017.856393,89.191476,2.074577,342.771773
2021-09-18 01:18:00,25.842927,782.883415,56.641707,922168.609756,134.024390,1.0,2021.0,9.0,18.0,1.0,18.512195,29.073171,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.365874,14.185701,1017.931439,89.091414,2.093088,346.549104


### 5 Minute Resampling

In [16]:
%%time

df_5min = (
  df2.
  resample('5min').
  mean()
)
df_5min.to_pickle('data/data_5min.pickle.gz')
df_5min

CPU times: user 1.31 s, sys: 313 ms, total: 1.62 s
Wall time: 1.62 s


Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2021-02-12 06:00:00,21.530000,777.410000,43.974000,151849.400000,37.200000,1.0,2021.0,2.0,12.0,6.0,4.000000,15.000000,0.205071,2.200547,0.031007,0.206996,0.002000,45.021898,22.005474,0.004002,14.928066,13.597920,1020.998175,43.001825,2.565310,109.799270
2021-02-12 06:05:00,21.689773,777.389432,43.361477,150039.409091,51.973864,1.0,2021.0,2.0,12.0,6.0,7.272727,31.897727,0.206726,2.213276,0.031177,0.206911,0.002000,45.531022,22.132755,0.004044,14.883093,13.549553,1020.955748,43.044252,2.456273,105.132299
2021-02-12 06:10:00,21.538300,777.285200,42.909800,149975.940000,67.172000,1.0,2021.0,2.0,12.0,6.0,12.000000,29.660000,0.210071,2.239005,0.031520,0.206740,0.002000,46.560219,22.390055,0.004130,14.792181,13.451779,1020.869982,43.130018,2.235853,95.697993
2021-02-12 06:15:00,21.563900,777.269000,42.704100,150897.020000,65.798000,1.0,2021.0,2.0,12.0,6.0,17.000000,28.890000,0.213629,2.266378,0.031885,0.206557,0.002000,47.655109,22.663777,0.004221,14.695465,13.347765,1020.778741,43.221259,2.001364,85.661496
2021-02-12 06:20:00,21.616931,777.223960,42.695545,149963.910891,71.275248,1.0,2021.0,2.0,12.0,6.0,22.009901,29.029703,0.217205,2.293887,0.032252,0.206374,0.002000,48.755474,22.938869,0.004313,14.598266,13.243230,1020.687044,43.312956,1.765703,75.574818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:00:00,25.987500,782.832000,56.333000,918660.290000,135.811000,1.0,2021.0,9.0,18.0,1.0,2.000000,29.710000,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,15.043613,14.951220,1017.319024,89.907968,1.942026,315.724212
2021-09-18 01:05:00,25.966100,782.800200,56.379800,920302.740000,134.243000,1.0,2021.0,9.0,18.0,1.0,7.000000,29.500000,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.838548,14.719595,1017.504324,89.660902,1.987733,325.050957
2021-09-18 01:10:00,25.915100,782.782200,56.511300,920836.000000,134.184000,1.0,2021.0,9.0,18.0,1.0,12.000000,28.600000,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.633484,14.487971,1017.689623,89.413836,2.033440,334.377702
2021-09-18 01:15:00,25.860099,782.840594,56.613069,921519.306931,134.372277,1.0,2021.0,9.0,18.0,1.0,17.019802,28.732673,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,14.427393,14.255188,1017.875849,89.165534,2.079376,343.751081


## References

* <https://scikit-learn.org/stable/modules/linear_model.html#generalized-linear-regression>

* <https://pythonhealthcare.org/2018/05/03/81-distribution-fitting-to-data/>

* <https://medium.com/@amirarsalan.rajabi/distribution-fitting-with-python-scipy-bb70a42c0aed>

* <https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html?highlight=kernel%20density#sklearn.neighbors.KernelDensity>

* <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>

* <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.merge_asof.html#pandas.merge_asof>

* <https://openweathermap.org/history-bulk>

* <https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#interpolation>

* <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html?highlight=interpolate>