# Data Aggregation Across Data Sources

We have 3 different sources of data:

1. Our sensor data: that has the Indoor Air Quality and Indoor Environmental Data.

2. SINAICA: Outdoor Air Quality Monitoring Data from the Government.

3. OpenWeatherData: Outdoor Environmental Data.

We need to make this data available to the models we plan to train. The following sections detail this process.

In [1]:
import os, gzip, json, re, stan, dplython, asyncio, nest_asyncio
#nest_asyncio.apply()
import warnings
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore", category=DeprecationWarning)
from dplython import (DplyFrame, X, diamonds, select, sift,
  sample_n, sample_frac, head, arrange, mutate, group_by,
  summarize, DelayFunction, dfilter)
import seaborn as sns
from plotnine import *
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import (mean_squared_error, 
                             r2_score,
                             mean_absolute_error)
import pandas as pd
import numpy as np
from IPython.display import display, Markdown, update_display
# Flip to True to widen the pandas display limits and report the change.
DEBUG = False


def _report_pandas_display_limits(heading):
  """Render `heading` followed by the current pandas max_columns/max_rows."""
  display(Markdown(heading))
  display(Markdown(f"* pandas max_columns={pd.options.display.max_columns}\n" +
                   f"* pandas max_rows={pd.options.display.max_rows}"))


if DEBUG:
  _report_pandas_display_limits("Default Values:")
  pd.options.display.max_columns = 35
  pd.options.display.max_rows = 100
  _report_pandas_display_limits("New Values:")



## Indoor Data

In [2]:
# Load the indoor sensor readings (one row per reading, timestamp in "datetime").
airdata = pd.read_pickle('data/airdata/air.pickle')

# Derive the calendar fields with vectorised `.dt` accessors instead of one
# Python-level pass over every row per field. `pd.to_datetime` is a no-op for
# a column that is already datetime64 and guards against an object-dtype
# column of Timestamps coming out of the pickle.
_airdata_stamps = pd.to_datetime(airdata["datetime"])
airdata = (
  airdata
  .assign(
    year=_airdata_stamps.dt.year,
    month=_airdata_stamps.dt.month,
    day=_airdata_stamps.dt.day,
    hour=_airdata_stamps.dt.hour,
    minute=_airdata_stamps.dt.minute,
    second=_airdata_stamps.dt.second,
  )
  # Index by timestamp, in chronological order, without mutating in place so
  # the cell can be re-run safely.
  .set_index("datetime")
  .sort_index()
)
airdata

Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-12 06:04:09.089621067,21.54,777.41,43.93,151328,37.5,1,2021,2,12,6,4,9
2021-02-12 06:04:12.087778807,21.56,777.41,43.89,152702,35.6,1,2021,2,12,6,4,12
2021-02-12 06:04:15.072475433,21.53,777.41,43.97,151328,37.5,1,2021,2,12,6,4,15
2021-02-12 06:04:18.070170164,21.51,777.41,44.03,151464,38.5,1,2021,2,12,6,4,18
2021-02-12 06:04:21.061994791,21.51,777.41,44.05,152425,36.9,1,2021,2,12,6,4,21
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:20:38.889113188,25.84,782.96,56.64,928867,130.8,1,2021,9,18,1,20,38
2021-09-18 01:20:41.882042885,25.83,782.94,56.66,923130,131.5,1,2021,9,18,1,20,41
2021-09-18 01:20:44.877856970,25.83,782.94,56.63,925034,131.3,1,2021,9,18,1,20,44
2021-09-18 01:20:47.872255564,25.83,782.94,56.62,923130,131.9,1,2021,9,18,1,20,47


## Outdoor Air Quality Data

In [3]:
# Station-specific columns that become the canonical pollutant columns:
# every pollutant is taken from the "Merced" columns except NO, which is
# taken from "Camarones_NO".
station_to_pollutant = {
  "Merced_CO": "CO",
  "Camarones_NO": "NO",
  "Merced_NO2": "NO2",
  "Merced_NOx": "NOx",
  "Merced_O3": "O3",
  "Merced_PM10": "PM10",
  "Merced_PM2.5": "PM2.5",
  "Merced_SO2": "SO2"
}
# Any column still carrying one of these prefixes after the rename is dropped.
station_prefix = re.compile('^(Camaron|Gustavo|Miguel|Tlalne|FES|Merced|La Pre)')

sinaica = (
  pd.read_pickle('data/sinaica2/dsinaica.pickle')
  .rename(columns=station_to_pollutant)
)
leftover_station_columns = [name for name in sinaica.columns
                            if station_prefix.match(name)]
sinaica = sinaica.drop(columns=leftover_station_columns)

# Calendar fields extracted from the "Fecha" timestamp column.
for part in ("year", "month", "day", "hour", "minute"):
  sinaica[part] = [getattr(stamp, part) for stamp in sinaica["Fecha"]]

# Index by timestamp in chronological order; the trailing copy yields a
# fresh, consolidated frame.
sinaica = sinaica.set_index("Fecha").sort_index().copy()
sinaica

Unnamed: 0_level_0,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,year,month,day,hour,minute
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-01-01 00:00:00,0.006000,1.000000,0.032000,0.036000,0.006000,31.000000,19.000000,0.003000,2021,1,1,0,0
2021-01-01 01:00:00,0.021000,,,,,,,,2021,1,1,1,0
2021-01-01 02:00:00,0.013000,1.100000,0.032000,0.039000,0.004000,37.000000,24.000000,0.003000,2021,1,1,2,0
2021-01-01 03:00:00,0.031000,1.200000,0.033000,0.043000,0.001000,49.000000,39.000000,0.003000,2021,1,1,3,0
2021-01-01 04:00:00,0.005000,1.200000,0.031000,0.039000,0.002000,80.000000,65.000000,0.003000,2021,1,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-10-04 00:00:00,0.008292,0.545833,0.019083,0.026875,0.015833,11.826087,7.913043,0.000750,2021,10,4,0,0
2021-10-05 00:00:00,0.010000,0.563158,0.019722,0.030500,0.012278,11.090909,6.772727,0.000556,2021,10,5,0,0
2021-10-06 00:00:00,0.007571,0.672222,0.026111,0.035611,0.011000,18.722222,11.833333,0.000111,2021,10,6,0,0
2021-10-07 00:00:00,0.011565,0.713636,0.028636,0.040318,0.017909,26.772727,17.000000,0.001045,2021,10,7,0,0


## Outdoor Weather Data

In [4]:
# Weather columns that are discarded before merging.
unused_weather_columns = ['clouds_all', "weather_id", 'rain_1h', 'rain_3h',
                          'temp_max', 'temp_min']

# Load the weather records, rename `temp` so it lines up with the indoor
# `temperature` column, and index chronologically by the `dt` timestamp.
weather = (
  pd.read_pickle("data/openweathermap/weather.pickle.gz")
  .rename(columns={'temp': 'temperature'})
  .set_index("dt")
  .sort_index()
  .drop(columns=unused_weather_columns)
)
weather

Unnamed: 0_level_0,temperature,feels_like,pressure,humidity,wind_speed,wind_deg,weather_main
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-02-12 07:00:00,13.87,12.46,1020,44,0.00,0,Clear
2021-02-12 08:00:00,12.81,11.37,1020,47,0.00,0,Clear
2021-02-12 09:00:00,10.83,9.35,1019,53,1.54,60,Clear
2021-02-12 10:00:00,6.40,3.51,1019,61,4.12,40,Clear
2021-02-12 11:00:00,6.23,6.23,1019,57,0.00,0,Clear
...,...,...,...,...,...,...,...
2021-09-27 19:00:00,21.51,20.89,1006,45,0.89,139,Clear
2021-09-27 20:00:00,23.18,22.81,1005,48,0.45,224,Rain
2021-09-27 21:00:00,22.21,21.69,1025,46,6.17,220,Rain
2021-09-27 22:00:00,21.03,20.68,1004,57,0.45,242,Rain


## Merging the 3 Datasets: Indoor Data, Outdoor Air Quality Data, Outdoor Weather Data.

### Merging Air Quality and Weather Data

In [28]:
# The calendar helper columns are not needed on the outdoor side of the merge.
calendar_columns = ['year', 'month', 'day', 'hour', 'minute']

# Left-join the weather records onto the air-quality records by timestamp.
outdoor = sinaica.drop(columns=calendar_columns).join(weather,
                                                      rsuffix='_weather')

# Keep only the outdoor records inside the time span covered by the indoor data.
within_indoor_span = ((outdoor.index >= airdata.index.min()) &
                      (outdoor.index <= airdata.index.max()))
outdoor = outdoor.loc[within_indoor_span].sort_index()
outdoor

Unnamed: 0,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature,feels_like,pressure,humidity,wind_speed,wind_deg,weather_main
2021-02-12 07:00:00,0.244000,2.500000,0.035000,0.205000,0.002000,57.000000,25.000000,0.005000,13.87,12.46,1020.0,44.0,0.00,0.0,Clear
2021-02-12 08:00:00,0.146000,1.600000,0.030000,0.089000,0.004000,67.000000,33.000000,0.003000,12.81,11.37,1020.0,47.0,0.00,0.0,Clear
2021-02-12 09:00:00,0.099000,1.500000,0.039000,0.072000,0.012000,50.000000,28.000000,0.002000,10.83,9.35,1019.0,53.0,1.54,60.0,Clear
2021-02-12 10:00:00,0.024000,1.200000,0.030000,0.047000,0.025000,40.000000,21.000000,0.002000,6.40,3.51,1019.0,61.0,4.12,40.0,Clear
2021-02-12 11:00:00,0.009000,0.900000,0.016000,0.026000,0.033000,33.000000,19.000000,0.001000,6.23,6.23,1019.0,57.0,0.00,0.0,Clear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-14 00:00:00,0.017000,0.716667,0.015125,0.024625,0.013333,7.047619,4.500000,0.000125,15.85,14.95,1025.0,56.0,4.12,170.0,Rain
2021-09-15 00:00:00,0.027458,0.954167,0.028167,0.048625,0.019375,24.416667,17.333333,0.000875,17.95,17.08,1023.0,49.0,7.72,130.0,Clouds
2021-09-16 00:00:00,0.006875,0.883333,0.028000,0.034458,0.022792,46.833333,40.041667,0.001542,18.45,17.63,1022.0,49.0,6.17,130.0,Smoke
2021-09-17 00:00:00,0.010250,0.947619,0.032700,0.042750,0.026650,29.666667,24.875000,0.003350,18.34,17.69,1024.0,56.0,4.63,300.0,Rain


In [33]:
# Inspect the last outdoor records (from 2021-09-15 23:59 onwards).
outdoor.loc[outdoor.index >= "2021-09-15 23:59"]

Unnamed: 0,NO,CO,NO2,NOx,O3,PM10,PM2.5,SO2,temperature,feels_like,pressure,humidity,wind_speed,wind_deg,weather_main
2021-09-16,0.006875,0.883333,0.028,0.034458,0.022792,46.833333,40.041667,0.001542,18.45,17.63,1022.0,49.0,6.17,130.0,Smoke
2021-09-17,0.01025,0.947619,0.0327,0.04275,0.02665,29.666667,24.875,0.00335,18.34,17.69,1024.0,56.0,4.63,300.0,Rain
2021-09-18,0.009174,0.765217,0.027826,0.039043,0.015304,20.304348,16.391304,0.001304,17.61,17.85,1015.0,93.0,1.37,199.0,Rain


### Merging Indoor and Outdoor (Air Quality and Weather) Data

In [79]:
# Attach to each indoor reading the first outdoor record found at or after it
# ("forward"), but only when that record is at most 3 seconds away; colliding
# outdoor column names get the "_outdoor" suffix.
# NOTE(review): with this tolerance only the indoor readings that fall just
# before an outdoor timestamp receive outdoor values; all other rows stay NaN
# in the outdoor columns - confirm this is intended.
asof_options = dict(
  left_index=True,
  right_index=True,
  suffixes=('', '_outdoor'),
  tolerance=pd.Timedelta('3 seconds'),
  direction="forward",
)
data = pd.merge_asof(airdata, outdoor, **asof_options)

# Peek at the rows around one outdoor timestamp.
data.loc[data.index >= "2021-09-16 23:59:40"].head(10)

Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,...,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg,weather_main
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-09-16 23:59:42.445474625,25.86,780.44,57.0,877365,246.4,1,2021,9,16,23,...,,,,,,,,,,
2021-09-16 23:59:45.439809322,25.85,780.4,57.04,874512,246.7,1,2021,9,16,23,...,,,,,,,,,,
2021-09-16 23:59:48.434415102,25.85,780.42,57.07,877365,245.9,1,2021,9,16,23,...,,,,,,,,,,
2021-09-16 23:59:51.428925753,25.84,780.42,57.07,872810,246.9,1,2021,9,16,23,...,,,,,,,,,,
2021-09-16 23:59:54.423571348,25.84,780.42,57.07,872244,247.7,1,2021,9,16,23,...,,,,,,,,,,
2021-09-16 23:59:57.418201685,25.84,780.44,57.06,868302,249.6,1,2021,9,16,23,...,29.666667,24.875,0.00335,18.34,17.69,1024.0,56.0,4.63,300.0,Rain
2021-09-17 00:00:00.412962675,25.84,780.42,57.02,869987,250.0,1,2021,9,17,0,...,,,,,,,,,,
2021-09-17 00:00:03.407538652,25.84,780.44,57.02,877365,248.1,1,2021,9,17,0,...,,,,,,,,,,
2021-09-17 00:00:06.402314186,25.83,780.42,57.05,868302,249.8,1,2021,9,17,0,...,,,,,,,,,,
2021-09-17 00:00:09.396840096,25.84,780.42,57.0,875651,248.6,1,2021,9,17,0,...,,,,,,,,,,


In [80]:
# Summarise the size of the merged dataset as a small markdown list.
row_count, column_count = data.shape
Markdown("Dataset with Indoor and Outdoor Data:\n* %d Rows\n* %d Columns." % (row_count, column_count))

Dataset with Indoor and Outdoor Data:
* 6285103 Rows
* 27 Columns.

In [81]:
# Show only the rows that have a value in every column.
data.dropna()

Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,...,PM10,PM2.5,SO2,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg,weather_main
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-02-12 06:59:59.987502337,21.51,777.30,43.78,143943,96.2,1,2021,2,12,6,...,57.000000,25.000000,0.005000,13.87,12.46,1020.0,44.0,0.00,0.0,Clear
2021-02-12 07:59:58.990879536,21.01,776.94,42.43,152841,80.4,1,2021,2,12,7,...,67.000000,33.000000,0.003000,12.81,11.37,1020.0,47.0,0.00,0.0,Clear
2021-02-12 08:59:57.738294601,20.41,776.35,42.60,153259,99.9,1,2021,2,12,8,...,50.000000,28.000000,0.002000,10.83,9.35,1019.0,53.0,1.54,60.0,Clear
2021-02-12 09:59:59.458741903,20.28,776.20,42.18,145689,177.1,1,2021,2,12,9,...,40.000000,21.000000,0.002000,6.40,3.51,1019.0,61.0,4.12,40.0,Clear
2021-02-12 10:59:58.053189993,19.92,776.23,42.24,141519,214.2,1,2021,2,12,10,...,33.000000,19.000000,0.001000,6.23,6.23,1019.0,57.0,0.00,0.0,Clear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-13 23:59:59.861400843,24.32,780.40,55.44,872810,230.0,3,2021,9,13,23,...,7.047619,4.500000,0.000125,15.85,14.95,1025.0,56.0,4.12,170.0,Rain
2021-09-14 23:59:58.549029827,25.52,779.40,54.90,937936,125.9,1,2021,9,14,23,...,24.416667,17.333333,0.000875,17.95,17.08,1023.0,49.0,7.72,130.0,Clouds
2021-09-15 23:59:58.141078472,27.09,778.30,48.42,1221617,148.6,3,2021,9,15,23,...,46.833333,40.041667,0.001542,18.45,17.63,1022.0,49.0,6.17,130.0,Smoke
2021-09-16 23:59:57.418201685,25.84,780.44,57.06,868302,249.6,1,2021,9,16,23,...,29.666667,24.875000,0.003350,18.34,17.69,1024.0,56.0,4.63,300.0,Rain


## Imputations 

We found that the resulting dataframe after merging 2 datasets (Outdoor Data that is sampled every 1 hour and Indoor Data that is sampled every 3 seconds) contains repeated records on the columns of hourly data: SINAICA Gov't Air Quality Monitoring and OpenWeatherData. 

We think the repeated data can be an issue because the values jump abruptly between consecutive records (for example, between 10:57 and 11:00), so the data does not represent the real world correctly: temperature, pressure, and other natural features move gradually from one value to the next. However, we do not have data at that resolution, and it is not easily obtainable.

Therefore, we propose an imputation approach based on interpolation with added noise, which could help avert overfitting when training our machine learning and deep learning models.

In [82]:
%%time

# Outdoor weather columns of the merged dataset, copied into their own frame.
outdoor_weather_columns = ["temperature_outdoor", "feels_like",
                           "pressure_outdoor", "humidity_outdoor",
                           "wind_speed", "wind_deg", "weather_main"]
df2 = data.loc[:, outdoor_weather_columns].copy()

# Display the rows where every weather column is populated.
df2.dropna()

CPU times: user 476 ms, sys: 166 ms, total: 642 ms
Wall time: 639 ms


Unnamed: 0_level_0,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg,weather_main
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-02-12 06:59:59.987502337,13.87,12.46,1020.0,44.0,0.00,0.0,Clear
2021-02-12 07:59:58.990879536,12.81,11.37,1020.0,47.0,0.00,0.0,Clear
2021-02-12 08:59:57.738294601,10.83,9.35,1019.0,53.0,1.54,60.0,Clear
2021-02-12 09:59:59.458741903,6.40,3.51,1019.0,61.0,4.12,40.0,Clear
2021-02-12 10:59:58.053189993,6.23,6.23,1019.0,57.0,0.00,0.0,Clear
...,...,...,...,...,...,...,...
2021-09-13 23:59:59.861400843,15.85,14.95,1025.0,56.0,4.12,170.0,Rain
2021-09-14 23:59:58.549029827,17.95,17.08,1023.0,49.0,7.72,130.0,Clouds
2021-09-15 23:59:58.141078472,18.45,17.63,1022.0,49.0,6.17,130.0,Smoke
2021-09-16 23:59:57.418201685,18.34,17.69,1024.0,56.0,4.63,300.0,Rain


In [None]:
%%time

def interpolate_missing(df, idx, hours=1):
    """
    Function to interpolate missing.
    
    examples 
        airdata2 = interpolate_missing(airdata, idx = airdata[airdata.delta > 100].index[0])
        airdata2 = interpolate_missing(airdata, 194044)
    """
    np.random.seed(175904)
    df_prev = df.loc[idx - 1]

    a  = df_prev
    b  = df.loc[idx]
        
    offsetEnd = pd.offsets.Hour(1) # remove closed form

    out = {}
    out["datetime"] = pd.date_range(a["datetime"] + offsetEnd, 
                             b["datetime"] - offsetEnd, 
                             freq='1h')
    out["datetime"] = out["datetime"].set_names("datetime")
    
    for v in ['CO', 'NO', 'NO2', 'NOx', 'O3', 'PM10', 'PM2.5', 'SO2']:
        i = [i+1 for i, d in enumerate(out["datetime"])]
        m = (b[v] - a[v])/len(out["datetime"])
        sd = 0.7*np.std(df_prev[v])
        rnds = np.random.normal(-sd, sd, len(out["datetime"]))
        #rnds = np.random.uniform(-2*np.pi, 2*np.pi, len(out["datetime"]))
        #rnds = np.cos(rnds) * sd
        out[v] = [m*j + a[v] + rnds[j-1] for j in i]    
        #out[v] = [m*j  b[v] + rnds[j-1] for j in i]
        
    #out["iaqAccuracy"] = 1
    
    idf = pd.DataFrame(out)
    reorder_columns = [col for col in out.keys() if col != 'datetime']
    reorder_columns.append("datetime")
    idf = idf.reindex(columns=reorder_columns)
    #print(reorder_columns)
    
    idf["year"] = [dt.year for dt in idf["datetime"]]
    idf["month"] = [dt.month for dt in idf["datetime"]]
    idf["day"] = [dt.day for dt in idf["datetime"]]
    idf["hour"] = [dt.hour for dt in idf["datetime"]]
    #idf["minute"] = [dt.minute for dt in idf["datetime"]]
    idf["imputated"] = True
        
    # original dataframe
    idf = pd.concat([df.reset_index(drop=True), idf.reset_index(drop=True)])
    idf.sort_values("datetime", inplace=True)
    idf.reset_index(inplace=True, drop=True)
    
    idf["datetime-1"] = idf["datetime"].shift(1)
    idf["delta"] = idf["datetime"] - idf["datetime-1"]
    idf["delta"] = idf["delta"].dt.seconds // 60**2
    
    return  idf

# NOTE(review): `sinaica_imputated` is not created by any cell visible in this
# notebook; it is expected to carry a `delta` column - confirm where it is built.

# Index labels of the rows whose `delta` is greater than 1, visited from last
# to first - presumably so that rows inserted for a later gap do not shift the
# labels of the earlier gaps still waiting to be filled.
imputation_list = list(reversed(sinaica_imputated.delta[sinaica_imputated.delta > 1].index))
sinaica2 = sinaica_imputated.copy()

for x in imputation_list:
  try:
    sinaica2 = interpolate_missing(sinaica2, x)
  except Exception as err:
    # Best effort: keep going, but report why the gap could not be filled.
    # (`except Exception` lets KeyboardInterrupt through, unlike a bare except.)
    print(f"Skipping {x}: {err!r}")

# Drop the row labelled 0 and renumber the remaining rows from 0.
sinaica2 = sinaica2.loc[1:].reset_index(drop=True)

## Resampling

To reduce training time, we propose resampling the data to a coarser time resolution.

In the following subsections we create those resampled-data dataframes.

### 1 Minute Resampling

In [11]:
%%time

# Average every column over consecutive 1-minute bins and store the result.
# NOTE(review): `df` is not defined in any cell visible above - confirm which
# frame it refers to.
df_1min = df.resample('1min').mean()
df_1min.to_pickle('data/data_1min.pickle.gz')
df_1min

CPU times: user 4.91 s, sys: 166 ms, total: 5.07 s
Wall time: 5.07 s


Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-02-12 06:04:00,21.530000,777.410000,43.974000,151849.400000,37.200000,1.0,2021.0,2.0,12.0,6.0,4.0,15.000000,,,,,,
2021-02-12 06:05:00,21.526250,777.408750,43.840000,152790.000000,32.162500,1.0,2021.0,2.0,12.0,6.0,5.0,45.875000,,,,,,
2021-02-12 06:06:00,21.693000,777.409000,43.426000,152220.550000,34.325000,1.0,2021.0,2.0,12.0,6.0,6.0,30.500000,,,,,,
2021-02-12 06:07:00,21.759000,777.410500,43.245500,151978.450000,36.190000,1.0,2021.0,2.0,12.0,6.0,7.0,30.500000,,,,,,
2021-02-12 06:08:00,21.750500,777.390500,43.056000,150300.400000,46.600000,1.0,2021.0,2.0,12.0,6.0,8.0,30.500000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:16:00,25.871000,782.805000,56.607000,921467.050000,133.940000,1.0,2021.0,9.0,18.0,1.0,16.0,28.500000,,,,,,
2021-09-18 01:17:00,25.861000,782.832000,56.587500,921211.600000,134.815000,1.0,2021.0,9.0,18.0,1.0,17.0,28.500000,,,,,,
2021-09-18 01:18:00,25.850000,782.866000,56.597500,922348.700000,133.850000,1.0,2021.0,9.0,18.0,1.0,18.0,28.500000,,,,,,
2021-09-18 01:19:00,25.836190,782.900000,56.683810,921997.095238,134.190476,1.0,2021.0,9.0,18.0,1.0,19.0,29.619048,,,,,,


### 2 Minute Resampling

In [12]:
%%time

# Average every column over consecutive 2-minute bins and store the result.
# NOTE(review): `df` is not defined in any cell visible above - confirm which
# frame it refers to.
df_2min = df.resample('2min').mean()
df_2min.to_pickle('data/data_2min.pickle.gz')
df_2min

CPU times: user 2.32 s, sys: 112 ms, total: 2.43 s
Wall time: 2.43 s


Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-02-12 06:04:00,21.527692,777.409231,43.891538,152428.230769,34.100000,1.0,2021.0,2.0,12.0,6.0,4.615385,34.000000,,,,,,
2021-02-12 06:06:00,21.726000,777.409750,43.335750,152099.500000,35.257500,1.0,2021.0,2.0,12.0,6.0,6.500000,30.500000,,,,,,
2021-02-12 06:08:00,21.686250,777.365250,43.291500,147429.200000,72.652500,1.0,2021.0,2.0,12.0,6.0,8.500000,30.500000,,,,,,
2021-02-12 06:10:00,21.499500,777.302000,43.106250,149288.475000,69.505000,1.0,2021.0,2.0,12.0,6.0,10.500000,29.900000,,,,,,
2021-02-12 06:12:00,21.628250,777.279500,42.830750,149325.975000,71.237500,1.0,2021.0,2.0,12.0,6.0,12.500000,29.500000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:12:00,25.913500,782.776500,56.513250,920534.625000,134.397500,1.0,2021.0,9.0,18.0,1.0,12.500000,28.500000,,,,,,
2021-09-18 01:14:00,25.892750,782.793500,56.555500,921024.825000,134.380000,1.0,2021.0,9.0,18.0,1.0,14.500000,28.500000,,,,,,
2021-09-18 01:16:00,25.866000,782.818500,56.597250,921339.325000,134.377500,1.0,2021.0,9.0,18.0,1.0,16.500000,28.500000,,,,,,
2021-09-18 01:18:00,25.842927,782.883415,56.641707,922168.609756,134.024390,1.0,2021.0,9.0,18.0,1.0,18.512195,29.073171,,,,,,


### 5 Minute Resampling

In [13]:
%%time

# Average every column over consecutive 5-minute bins and store the result.
# NOTE(review): `df` is not defined in any cell visible above - confirm which
# frame it refers to.
df_5min = df.resample('5min').mean()
df_5min.to_pickle('data/data_5min.pickle.gz')
df_5min

CPU times: user 918 ms, sys: 119 ms, total: 1.04 s
Wall time: 1.04 s


Unnamed: 0_level_0,temperature,pressure,humidity,gasResistance,IAQ,iaqAccuracy,year,month,day,hour,minute,second,temperature_outdoor,feels_like,pressure_outdoor,humidity_outdoor,wind_speed,wind_deg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-02-12 06:00:00,21.530000,777.410000,43.974000,151849.400000,37.200000,1.0,2021.0,2.0,12.0,6.0,4.000000,15.000000,,,,,,
2021-02-12 06:05:00,21.689773,777.389432,43.361477,150039.409091,51.973864,1.0,2021.0,2.0,12.0,6.0,7.272727,31.897727,,,,,,
2021-02-12 06:10:00,21.538300,777.285200,42.909800,149975.940000,67.172000,1.0,2021.0,2.0,12.0,6.0,12.000000,29.660000,,,,,,
2021-02-12 06:15:00,21.563900,777.269000,42.704100,150897.020000,65.798000,1.0,2021.0,2.0,12.0,6.0,17.000000,28.890000,,,,,,
2021-02-12 06:20:00,21.616931,777.223960,42.695545,149963.910891,71.275248,1.0,2021.0,2.0,12.0,6.0,22.009901,29.029703,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-18 01:00:00,25.987500,782.832000,56.333000,918660.290000,135.811000,1.0,2021.0,9.0,18.0,1.0,2.000000,29.710000,,,,,,
2021-09-18 01:05:00,25.966100,782.800200,56.379800,920302.740000,134.243000,1.0,2021.0,9.0,18.0,1.0,7.000000,29.500000,,,,,,
2021-09-18 01:10:00,25.915100,782.782200,56.511300,920836.000000,134.184000,1.0,2021.0,9.0,18.0,1.0,12.000000,28.600000,,,,,,
2021-09-18 01:15:00,25.860099,782.840594,56.613069,921519.306931,134.372277,1.0,2021.0,9.0,18.0,1.0,17.019802,28.732673,,,,,,


## References

* <https://scikit-learn.org/stable/modules/linear_model.html#generalized-linear-regression>

* <https://pythonhealthcare.org/2018/05/03/81-distribution-fitting-to-data/>

* <https://medium.com/@amirarsalan.rajabi/distribution-fitting-with-python-scipy-bb70a42c0aed>

* <https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html?highlight=kernel%20density#sklearn.neighbors.KernelDensity>