In [2]:
import os
from datetime import datetime
from typing import Optional

import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from tqdm import tqdm

## Build a single dataset based on the median of each feature
1. Aggregate each sensor's readings per minute by taking the mean.
2. Build a single dataset by taking, for each minute, the median across the sensors.

### Define constants and hyperparameters

In [4]:
# Inclusive time window applied to every sensor series and to the output timeline.
START_DATE_BOARD = '2022-11-03'
END_DATE_BOARD = '2023-06-15'

# Root directory that is scanned for one sub-folder per PM2_MAP key.
PM25_DIRECTORY = '../resources/dataset/'

# Sub-folder name -> CSV files to load from that folder.
# Folders ending in "_temp", "_pres" and "_rh" are loaded as temperature,
# pressure and humidity series; every other folder is loaded as PM2.5.
PM2_MAP = {
    # NOTE(review): "s258.csv" is listed both here and under "board20_temp",
    # and the other boards use consecutive odd/even ids in steps of 2 --
    # confirm this list is not meant to be s250/s252/s254/s256.
    "board20": ["s250.csv", "s252.csv", "s256.csv", "s258.csv"],
    "board20_temp": ["s258.csv"],
    "board20_pres": ["s260.csv"],
    "board20_rh": ["s259.csv"],
    "board21": ["s263.csv", "s265.csv", "s267.csv", "s269.csv"],
    "board22": ["s276.csv", "s278.csv", "s280.csv", "s282.csv"],
    "board25": ["s315.csv", "s317.csv", "s319.csv", "s321.csv"],
    "board29": ["s367.csv", "s369.csv", "s371.csv", "s373.csv"],
    "board31": ["s393.csv", "s395.csv", "s397.csv", "s399.csv"],
}

# Hyperparameters (not referenced by the dataset-building cells shown here;
# presumably consumed by later modelling cells -- TODO confirm).
TRAIN_SIZE = 0.7
RANDOM_STATE = 42
INPUT_SIZE = 60
HIDDEN_SIZE = 128
OUTPUT_SIZE = 1
LEARNING_RATE = 0.0001
NUM_EPOCHS = 200
BATCH_SIZE = 20

### Transform each CSV into Pandas dataframe

In [None]:
def prepare_generic_dataframe(
    file_name: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
) -> pd.DataFrame:
    """Load one sensor CSV and aggregate it into per-minute means.

    The CSV must contain a ``timestamp`` column. Exact duplicate rows are
    removed, rows outside ``[start_date, end_date]`` are discarded, and every
    remaining column is averaged inside one-minute bins.

    Args:
        file_name: Path of the CSV file to read.
        start_date: Inclusive lower bound of the time window. Defaults to the
            notebook-level ``START_DATE_BOARD``.
        end_date: Inclusive upper bound of the time window. Defaults to the
            notebook-level ``END_DATE_BOARD``.

    Returns:
        A frame with a ``timestamp`` column (start of each minute) plus the
        averaged columns. Rows containing any NaN -- which includes minutes
        without a single reading -- are dropped.
    """
    # The notebook constants are looked up only when no explicit bound is
    # passed, so the function can be reused (and tested) with any window.
    if start_date is None:
        start_date = START_DATE_BOARD
    if end_date is None:
        end_date = END_DATE_BOARD

    readings = pd.read_csv(file_name)
    readings['timestamp'] = pd.to_datetime(readings['timestamp'])
    readings = readings.drop_duplicates().sort_values(by='timestamp')

    # `between` is inclusive on both ends, matching the original >= / <= filter.
    in_window = readings['timestamp'].between(start_date, end_date)
    per_minute = (
        readings.loc[in_window]
        .groupby(pd.Grouper(key='timestamp', freq='min'))
        .mean()
        .reset_index()
    )
    # The minute grid is contiguous, so bins without readings come out as NaN.
    return per_minute.dropna()


def prepare_pm25_dataframe(
    file_name: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
) -> pd.DataFrame:
    """Load a PM2.5 sensor CSV (readings noted as mu_g/m^3) as per-minute means.

    The processing is exactly the one of ``prepare_generic_dataframe``; this
    name is kept so existing callers keep working.

    Args:
        file_name: Path of the CSV file to read.
        start_date: Inclusive lower bound; defaults to ``START_DATE_BOARD``.
        end_date: Inclusive upper bound; defaults to ``END_DATE_BOARD``.

    Returns:
        The per-minute frame produced by ``prepare_generic_dataframe``.
    """
    return prepare_generic_dataframe(file_name, start_date, end_date)

# Per-minute frames collected from the sensor folders, one list per measurement.
dataframes_pm25 = []
dataframes_temp = []
dataframes_pres = []
dataframes_rh = []

# Folder-name suffix -> list receiving that folder's frames.
# Folders without one of these suffixes hold PM2.5 sensors.
_frames_by_suffix = {
    "_temp": dataframes_temp,
    "_pres": dataframes_pres,
    "_rh": dataframes_rh,
}

# Sorted so the load order does not depend on the filesystem's listing order.
for folder_name in tqdm(sorted(os.listdir(PM25_DIRECTORY)), desc='Analyzing folders'):
    folder = os.path.join(PM25_DIRECTORY, folder_name)
    # Look the folder up by its own name instead of by a fixed position in the
    # joined path, so the lookup keeps working if PM25_DIRECTORY changes depth.
    if folder_name not in PM2_MAP or not os.path.isdir(folder):
        continue

    target = dataframes_pm25
    for suffix, frames in _frames_by_suffix.items():
        if folder_name.endswith(suffix):
            target = frames
            break

    for file_name in PM2_MAP[folder_name]:
        file = os.path.join(folder, file_name)
        # Listed files that are missing on disk are skipped (best effort).
        if os.path.isfile(file) and file.endswith(".csv"):
            target.append(prepare_generic_dataframe(file))

Analyzing folders:   7%|▋         | 2/28 [00:29<06:19, 14.59s/it]

### Get a single dataframe by applying the median at each timestamp

In [None]:
def _median_across_sensors(frames, timeline):
    """Median of the ``data`` column across sensor frames for each timestamp.

    Args:
        frames: List of frames with ``timestamp`` and ``data`` columns.
        timeline: DatetimeIndex of the timestamps to report.

    Returns:
        A float Series indexed by ``timeline``; a timestamp for which no frame
        has a reading is NaN.
    """
    if not frames:
        # No sensor of this kind was loaded: every timestamp is missing.
        return pd.Series(np.nan, index=timeline, dtype='float64')
    per_sensor = pd.concat(
        [
            # Keep the first row per timestamp, as the original lookup did.
            frame.drop_duplicates(subset='timestamp').set_index('timestamp')['data']
            for frame in frames
        ],
        axis=1,
        keys=range(len(frames)),  # unique column labels, one per sensor
    )
    # Aligning on the index replaces one full-frame scan per minute per sensor.
    # NaN entries (sensor silent at that minute) are skipped by the median.
    return per_sensor.median(axis=1).reindex(timeline)


timeline = pd.date_range(start=START_DATE_BOARD, end=END_DATE_BOARD, freq='min')

pm25_series = _median_across_sensors(dataframes_pm25, timeline)
temp_series = _median_across_sensors(dataframes_temp, timeline)
pres_series = _median_across_sensors(dataframes_pres, timeline)
rh_series = _median_across_sensors(dataframes_rh, timeline)

# The frame is built without the former empty 'data' placeholder column:
# it was never filled, so it was NaN everywhere and dropna() removed every
# row, leaving only a header in the exported CSV.
df_final = pd.DataFrame({
    'timestamp': timeline,
    'pm25': pm25_series.to_numpy(),
    'temp': temp_series.to_numpy(),
    'pres': pres_series.to_numpy(),
    'rh': rh_series.to_numpy(),
})

# Keep only the minutes for which all four measurements are available.
df_final = df_final.dropna()
df_final.to_csv(
    os.path.join(PM25_DIRECTORY, 'unique_timeseries_by_median_minutes_all_attributes.csv'),
    index=False,
)