# Data Download and Preprocessing

The purpose of this notebook is to apply the functions in `data/download.py` and `data/preprocess.py`.

In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import sys
sys.path.append('..')

%load_ext autoreload
%autoreload 2

import warnings
import tqdm
import pandas as pd

warnings.simplefilter(action="ignore", category=tqdm.TqdmExperimentalWarning)
warnings.simplefilter(action="ignore", category=pd.core.common.SettingWithCopyWarning)

## 1. Set the data directory

The data directory can be changed in the `config.json` file.

In [None]:
# Look up the data directory through the project's `get_data_dir` helper.
from data import get_data_dir

DATA_DIR = get_data_dir()

# Show the resolved location before anything is downloaded into it.
print(f'Data will be downloaded to: {DATA_DIR}')

## 2. Download the data

Use the requests module to access the online data and save it to `DATA_DIR`

In [None]:
# Run the project's fire-data download helper, passing it DATA_DIR.
from data.download import download_fire_data

download_fire_data(DATA_DIR)

In [None]:
# Run the project's GLOBE-data download helper, passing it DATA_DIR.
from data.download import download_GLOBE_data

download_GLOBE_data(DATA_DIR)

In [None]:
# Run the project's met-data download helper, passing it DATA_DIR.
from data.download import download_met_data

download_met_data(DATA_DIR)

## 3. Preprocess the data

Processed data will be saved to `data/processed`. An optional transform can be applied to the data: `'log'`, `'quantile'`, `'norm'`, or `None`.

In [None]:
# Run FirePipeline once per transform option (including no transform)
# and call `to_csv()` on each processed result.

from data.preprocess import FirePipeline

fire_transforms = ['log', 'quantile', 'norm', None]
transform_pbar = tqdm.notebook.tqdm(fire_transforms, leave=False)

for transform in transform_pbar:
    # Keep the progress-bar label in sync with the transform being processed.
    transform_pbar.set_description(f'Transform: {transform}')

    fire_pipeline = FirePipeline(DATA_DIR, transform=transform)
    fire_processed = fire_pipeline.process()
    fire_processed.to_csv()

In [None]:
# Run MetPipeline for every (transform, metric) combination and call
# `to_csv()` on each processed result.

from data.preprocess import MetPipeline

met_transforms = ['log', 'quantile', 'norm', None]
met_metrics = [
    'Ozone', 'SO2', 'CO', 'NO2', 'PM25', 'PM10',
    'Wind', 'Pressure', 'Temperature', 'Humidity',
]

transform_pbar = tqdm.notebook.tqdm(met_transforms, leave=False)
for transform in transform_pbar:
    transform_pbar.set_description(f'Transform: {transform}')

    # A fresh inner progress bar is created for each transform.
    metric_pbar = tqdm.notebook.tqdm(met_metrics, leave=False)
    for metric in metric_pbar:
        metric_pbar.set_description(f'Metric: {metric}')

        met_pipeline = MetPipeline(DATA_DIR, metric, transform=transform)
        met_processed = met_pipeline.process()
        met_processed.to_csv()

In [24]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook, and
# this cell's execution count (24) is out of sequence with the unexecuted cells
# above. It presumably relies on leftover kernel state and would raise NameError
# on Restart & Run All — confirm where `df` is built, or remove this cell.
df

Unnamed: 0_level_0,Ozone,CO,NO2,PM2.5,PM10
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GLS KGR,0.6444,0.8207,0.6912,0.7512,0.6899
KGR,0.6539,0.8438,0.7201,0.7809,0.6728
GLS KGR (GF),0.6835,0.8602,0.7392,0.7859,0.7008
KGR (GF),0.6706,0.8685,0.7703,0.8003,0.6957
Ridge,0.7298,0.9704,0.7316,0.7819,0.7195
Lasso,0.7283,0.9881,0.7296,0.7763,0.7222
OLS,1.1276,2.4638,1.0311,1.1715,1.1784
