# Introduction
This notebook is for collecting raw data and updating interim data with the previous

### Get source folder and append to sys directory

In [3]:
from __future__ import print_function
import os
import sys
PROJ_ROOT = os.path.join(os.pardir)
print(os.path.abspath(PROJ_ROOT))
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)
# Data path example
# pump_data_path = os.path.join(PROJ_ROOT,
#                              "data",
#                              "raw",
#                              "pumps_train_values.csv")

/mnt/4ba37af6-51fd-47bc-8321-8c500c229114/study/School/KHOA LUAN TOT NGHIEP/runnable_program


### Imports
Import libraries and write settings here.

In [2]:
# Data manipulation
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30
# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython import get_ipython
ipython = get_ipython()
# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload
%autoreload 1
# Use %aimport module to reload each module

# Visualizations
import matplotlib.pyplot as plt

# Analysis/Modeling
Do work here

In [4]:
import requests
import pandas as pd
_site_URL = "https://moitruongthudo.vn/api/site"

r = requests.get(url=_site_URL)
site_data = r.json()
site_data = pd.DataFrame(site_data)
site_data_path = os.path.join(PROJ_ROOT,
                              "data",
                              "raw",
                              "site_data",
                              "site_data.csv")
site_data[['id', 'name', 'address', 'latitude', 'longtitude',
           'ref_id']].to_csv(site_data_path, index=False)

### Get raw data from moitruongthudo.vn

In [6]:
import datetime
import csv
import asyncio
import requests
import pandas as pd
import time

tic = time.time()
_stat_URL = "https://moitruongthudo.vn/public/dailystat/"
_site_id = pd.DataFrame(site_data['id'])

async def get_indv_data(parameter, site_id):
    r = requests.get(url = _stat_URL + parameter + '/', params = {'site_id': site_id})
    data = r.json()
    data = pd.DataFrame(data)
    data['parameter'] = parameter
    return data

async def data_processing(all_data, site_id, latest_time):
    all_data = pd.concat(all_data)
    all_data['time'] = pd.to_datetime(all_data['time'], format="%Y-%m-%d %H:%M:%S")
    all_data = all_data[all_data['time'] > latest_time]
    all_data['site_id'] = site_id
    all_data = pd.pivot_table(all_data, values = 'value', index = ['site_id', 'time'], columns=['parameter'],
              aggfunc='sum')
#     print(all_data)
    return all_data

async def get_site_data(site_id, latest_time):
    parameters = ['NO2','SO2','CO','PM2.5','PM10','O3']
    site_data = pd.DataFrame()
    all_data = await asyncio.gather(*(get_indv_data(p, site_id) for p in parameters))
    site_data = await data_processing(all_data, site_id, latest_time)
    return site_data

async def update_raw_files(site_id):
    csv_path_name = os.path.join(PROJ_ROOT,
                                "data",
                                "raw",
                                "{}.csv".format(site_id))
    try:
        exist_data = pd.read_csv(csv_path_name)
    except FileNotFoundError: 
        print("No data for site {}".format(site_id))
        exist_data = []
    if len(exist_data) != 0:
        # Convert column to date
        exist_data['time'] = pd.to_datetime(exist_data['time'], format="%Y-%m-%d %H:%M:%S")
        # Find the latest datetime
        latest_time = exist_data['time'].max()
        # Get data for site
        site_data = await get_site_data(site_id, latest_time)
        # Write data to file
        site_data.to_csv(csv_path_name, header=False, mode='a')
        print('done: {} site'.format(site_id))
    return None

tasks = list(_site_id.apply(lambda site_id: update_raw_files(site_id.values[0]), axis=1))
all_done = await asyncio.gather(*tasks)
toc = time.time()
print('total time in ms: {}ms'.format(1000 * (toc - tic)))

done: 1 site
done: 7 site
done: 8 site
done: 9 site
done: 10 site
done: 11 site
done: 12 site
done: 13 site
done: 14 site
done: 24 site
done: 25 site
done: 26 site
done: 27 site
done: 28 site
done: 29 site
done: 30 site
done: 31 site
done: 32 site
done: 33 site
done: 34 site
done: 35 site
done: 36 site
done: 37 site
done: 38 site
done: 39 site
done: 40 site
done: 41 site
done: 42 site
done: 43 site
done: 44 site
done: 45 site
done: 46 site
done: 47 site
total time in ms: 160050.95529556274ms


### Update interim data with newly collected data

In [8]:
# Job: take last time from interim_data
import utilities
from importlib import reload
import utilities
import pandas as pd
import numpy as np
import glob
import xarray as xr

utilities = reload(utilities)

idx = pd.IndexSlice


def update_interim_files(site_id):
    _raw_path_name = os.path.join(PROJ_ROOT,
                                 "data",
                                 "raw",
                                 "{}.csv".format(site_id))
    _interim_path_name = os.path.join(PROJ_ROOT,
                                     "data",
                                     "interim",
                                     "{}.csv".format(site_id))
    try:
        raw_data = pd.read_csv(
            _raw_path_name, parse_dates=True, index_col=['site_id', 'time'])
        interim_data = pd.read_csv(
            _interim_path_name, parse_dates=True, index_col=['site_id', 'time'])
    except FileNotFoundError:
        print("No data for site {}".format(site_id))
        raw_data = []
        interim_data = []
    if len(raw_data) != 0:
        # Find the latest datetime
        raw_latest_time = raw_data.index.get_level_values(1).max()
        interim_latest_time = interim_data.index.get_level_values(1).max()
        # Trim raw_data time/ features
        raw_data = raw_data[raw_data.index.get_level_values(
            1) >= (interim_latest_time - pd.Timedelta(hours=12))]
        raw_data = raw_data[['CO', 'NO2', 'PM25']]
        # Calculate AQI
        AQI = utilities.calculate_AQI_h(raw_data)
        # Trim AQI to interim latest time
        AQI = AQI[AQI.index.get_level_values(1) > interim_latest_time]
        # Write data to file
        AQI.to_csv(_interim_path_name, header=False, mode='a')
        print('done: {} site'.format(site_id))
    return None


tasks = list(_site_id.apply(
    lambda site_id: update_interim_files(site_id.values[0]), axis=1))

Done calculating NowCast for site 1
done: 1 site
Done calculating NowCast for site 7
done: 7 site
Done calculating NowCast for site 8
done: 8 site
Done calculating NowCast for site 9
done: 9 site
Done calculating NowCast for site 10
done: 10 site
Done calculating NowCast for site 11
done: 11 site
Done calculating NowCast for site 12
done: 12 site
Done calculating NowCast for site 13
done: 13 site


  w = min_value / max_value


Done calculating NowCast for site 14
done: 14 site
Done calculating NowCast for site 24
done: 24 site
Done calculating NowCast for site 25
done: 25 site
Done calculating NowCast for site 26
done: 26 site
Done calculating NowCast for site 27
done: 27 site
Done calculating NowCast for site 28
done: 28 site
Done calculating NowCast for site 29
done: 29 site
Done calculating NowCast for site 30
done: 30 site
Done calculating NowCast for site 31
done: 31 site
Done calculating NowCast for site 32
done: 32 site
Done calculating NowCast for site 33
done: 33 site
Done calculating NowCast for site 34
done: 34 site
Done calculating NowCast for site 35
done: 35 site
Done calculating NowCast for site 36
done: 36 site
Done calculating NowCast for site 37
done: 37 site
Done calculating NowCast for site 38
done: 38 site
Done calculating NowCast for site 39
done: 39 site
Done calculating NowCast for site 40
done: 40 site
Done calculating NowCast for site 41
done: 41 site
Done calculating NowCast for si