# Preprocessing datasets

## Import packages

In [1]:
from pysus.utilities.readdbc import read_dbc

import pandas as pd
import xlrd
import glob

## Global variables

In [8]:
# Root directories: raw inputs are read from IN, converted files are written to OUT
PATHS = {
    'IN': '../datas/raw/mr-ssa',
    'OUT': '../datas/preprocessing/mr-ssa',
}

# Sub-folders (relative to the roots above), one per data source
FOLDERS = {
    'AIRQ': 'airquality/cetrel',
    'WTHR': 'weather/cetrel',
    'DSUS': 'datasus',
}

## Air quality - preprocessing

### [CETREL] List air quality Excel files

In [3]:
# Collect every .xlsx workbook in the air quality input folder.
# glob returns matches in arbitrary order, so sort them to make the
# processing order (and the listing below) reproducible across runs.
path = f"{PATHS['IN']}/{FOLDERS['AIRQ']}/*.xlsx"
f_airq = sorted(glob.glob(path))
f_airq

['../datas/raw/mr-ssa/airquality/av-acm-detran.xlsx',
 '../datas/raw/mr-ssa/airquality/barros-reis.xlsx',
 '../datas/raw/mr-ssa/airquality/campo-grande.xlsx',
 '../datas/raw/mr-ssa/airquality/itaigara.xlsx',
 '../datas/raw/mr-ssa/airquality/paralela.xlsx',
 '../datas/raw/mr-ssa/airquality/piraja.xlsx']

### [CETREL] Generate gzip-compressed CSV files

In [4]:
# Convert every sheet of every air quality workbook into its own gzip CSV,
# named "<station>-<sheet name>.csv.gz" inside the air quality output folder.
for file in f_airq:
    print(f'\nStarting {file}')

    # Open excel file
    wb = xlrd.open_workbook(file)

    # Station name = file name without its directory and '.xlsx' suffix
    station = file.split('/')[-1]
    station = station[:-5]

    for sheet_name in wb.sheet_names():
        sheet = wb.sheet_by_name(sheet_name)

        # Row 2 holds the column names, row 3 the matching units
        names = sheet.row_values(2)
        units = sheet.row_values(3)

        # Append the unit to every column name except the first one
        cols = names[:1] + [f'{names[i]} ({units[i]})' for i in range(1, len(names))]

        # Data starts at row 4; the last 8 rows of the sheet are skipped
        # (presumably a footer/summary block -- confirm against the workbooks).
        # Rows are gathered first and the frame is built once: growing a
        # DataFrame with pd.concat inside the loop is quadratic in row count.
        rows = [sheet.row_values(r) for r in range(4, sheet.nrows - 8)]
        df = pd.DataFrame(rows, columns=cols)

        # Generate output name
        f_out = f"{PATHS['OUT']}/{FOLDERS['AIRQ']}/{station}-{sheet_name}"

        # Generate gzip output file
        df.to_csv(f'{f_out}.csv.gz', index=False, compression='gzip')

        # Report the file name that was actually written (.csv.gz, not .zip)
        print(f'[DONE] {f_out}.csv.gz')

    print(f'Ending {file}\n')
print('Ending all.')


Starting ../datas/raw/mr-ssa/airquality/av-acm-detran.xlsx
[DONE] ../datas/preprocessing/mr-ssa/airquality/av-acm-detran-2013.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/av-acm-detran-2014.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/av-acm-detran-2015.zip
Ending ../datas/raw/mr-ssa/airquality/av-acm-detran.xlsx


Starting ../datas/raw/mr-ssa/airquality/barros-reis.xlsx
[DONE] ../datas/preprocessing/mr-ssa/airquality/barros-reis-2013.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/barros-reis-2014.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/barros-reis-2015.zip
Ending ../datas/raw/mr-ssa/airquality/barros-reis.xlsx


Starting ../datas/raw/mr-ssa/airquality/campo-grande.xlsx
[DONE] ../datas/preprocessing/mr-ssa/airquality/campo-grande-2013.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/campo-grande-2014.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/campo-grande-2015.zip
Ending ../datas/raw/mr-ssa/airquality/campo-grande.xlsx


Starting ../datas/raw

## Weather - preprocessing

### [CETREL] List weather Excel files

In [5]:
# Collect every .xls workbook in the weather input folder.
# glob returns matches in arbitrary order, so sort them to make the
# processing order (and the listing below) reproducible across runs.
path = f"{PATHS['IN']}/{FOLDERS['WTHR']}/*.xls"
f_wthr = sorted(glob.glob(path))
f_wthr

['../datas/raw/mr-ssa/weather/rmar-2013.xls',
 '../datas/raw/mr-ssa/weather/rmar-2014.xls',
 '../datas/raw/mr-ssa/weather/rmar-2015.xls']

### [CETREL] Generate gzip-compressed CSV files

In [6]:
# Split the first sheet of every weather workbook into one gzip CSV per
# station, named "<station>-<year>.csv.gz" inside the weather output folder.
for file in f_wthr:
    print(f'\nStarting {file}')

    # Open excel file
    wb = xlrd.open_workbook(file)

    # Year = the four characters right before the '.xls' suffix
    year = file.split('/')[-1]
    year = year[-8:-4]

    # Only the first sheet is processed
    sheet = wb.sheet_by_index(0)

    # Row 2 (from column 1 on) carries a station name per column; count how
    # many columns each station has, keeping first-appearance order.
    # NOTE(review): the slicing below assumes each station's columns are
    # contiguous in the sheet -- confirm against the workbooks.
    stations_c = {}
    for name in sheet.row_values(2, start_colx=1):
        stations_c[name] = stations_c.get(name, 0) + 1

    # Header of the shared first column (row 3, column 0)
    index_col = sheet.cell_value(3, 0)

    # Data starts at row 5; the last 8 rows of the sheet are skipped
    # (presumably a footer/summary block -- confirm against the workbooks).
    data_rows = range(5, sheet.nrows - 8)

    # Sliding [start_colx, end_colx) window over the current station's columns
    colx = dict(start_colx=1, end_colx=0)
    for st in stations_c:
        colx['end_colx'] = colx['start_colx'] + stations_c[st]

        # Row 3 holds the variable names, row 4 the matching units
        names = sheet.row_values(3, **colx)
        units = sheet.row_values(4, **colx)

        # Shared first column followed by "<variable> (<unit>)" columns
        cols = [index_col] + [f'{names[i]} ({units[i]})' for i in range(len(names))]

        # Each record is the shared first-column value plus this station's
        # slice of the row. Records are gathered first and the frame is built
        # once: growing a DataFrame with pd.concat inside the loop is
        # quadratic in row count.
        records = [
            [sheet.cell_value(r, 0)] + sheet.row_values(r, **colx)
            for r in data_rows
        ]
        df = pd.DataFrame(records, columns=cols)

        # Generate output name
        f_out = f"{PATHS['OUT']}/{FOLDERS['WTHR']}/{st}-{year}.csv.gz"

        # Generate gzip output file
        df.to_csv(f_out, index=False, compression='gzip')

        # The next station starts where this one ended
        colx['start_colx'] = colx['end_colx']
        print(f'[DONE] {f_out}')

    print(f'Ending {file}\n')
print('Ending all.')


Starting ../datas/raw/mr-ssa/weather/rmar-2013.xls
[DONE] ../datas/preprocessing/mr-ssa/weather/AV. ACM - DETRAN-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/AV. BARROS REIS-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/CAMPO GRANDE-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/DIQUE DO TORORO-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/ITAIGARA-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/PARALELA-CAB-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/PIRAJA-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/RIO VERMELHO-2013.csv.gz
Ending ../datas/raw/mr-ssa/weather/rmar-2013.xls


Starting ../datas/raw/mr-ssa/weather/rmar-2014.xls
[DONE] ../datas/preprocessing/mr-ssa/weather/AV. ACM - DETRAN-2014.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/AV. BARROS REIS-2014.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/CAMPO GRANDE-2014.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/DIQUE DO TORORO-2014.csv

## Datasus - preprocessing

In [7]:
# TODO: Datasus preprocessing is not implemented yet. The call below is kept
# disabled because `filename` is not defined anywhere in this notebook; assign
# it the path of a .dbc file before enabling the line.
# df = read_dbc(filename, encoding='iso-8859-1')

NameError: name 'filename' is not defined