In [1]:
# Import packages
import pandas as pd
import xlrd
import glob

In [2]:
# Global variables
PATH_AQ = '../datas/raw/mr-ssa/airquality'
PATH_WT = '../datas/raw/mr-ssa/weather'
AIRQ_OUT = '../datas/preprocessing/mr-ssa/airquality'
WTHR_OUT = '../datas/preprocessing/mr-ssa/weather'

In [3]:
# Function to generate csv from xlsx
def gen_csv_aq(files_in: [str], path_out: str):
    for file in files_in:
        print(f'Starting {file}')
        
        # Open excel file
        wb = xlrd.open_workbook(file)
        
        # Get station name
        station = file.split('/')[-1]
        station = station[:-5]
        
        # Get all sheets
        sheet_names = wb.sheet_names()
        
        for sheet_name in sheet_names:
            # Get sheet
            sheet = wb.sheet_by_name(sheet_name)

            # Rows number
            nrows = sheet.nrows

            # Get cols names
            cols = sheet.row_values(2)

            # Get units
            units = sheet.row_values(3)

            # Concatenate cols with units
            for i in range(1, len(cols)):
                cols[i] = f'{cols[i]} ({units[i]})'

            # Init dataframe
            df = pd.DataFrame(columns=cols)

            # Get only valid datas
            for r in range(4, nrows-8):
                aux = pd.DataFrame([sheet.row_values(r)], columns=cols)
                df = pd.concat([df, aux], ignore_index=True)

            # Generate output name
            f_out = f'{path_out}/{station}-{sheet_name}'
            
            # Generate gzip output file
            df.to_csv(f'{f_out}.csv.gz', index=False, compression='gzip')
            
            print(f'[DONE] {f_out}.zip')
            
        print(f'Ending {file}')
    print('Ending all.')

In [4]:
# Function to generate csv from xls
def gen_csv_wt(files_in: [str], path_out: str):
    for file in files_in:
        print(f'Starting {file}')

        # Open excel file
        wb = xlrd.open_workbook(file)

        # Get year
        year = file.split('/')[-1]
        year = year[-8:-4]

        # Get sheet
        sheet = wb.sheet_by_index(0)

        # Rows number
        nrows = sheet.nrows

        # Get cols names with counts
        stations = sheet.row_values(2, start_colx=1)
        stations_u = list(dict.fromkeys(stations))
        stations_c = { x: stations.count(x) for x in stations_u}

        index_col = sheet.cell_value(3,0)

        colx = dict(start_colx=1, end_colx=0)
        for st in stations_c:
            colx['end_colx'] = colx['start_colx'] + stations_c[st]

            # Get cols names
            cols = [index_col]
            cols.extend(sheet.row_values(3, **colx))

            # Get units
            units = sheet.row_values(4, **colx)

            # Concatenate cols with units
            for i in range(1, len(cols)):
                cols[i] = f'{cols[i]} ({units[i-1]})'

            # Init dataframe
            df = pd.DataFrame(columns=cols)

            # Get only valid datas
            for r in range(5, nrows-8):
                register = [sheet.cell_value(r, 0)]
                register.extend(sheet.row_values(r, **colx))
                aux = pd.DataFrame([register], columns=cols)
                df = pd.concat([df, aux], ignore_index=True)

            # Generate output name
            f_out = f'{WTHR_OUT}/{st}-{year}.csv.gz'

            # Generate gzip output file
            df.to_csv(f_out, index=False, compression='gzip')

            colx['start_colx'] = colx['end_colx']
            print(f'[DONE] {f_out}')
            
        print(f'Ending {file}')
    print('Ending all.') 

In [5]:
# Get air quality excel files
f_airq = [f for f in glob.glob(PATH_AQ + "/*.xlsx")]
f_airq

['../datas/raw/mr-ssa/airquality/av-acm-detran.xlsx',
 '../datas/raw/mr-ssa/airquality/barros-reis.xlsx',
 '../datas/raw/mr-ssa/airquality/campo-grande.xlsx',
 '../datas/raw/mr-ssa/airquality/itaigara.xlsx',
 '../datas/raw/mr-ssa/airquality/paralela.xlsx',
 '../datas/raw/mr-ssa/airquality/piraja.xlsx']

In [6]:
gen_csv_aq(f_airq, AIRQ_OUT)

Starting ../datas/raw/mr-ssa/airquality/av-acm-detran.xlsx
[DONE] ../datas/preprocessing/mr-ssa/airquality/av-acm-detran-2013.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/av-acm-detran-2014.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/av-acm-detran-2015.zip
Ending ../datas/raw/mr-ssa/airquality/av-acm-detran.xlsx
Starting ../datas/raw/mr-ssa/airquality/barros-reis.xlsx
[DONE] ../datas/preprocessing/mr-ssa/airquality/barros-reis-2013.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/barros-reis-2014.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/barros-reis-2015.zip
Ending ../datas/raw/mr-ssa/airquality/barros-reis.xlsx
Starting ../datas/raw/mr-ssa/airquality/campo-grande.xlsx
[DONE] ../datas/preprocessing/mr-ssa/airquality/campo-grande-2013.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/campo-grande-2014.zip
[DONE] ../datas/preprocessing/mr-ssa/airquality/campo-grande-2015.zip
Ending ../datas/raw/mr-ssa/airquality/campo-grande.xlsx
Starting ../datas/raw/mr-ssa

In [7]:
# Get weather excel files
f_wthr = [f for f in glob.glob(PATH_WT + "/*.xls")]
f_wthr

['../datas/raw/mr-ssa/weather/rmar-2013.xls',
 '../datas/raw/mr-ssa/weather/rmar-2014.xls',
 '../datas/raw/mr-ssa/weather/rmar-2015.xls']

In [8]:
gen_csv_wt(f_wthr, WTHR_OUT)

Starting ../datas/raw/mr-ssa/weather/rmar-2013.xls
[DONE] ../datas/preprocessing/mr-ssa/weather/AV. ACM - DETRAN-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/AV. BARROS REIS-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/CAMPO GRANDE-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/DIQUE DO TORORO-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/ITAIGARA-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/PARALELA-CAB-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/PIRAJA-2013.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/RIO VERMELHO-2013.csv.gz
Ending ../datas/raw/mr-ssa/weather/rmar-2013.xls
Starting ../datas/raw/mr-ssa/weather/rmar-2014.xls
[DONE] ../datas/preprocessing/mr-ssa/weather/AV. ACM - DETRAN-2014.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/AV. BARROS REIS-2014.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/CAMPO GRANDE-2014.csv.gz
[DONE] ../datas/preprocessing/mr-ssa/weather/DIQUE DO TORORO-2014.csv.gz