# Data preprocessing

In [None]:
import numpy as np
import pandas as pd

# Directory scanned for sub-folders of raw XML exports (also holds the 2018 XML).
data_raw = '../nosync/data_raw'
# Output of the XML -> CSV conversion; input of the preprocessing pipeline.
data_csv_only_converted = '../nosync/data_csv_full_only_converted'
# Output directory of the preprocessing pipeline.
data_csv_transformed = '../nosync/data_csv'
# Not referenced in the visible part of this notebook.
data_csv_reduced = '../nosync/data_csv_reduced'

## Convert XML to CSV

In [None]:
import pandas as pd

from xml.etree import ElementTree as ET

def transform(file):
    """Convert one inspection XML export into a DataFrame.

    The file is first rewritten in place with all newline characters
    removed, then streamed with ``ElementTree.iterparse``. Every ``record``
    element becomes one row built from its attributes; an attribute that is
    absent yields an empty string.

    Args:
        file: Path of the XML file. NOTE: the file on disk is overwritten
            with its newline-free content.

    Returns:
        pandas.DataFrame with the string columns STK, DrTP, VIN, DatKont,
        TypMot, TZn, DrVoz, ObchOznTyp, Ct, DatPrvReg, Km, Zavady, VyslSTK
        and VyslEmise (in this order).

    Raises:
        IndexError: If a record has no ``STK`` attribute and its ``CisP``
            attribute contains no ``-`` separator.
    """
    input_file = file

    # Removing newlines (in place). Context managers guarantee both handles
    # are closed even if reading or writing fails.
    with open(input_file, encoding='utf8') as source:
        clean = source.read().replace('\n', '')
    with open(input_file, 'w', encoding='utf8') as target:
        target.write(clean)
    del clean  # the whole file content is no longer needed in memory

    # Attributes copied verbatim into a column of the same name.
    plain_fields = ('DrTP', 'VIN', 'DatKont', 'TZn', 'TypMot', 'DrVoz',
                    'ObchOznTyp', 'Ct', 'DatPrvReg', 'Km', 'VyslEmise')
    # Key order defines the column order of the resulting DataFrame.
    columns = {name: [] for name in (
        'STK', 'DrTP', 'VIN', 'DatKont', 'TypMot', 'TZn', 'DrVoz',
        'ObchOznTyp', 'Ct', 'DatPrvReg', 'Km', 'Zavady', 'VyslSTK',
        'VyslEmise')}

    # Parsing: iterparse yields each element once it has been read completely.
    for _event, element in ET.iterparse(input_file):
        if element.tag != 'record':
            continue
        attrs = element.attrib

        # Number of STK (station itself): either given directly or taken
        # from the second dash-separated part of `CisP`.
        if 'STK' in attrs:
            station = attrs['STK']
        elif 'CisP' in attrs:
            station = attrs['CisP'].split('-')[1]
        else:
            station = ''
        columns['STK'].append(station)

        for name in plain_fields:
            columns[name].append(attrs.get(name, ''))

        # Raw defect list. Records without `Zav` get an empty string; the
        # per-severity counts are computed later from this raw field.
        columns['Zavady'].append(attrs.get('Zav', ''))

        # The inspection result is stored as `VyslSTK` or as `Vysl`.
        columns['VyslSTK'].append(attrs.get('VyslSTK', attrs.get('Vysl', '')))

        # Drop the element's content to keep memory usage down.
        element.clear()

    data = pd.DataFrame(columns)

    return data

In [None]:
import os

# Load raw xmls: convert every XML file found in a sub-directory of
# `data_raw` into `<year>-<month>.csv` inside `data_csv_only_converted`.
dirs_raw = {}  # currently unused; kept so existing references keep working

# Make sure the output directory exists before the first `to_csv`.
os.makedirs(data_csv_only_converted, exist_ok=True)

# `subdir` instead of `dir`: the loop variable stays bound after the loop
# and would otherwise shadow the builtin `dir` for the rest of the session.
for subdir in os.listdir(data_raw):
    if not os.path.isdir(f'{data_raw}/{subdir}'):
        continue
    for instance in os.listdir(f'{data_raw}/{subdir}'):
        # Year and month are the 3rd and 4th underscore-separated parts of
        # the file name (before the first dot).
        name_parts = instance.split('.')[0].split('_')
        year, month = name_parts[2], name_parts[3]
        print(f'{year}-{month}')
        data = transform(f'{data_raw}/{subdir}/{instance}')
        data.to_csv(f'{data_csv_only_converted}/{year}-{month}.csv')

In [None]:
# Handle 2018 separately to split months.
# This needs a few gigs of RAM.
all_data = transform(f'{data_raw}/Seznam_prohlídek_STK_2018.xml')
for month in range(1, 13):
    datepadded = f'2018-{month:02d}'
    # Vectorized mask of rows whose inspection date starts with the current
    # month; no helper column is added to `all_data`.
    in_month = all_data['DatKont'].str.startswith(datepadded, na=False)
    all_data[in_month].to_csv(f'{data_csv_only_converted}/{datepadded}.csv')

## Unify and preprocess column values

In [None]:
# Prepare the processing pipeline

# Only CSV files are datasets; other directory entries (hidden files,
# checkpoint folders, ...) would fail in `pd.read_csv`. Sorting makes the
# processing order deterministic.
dataset_filenames = sorted(
    filename
    for filename in os.listdir(data_csv_only_converted)
    if filename.endswith('.csv')
)
# One (initially empty) list of processing functions per dataset file.
pipelines = {filename: [] for filename in dataset_filenames}

# def apply_func(dataset_path, func, tmp=None):
#     """Apply a single function to a CSV dataset. Inefficient loading, use this only for testing."""
#     df = pd.read_csv(dataset_path, index_col=0)
#     x = dataset_path.split('/')[-1].split('.')[-2].split('-')
#     year = x[0]
#     month = x[1]
#     df = func(df)

#     if tmp == 'tmp':
#         df.to_csv(dataset_path + '_tmp.csv')
#     df.to_csv(dataset_path)

# # For mem only computations
# datasets = [pd.read_csv(path, index_col=0) for path in dataset_filenames]
# def apply_func_to_all(func, tmp = 'mem'):
#     for df in datasets:
#         df = func(df)

def apply_pipeline(dataset_filename, pipeline):
    """Run all steps of `pipeline` on one converted CSV and store the result.

    The dataset is read from `data_csv_only_converted`, passed through each
    function in order (the name of every step is printed) and written under
    the same file name to `data_csv_transformed`.
    """
    frame = pd.read_csv(f'{data_csv_only_converted}/{dataset_filename}', index_col=0)
    for step in pipeline:
        print(f'  {step.__name__}')
        frame = step(frame)
    frame.to_csv(f'{data_csv_transformed}/{dataset_filename}')

def append_pipeline_for_all(func):
    """Add `func` as the next step of the pipeline of every known dataset."""
    for name in dataset_filenames:
        steps = pipelines[name]
        steps.append(func)

def run():
    """Process every dataset with its own pipeline, printing its file name first."""
    for name in pipelines:
        print(name)
        apply_pipeline(name, pipelines[name])

In [None]:
# Replace `---` with np.nan
###### Drop records with missing essential values.

def process_missing(df):
    """Return a copy of `df` with the placeholders `---` and `''` replaced by NaN.

    Both placeholders are handled in a single call; `np.nan` is used because
    the `np.NaN` alias no longer exists in NumPy 2.
    """
    return df.replace(['---', ''], np.nan)

# Register the step in the pipeline of every dataset.
append_pipeline_for_all(process_missing)
# apply_func_to_all(process_missing)

In [None]:
# Convert DatKont and DatPrvReg to pandas datetimes and add a weekday indicator (DTKont).
# The date format of the source data depends on the period:
# 2018-01 => 2019-03  yyyy-mm-ddThh:mm:ss.sss
# 2019-04 => 2019-07  mm/dd/yyyy
# 2019-08+            dd.mm.yyyy

from datetime import date

def add_weekday(df: pd.DataFrame) -> pd.DataFrame:
    """Add column `DTKont` with the ISO weekday of `DatKont`.

    Args:
        df: Frame whose `DatKont` column holds datetimes.

    Returns:
        The same frame (modified in place). `DTKont` is 1 for Monday up to
        7 for Sunday and NaN where `DatKont` is missing.
    """
    # `dayofweek` counts from Monday=0; shift to the ISO numbering.
    df['DTKont'] = df['DatKont'].dt.dayofweek + 1
    return df

def strip_time_v1(df):
    """Parse `DatKont` and `DatPrvReg` given as `yyyy-mm-dd` plus a time part.

    `exact=False` lets the date pattern match although a time suffix
    follows; values that cannot be parsed become NaT. The weekday column is
    added afterwards via `add_weekday`.
    """
    for column in ('DatKont', 'DatPrvReg'):
        df[column] = pd.to_datetime(
            df[column], format='%Y-%m-%d', exact=False, errors='coerce'
        )
    return add_weekday(df)

def strip_time_v2(df):
    """Parse `DatKont` and `DatPrvReg` given as `mm/dd/yyyy`.

    Values that do not match the format exactly become NaT. The weekday
    column is added afterwards via `add_weekday`.
    """
    for column in ('DatKont', 'DatPrvReg'):
        df[column] = pd.to_datetime(
            df[column], format='%m/%d/%Y', exact=True, errors='coerce'
        )
    return add_weekday(df)

def strip_time_v3(df):
    """Parse `DatKont` and `DatPrvReg` given as `dd.mm.yyyy`.

    Values that do not match the format exactly become NaT. The weekday
    column is added afterwards via `add_weekday`.
    """
    for column in ('DatKont', 'DatPrvReg'):
        df[column] = pd.to_datetime(
            df[column], format='%d.%m.%Y', exact=True, errors='coerce'
        )
    return add_weekday(df)

# Pick the date parser for each dataset from the period in its file name.
for dfn in dataset_filenames:
    if any(period in dfn for period in ('2018', '2019-01', '2019-02', '2019-03')):
        pipelines[dfn].append(strip_time_v1)
    elif any(period in dfn for period in ('2019-04', '2019-05', '2019-06', '2019-07')):
        pipelines[dfn].append(strip_time_v2)
    else:
        # Everything else uses the newest format.
        pipelines[dfn].append(strip_time_v3)

In [None]:
# Add counts of failures by type.

# Load the defect code list (ciselnik_zavad), indexed by the defect code `Kod`.
cz = pd.read_csv('../../defect_list/defect_list.csv').set_index('Kod')

def count_failures(df: pd.DataFrame, defect_list=None) -> pd.DataFrame:
    """Add ZavA, ZavB, ZavC (count of failures by severity) based on `Zavady`.

    Args:
        df: Frame with a `Zavady` column holding comma-separated defect
            codes (or a missing value).
        defect_list: Table indexed by defect code with a `Typ` column
            containing 'A', 'B' or 'C'. Defaults to the module-level `cz`.

    Returns:
        The same frame (modified in place) with the integer columns `ZavA`,
        `ZavB` and `ZavC`. Codes that are not in the defect list, or whose
        type is not A/B/C, are ignored; a missing `Zavady` counts as no defect.
    """
    if defect_list is None:
        defect_list = cz
    # Build the code -> severity lookup once instead of a `.loc` per code.
    severity_of = defect_list['Typ'].to_dict()

    def tally(raw_codes):
        counts = {'A': 0, 'B': 0, 'C': 0}
        # `str()` turns a missing value into 'nan', which matches no code.
        for code in str(raw_codes).split(','):
            # Surrounding whitespace can never be part of a valid code.
            severity = severity_of.get(code.strip())
            if severity in counts:
                counts[severity] += 1
        return counts

    tallies = [tally(raw_codes) for raw_codes in df['Zavady']]
    for severity in ('A', 'B', 'C'):
        df[f'Zav{severity}'] = [counts[severity] for counts in tallies]
    return df

# Register the step in the pipeline of every dataset.
append_pipeline_for_all(count_failures)
# apply_func_to_all(count_failures)

In [None]:
# Add counts of failures by first-level category

def count_failures_by_fl_cat(df: pd.DataFrame) -> pd.DataFrame:
    """Add Zav0-Zav9 (count of failures by first-level category) based on `Zavady`.

    The first-level category of a defect code is the integer in front of
    its first dot.

    Args:
        df: Frame with a `Zavady` column holding comma-separated defect
            codes (or a missing value).

    Returns:
        The same frame (modified in place) with the integer columns `Zav0`
        to `Zav9`. Codes whose first part is not an integer in 0..9 are
        ignored.
    """

    def tally(raw_codes):
        counts = [0] * 10
        # `str()` turns a missing value into 'nan', which is skipped below.
        for code in str(raw_codes).split(','):
            try:
                category = int(code.split('.')[0])
            except ValueError:
                continue  # not a numeric category
            if 0 <= category <= 9:
                counts[category] += 1
        return counts

    tallies = [tally(raw_codes) for raw_codes in df['Zavady']]
    for category in range(10):
        df[f'Zav{category}'] = [counts[category] for counts in tallies]
    return df

# Register the step in the pipeline of every dataset.
append_pipeline_for_all(count_failures_by_fl_cat)

In [None]:
# Add vehicle age in days. (float to support nan)
# TODO: Add float age in years for analysis?

from datetime import datetime

def count_vehicle_age(df: pd.DataFrame, reference_date=None) -> pd.DataFrame:
    """Add column `StariDnu`: whole days between `DatPrvReg` and a reference date.

    Args:
        df: Frame whose `DatPrvReg` column holds datetimes.
        reference_date: Moment the age is measured at. Defaults to the
            current local date and time, so results depend on when the
            pipeline runs unless a fixed value is passed.

    Returns:
        The same frame (modified in place). `StariDnu` is NaN where
        `DatPrvReg` is missing.
    """
    reference = pd.Timestamp(datetime.today() if reference_date is None else reference_date)

    # Vectorized difference; `.dt.days` keeps only the whole-day component.
    df['StariDnu'] = (reference - df['DatPrvReg']).dt.days

    return df

# Register the step in the pipeline of every dataset.
append_pipeline_for_all(count_vehicle_age)
# apply_func_to_all(count_vehicle_age)

In [None]:
# From here, preprocessing for analysis begins to mix with preprocessing just for data storage.

# Process categorical indicators

# 1. Gather all possible values, save them.
# Names of the categorical columns and, per column, the set of values seen
# so far across all processed datasets.
cat_ind = ['STK', 'DrTP', 'TypMot', 'TZn', 'DrVoz', 'ObchOznTyp', 'Ct']
cat_ind_vals = {cat: set() for cat in cat_ind}

def gather_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """Record the distinct values of every categorical column in `cat_ind_vals`.

    Args:
        df: Frame containing all columns listed in `cat_ind`.

    Returns:
        `df` itself, unchanged; the function only updates `cat_ind_vals`.

    Raises:
        KeyError: If a column listed in `cat_ind` is missing from `df`.
    """
    for cat in cat_ind:
        column = df[cat]
        # `tolist()` yields plain Python scalars, which `json.dump` accepts.
        cat_ind_vals[cat].update(column.dropna().unique().tolist())
        # Missing values are represented by the single `np.nan` object:
        # distinct NaN float objects would each be stored separately
        # because NaN never compares equal to another NaN.
        if column.isna().any():
            cat_ind_vals[cat].add(np.nan)
    return df

# Register the step in the pipeline of every dataset.
append_pipeline_for_all(gather_categorical)

# # 2. Drop currently unusable variables.
# def drop_useless(df: pd.DataFrame) -> pd.DataFrame:
#     df = df.drop(['TypMot'])

#     return df

# 2. Convert categorical values to dummy variables
#    Adjust this step to include only variables that are needed for analysis

# 3. Convert VyslSTK, VyslEmise to an ordinal value
#    VyslSTK:
#      zpusobile: 0
#      castecne zpusobile: 1
#      nezpusobile: 2
#    VyslEmise:
#      vyhovuje: 0
#      nevyhovuje: 1

In [None]:
# Run the pipeline: every dataset is read, transformed by its registered
# steps and written to the output directory.
run()

# Save gathered categorical values.
# Each set is turned into a list because sets cannot be serialized to JSON;
# the order of the values within a list is therefore arbitrary.
# The file is written relative to the current working directory.
import json
with open('categories.json', "w") as fp:
    json.dump({cat: list(cat_ind_vals[cat]) for cat in cat_ind}, fp)

In [None]:
# Number of distinct STK values gathered (shown as the cell output).
len(cat_ind_vals['STK'])