# Clean and merge data

In [1]:
# Standard library
import os

# 3rd Party Libraries
import pandas as pd
import numpy as np

In [2]:
def clean_data(df):
    """Return a cleaned copy of one race-results table.

    The caller's frame is left untouched. The returned frame:

    * drops every row whose ``age`` is missing and casts ``age`` to int64;
    * gains a ``time_seconds`` int64 column parsed from the ``time`` column;
    * no longer contains the ``name`` and ``bib`` columns.

    ``time`` is read as colon-separated integer components whose last
    component is seconds, preceded by optional minutes and hours
    (e.g. ``'MM:SS'`` or ``'H:MM:SS'``). Absent leading components count
    as zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``time``, ``age``, ``name`` and ``bib``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, keeping their original index labels.

    Raises
    ------
    ValueError
        If a row with a known age has a missing ``time``, or if a time
        component is not an integer string.
    KeyError
        If one of the required columns is absent.
    """
    # Remove records missing age; copy so the caller's frame is not mutated.
    cleaned = df.dropna(subset=['age']).copy()

    # A missing time would otherwise be turned silently into 0 seconds.
    if cleaned['time'].isna().any():
        raise ValueError("clean_data: 'time' is missing for at least one row")

    # Split once into lists and index from the right, so 'MM:SS' and
    # 'H:MM:SS' both line up on seconds regardless of how many components
    # the other rows in the file have.
    parts = cleaned['time'].str.split(':')

    def _component(position):
        # .str.get yields NaN when a row has no component at that position.
        return parts.str.get(position).fillna('0').astype('int64')

    cleaned['time_seconds'] = (
        _component(-1) + _component(-2) * 60 + _component(-3) * 3600
    )

    cleaned['age'] = cleaned['age'].astype('int64')

    # Drop unneeded fields
    return cleaned.drop(columns=['name', 'bib'])


In [6]:
# Directory the race CSVs are read from.
data_dir = 'data'

# Years available for each race distance, listed in the order the files are
# loaded (that order fixes the row order of the merged results).
race_years = (
    ('15K', (2016, 2018, 2022, 2017, 2019, 2023)),
    ('4mi', (2016, 2017, 2018, 2019, 2021, 2022, 2023)),
)

# File names follow 'Steamboat_<distance>_<year>_results.csv'.
csv_files = [
    f'Steamboat_{distance}_{year}_results.csv'
    for distance, years in race_years
    for year in years
]


In [7]:
# Load every race file, tag its rows with the distance and year taken from
# the file name, clean them, and merge everything into one table.
frames = []
for f_name in csv_files:
    race_df = pd.read_csv(os.path.join(data_dir, f_name))
    # Fields 1 and 2 of the underscore-separated file name are the
    # distance and the year; both are stored as strings.
    name_parts = f_name.split('_')
    race_df['distance'] = name_parts[1]
    race_df['year'] = name_parts[2]
    frames.append(clean_data(race_df))

# Concatenate once at the end: calling pd.concat inside the loop re-copies
# all previously accumulated rows on every iteration.
race_results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [8]:
# Persist the merged results next to the input data, once as a pickle and
# once as a CSV (the CSV is written with pandas' default options).
pickle_path = os.path.join(data_dir, 'Steamboat_clean_results.pickle')
csv_path = os.path.join(data_dir, 'Steamboat_clean_results.csv')
race_results.to_pickle(pickle_path)
race_results.to_csv(csv_path)