# Reading Relay Data

In [None]:
import re
from dataclasses import dataclass
from typing import List
from operator import attrgetter
import numpy as np
import pandas as pd

This workshop reads the results from the *James Joyce Ramble 10K* in *Dedham MA* and converts the pace of each
runner to mph.

We want to read in lines that look like this:
     
     1601 169/171  M5059 1:17:17 1:16:51  12:22 James Katides 57 M   633 Quincy MA

Each line contains the following information:

- Place
- Div/Tot
- Div
- Guntime
- Nettime
- Pace
- Name
- Ag
- S
- Race#
- City/state

In [None]:
line = ' 1601 169/171  M5059 1:17:17 1:16:51  12:22 James Katides          57 M   633 Quincy MA'

We need a way of identifying data lines - i.e. those that start with an integer place number surrounded by whitespace

In [None]:
DATA_LINE_IDENTIFIER = re.compile(r'^\s+\d+\s+')

In [None]:
DATA_LINE_IDENTIFIER.match(line)

We don't want to match non-data lines like this:

In [None]:
header = '              27th Anniversary Edition James Joyce Ramble 10K'

In [None]:
assert DATA_LINE_IDENTIFIER.match(header) is None

We want to get the first six pieces of information

In [None]:
parts = line.split()

The second item needs to be in the form of `Div/Tot`:

In [None]:
parts[1]

In [None]:
assert '/' in parts[1]

Those that don't have this didn't finish the race, so can be discounted

In [None]:
non_finisher = '   55                  40:37   40:28   6:31 Mark Healey               M  1496 Dover MA'

In [None]:
parts = non_finisher.split()
parts[1]

In [None]:
assert '/' not in parts[1]

In [None]:
# just check the first two parts
parts = non_finisher.split(maxsplit=2)

In [None]:
parts[1]

In [None]:
parts = line.split(maxsplit=2)

In [None]:
parts[1]

In [None]:
# take a look at the first 4 items
parts = line.split(maxsplit=4)

In [None]:
# the fourth item should be a time
parts[3]

So now we have a way of identifying valid lines and skipping invalid lines

In [None]:
# Count how many lines of the results file are usable finisher lines
datafile = '../data/relay_results.txt'
num_processed = num_skipped = 0
with open(datafile, 'r') as fp:
    for line in fp:
        # a data line starts with a place number surrounded by whitespace
        is_valid = DATA_LINE_IDENTIFIER.match(line) is not None
        if is_valid:
            # split off the first four items; whatever remains is the fifth
            parts = line.split(maxsplit=4)
            is_valid = (
                len(parts) >= 5          # nothing missing
                and '/' in parts[1]      # second item is div/tot
                and ':' in parts[3]      # fourth item is a time
            )
        if is_valid:
            num_processed += 1
        else:
            num_skipped += 1

print(f'Processed: {num_processed}, Skipped: {num_skipped}')

Now we can focus on processing an individual line

In [None]:
line = ' 1601 169/171  M5059 1:17:17 1:16:51  12:22 James Katides          57 M   633 Quincy MA'

In [None]:
place, divtot, div, gun, net, pace = line.split()[0:6]

place is an integer

In [None]:
int(place)

gun, net and pace are times. We need a routine that converts strings to time instances

In [None]:
print(f'Gun: {gun}, Net: {net}, Pace: {pace}')

Let's collect the hour, minute and second components together, and add a parse routine that creates an instance from a string representation

In [None]:
@dataclass
class Time:
    '''
    A time broken down into hour, minute and second components
    '''

    hour: int
    minute: int
    second: int

    @classmethod
    def parse(cls, s: str):
        '''
        Creates an instance from a colon separated string such as
        '12:22' (minutes and seconds) or '1:17:17' (hours, minutes and seconds)
        '''
        fields = tuple(map(int, s.split(':')))
        if len(fields) != 2:
            return cls(*fields)
        # only minutes and seconds were given, so the hour is zero
        minute, second = fields
        return cls(0, minute, second)

    @property
    def seconds(self) -> int:
        '''
        The total length of this time expressed in seconds
        '''
        return self.hour * 3600 + self.minute * 60 + self.second

In [None]:
for t in ('6:18', '59:20', '1:00:04'):
    print(Time.parse(t))

We also need to check for any time fields with invalid characters such as `59:20*`

In [None]:
t = '59:20*'
# replace any characters that are not digits or colons with the empty string
re.sub(r'[^\d:]', '', t)

In [None]:
def clean_time_str(s: str) -> str:
    '''
    Strips every character that is not a digit or a colon from a time string
    '''
    not_digit_or_colon = r'[^\d:]'
    cleaned = re.sub(pattern=not_digit_or_colon, repl='', string=s)
    return cleaned

In [None]:
for t in ('6:18', '59:20', '1:00:04', '56:15*'):
    print(Time.parse(clean_time_str(t)))

For now we will just extract the place and times. Create a dataclass to contain the data extracted for each line

In [None]:
@dataclass
class Record:
    '''
    The place and the three times extracted from a single results line
    '''

    # finishing place
    place: int
    # the Guntime column
    gun: Time
    # the Nettime column
    net: Time
    # the Pace column
    pace: Time

Now we can define a `read_record` function that extracts the race info and returns it as a `Record`

In [None]:
def read_record(line: str) -> Record:
    '''
    Extracts the place, gun time, net time and pace from a results line
    and returns them as a Record
    '''
    # only the first six columns are of interest; the two division columns are not used
    place, _divtot, _div, gun, net, pace = line.split()[:6]
    position = int(place)
    # each time column is cleaned of stray characters before being parsed
    gun_time, net_time, pace_time = (
        Time.parse(clean_time_str(text)) for text in (gun, net, pace)
    )
    return Record(place=position, gun=gun_time, net=net_time, pace=pace_time)

In [None]:
read_record(line)

Finally we can parse the data file as a list of Records:

In [None]:
def read_records(filename: str) -> List[Record]:
    '''
    Reads every valid data line of the given file into a Record
    and returns the records ordered by place
    '''
    found = []
    with open(filename, 'r') as fp:
        for raw in fp:
            # ignore anything that does not start with a place number
            if DATA_LINE_IDENTIFIER.match(raw) is None:
                continue
            # the first four items, with the rest of the line as a fifth
            head = raw.split(maxsplit=4)
            usable = (
                len(head) >= 5        # nothing missing
                and '/' in head[1]    # second item is div/tot
                and ':' in head[3]    # fourth item is a time
            )
            if usable:
                found.append(read_record(raw))
    # order by place
    found.sort(key=attrgetter('place'))
    return found

In [None]:
records = read_records(datafile)

In [None]:
# first and last
print(records[0])
print(records[-1])

Finally we need to convert the pace (average time to run each mile) into a speed in mph. We can enhance our `Time` class to return the total in terms of seconds

In [None]:
def to_mph(t: Time) -> float:
    '''
    Converts a pace (the time taken to cover one mile) to a speed in miles per hour
    '''
    # an hour has 3600 seconds, so the number of miles covered in an hour
    # is 3600 divided by the number of seconds needed for one mile
    seconds_per_hour = 3600
    seconds_per_mile = t.seconds
    return seconds_per_hour / seconds_per_mile

In [None]:
records = read_records(datafile)

In [None]:
print(f'First: {to_mph(records[0].pace):0.2f} mph, Last: {to_mph(records[-1].pace):0.2f} mph')

So we have speeds between 3 and 12 mph - we can bin these into 100 categories

In [None]:
# collect speeds
speeds = np.array([to_mph(record.pace) for record in records])
# fixed upper and lower bounds for binning; the speeds fall between 3 and 12 mph
hi, low = 12, 3
# and the range
dt = hi - low
print(f'Hi: {hi}, Low: {low}, Range: {dt}')

In [None]:
# the number of bins
n = 100

In [None]:
# express speeds as a fraction of the range dt, and then scale to a value between 0 and the number of bins
binvals = ((speeds - low) / dt) * n
binvals

In [None]:
# round to the nearest bin and put new values back in terms of their original speed
binvals = np.round(binvals) * dt / n + low

In [None]:
pd.Series(binvals).value_counts().sort_values()

Here is a better way of binning the data

In [None]:
def bin_float(value: float, binsize=0.5) -> float:
    '''
    Rounds a value to the nearest multiple of binsize within its unit interval

    The integer part of the value is kept and the fractional part is replaced
    by the closest of 0, binsize, 2*binsize, ... When the fractional part is
    equally close to two candidates the lower one is used.
    '''
    floor = np.floor(value)
    # the fractional part that needs to be rounded
    frac = value - floor
    # want to round to one of these discrete values
    vals = np.arange(0, 1+binsize, binsize)
    # how much does the fractional part differ from each of our rounded values
    diffs = np.abs(vals - frac)
    # we want the closest (argmin returns the first of any equally close values)
    return floor + vals[np.argmin(diffs)]

And save the results for later analysis

In [None]:
# one row per record: the place, the exact speed and the speed binned to 0.25 mph
places = np.array([r.place for r in records], dtype=np.uint32)
binned = [bin_float(speed, 0.25) for speed in speeds]
df = pd.DataFrame({
    'place': places,
    'mph': speeds,
    'speed': binned,
})
df.head(n=10)

In [None]:
df.to_feather('../data/relay_results.feather')