The entire point of this notebook is to take each gpx file in the 'data/gpx' folder, parse the interesting data from it, write that data to csv, and save the result in the 'data/csv' folder. All of this is done using the column names from Strava. If you import a gpx file from some other site, there may be issues.

In [1]:
import gpxpy # Used for reading gpx files
from geopy.distance import vincenty # Used for calculating distances between (lat, lon) pairs
import os
import pandas as pd
import numpy as np

In [2]:
# Directory locations used by this notebook, relative to the working directory.
# INDIR is where the raw gpx files are located, and OUTDIR is where the cleaned csv files should go.
# Both end with a path separator, so a file name can be appended to them directly.
INDIR = r'data/gpx/'
OUTDIR = r'data/csv/'

def parsegpx(f):
    """Parse a GPX file into a list of dictionaries.

    Each dictionary is one row of the final dataset and holds the
    'Timestamp', 'Latitude', 'Longitude' and 'Elevation' of a single
    track point.

    Args:
        f: Path of the GPX file to read.

    Returns:
        A list with one dict per track point, in the order the points are
        stored in the file (track by track, segment by segment). The list
        is empty when the file contains no track points.
    """
    # GPX is XML, so decode it as UTF-8 explicitly instead of relying on the
    # platform's default text encoding.
    with open(f, 'r', encoding='utf-8') as gpxfile:
        print(f)  # progress output: show which file is being parsed
        gpx = gpxpy.parse(gpxfile)

    rows = []
    for track in gpx.tracks:
        for segment in track.segments:
            for point in segment.points:
                # Named `row` rather than `dict` so the builtin is not shadowed.
                row = {
                    'Timestamp': point.time,
                    'Latitude': point.latitude,
                    'Longitude': point.longitude,
                    'Elevation': point.elevation,
                }
                rows.append(row)
    return rows

This function takes the DataFrame parsed from a raw Strava gpx file and computes the seconds elapsed, the distance between successive points, the elevation change, the time change, the instantaneous speed, and the gradient. It returns the cleaned df; the cell after it writes each cleaned df to a csv in the data/csv folder.

In [3]:
def clean_df(df):
    """Add elapsed-time, distance, elevation, speed and gradient columns.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with 'Timestamp', 'Latitude', 'Longitude' and
            'Elevation' columns, one row per track point in recording
            order. It must contain at least one row.

    Returns:
        The same DataFrame with 'Latitude' and 'Longitude' dropped and the
        columns 'sec_elapsed', 'lat_lon', 'dist_delta_meters',
        'elev_delta_meters', 'time_delta_sec', 'inst_speed_meters_sec' and
        'gradient' added. Every delta in the first row is 0 because the
        first point is compared with itself.

    Raises:
        IndexError: If ``df`` has the expected columns but no rows.
    """
    one_second = np.timedelta64(1, 's')

    def previous(column):
        # Value from the preceding row; the first row is paired with itself so
        # that its delta is zero. shift() keeps the frame's own index, so the
        # subtraction below lines up even when the index is not 0..n-1.
        return column.shift(1, fill_value=column.iloc[0])

    # Compute time elapsed in seconds
    timestamps = df['Timestamp']
    df['sec_elapsed'] = (timestamps - timestamps.iloc[0]) / one_second

    # Compute the distance (in meters) between successive points
    # NOTE(review): `vincenty` comes from the file-level geopy import; geopy 2.0
    # removed it in favour of `geodesic` -- confirm the installed version.
    coords = list(zip(df['Latitude'], df['Longitude']))
    df['lat_lon'] = coords
    previous_coords = coords[:1] + coords[:-1]
    df['dist_delta_meters'] = [
        vincenty(here, before).meters
        for here, before in zip(coords, previous_coords)
    ]

    # Compute the elevation difference (in meters) between successive points
    df['elev_delta_meters'] = df['Elevation'] - previous(df['Elevation'])

    # Compute the time difference (in seconds) between successive points
    df['time_delta_sec'] = (timestamps - previous(timestamps)) / one_second

    # Compute instantaneous speed (in meters per second).
    # 0/0 (the first row, or a repeated point) gives NaN, which becomes 0.0;
    # a non-zero distance over zero seconds is still left as inf.
    speed = df['dist_delta_meters'] / df['time_delta_sec']
    df['inst_speed_meters_sec'] = speed.fillna(0.0)

    # Compute gradient (unitless). Assign the filled result back instead of
    # calling fillna(inplace=True) on df['gradient'], which acts on a
    # temporary object and is not guaranteed to update the frame.
    gradient = df['elev_delta_meters'] / df['dist_delta_meters']
    df['gradient'] = gradient.fillna(0.0)

    # 'lat_lon' already carries the coordinates, so the raw columns can go.
    df.drop(['Latitude', 'Longitude'], axis=1, inplace=True)

    return df

In [4]:
# Parse each gpx file into a pandas dataframe, clean it, then save it to csv.
files = os.listdir(INDIR)

# Make sure the destination exists so to_csv does not fail on a fresh checkout.
os.makedirs(OUTDIR, exist_ok=True)

for file in files:
    if not file.endswith('.gpx'):
        continue

    points = parsegpx(os.path.join(INDIR, file))
    if not points:
        # A file without track points yields a frame with no columns, which
        # clean_df cannot process; skip it instead of aborting the whole run.
        print('Skipping ' + file + ': no track points found')
        continue

    df = pd.DataFrame(points)
    df = clean_df(df)

    # Swap only the extension: str.replace('gpx', 'csv') would also rewrite any
    # "gpx" that appears elsewhere in the file name.
    csv_name = os.path.splitext(file)[0] + '.csv'
    df.to_csv(os.path.join(OUTDIR, csv_name), index=False)

data/gpx/Orchard_Hills_with_Ling_and_Almond.gpx
