Scripts to collect, prepare and plot cyclists' data from Strava
Oliver Nash committed Feb 28, 2015
1 parent 8cb105e commit 49fb901
Showing 3 changed files with 360 additions and 0 deletions.
120 changes: 120 additions & 0 deletions charts.py
@@ -0,0 +1,120 @@
"""
I generated the animated gif by using ImageMagick's 'convert' utility according to the following recipe:
convert -dispose previous $(for a in grade_velocity*all.png; do printf -- "-delay 50 %s " $a; done; ) grade_velocity.all.all.gif
"""
import cPickle, sys, os, prepare_data, gzip
import matplotlib.pyplot as plt
import numpy as N
from operator import itemgetter as nth

def nadaraya_watson(xx, xs, ys, sigma):
    """Gaussian-kernel Nadaraya-Watson estimate of E[y | x = xx] from samples (xs, ys)."""
    K = lambda t: N.exp(-0.5*(t/sigma)**2)
    p = q = 0.0
    for (x, y) in zip(xs, ys):
        w = K(x - xx)
        p += w*y  # Weighted sum of responses.
        q += w    # Sum of weights (normalising constant).
    return p/q
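# For example, nadaraya_watson(5.0, data.grade_smooth, data.velocity_smooth * 3.6, 1.0)
# estimates the typical speed (in kph) at a 5% gradient; this is how it is used in makeCharts below.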

def loadMaybePickledData(segment_id):
    """Return (index, data) DataFrames for a segment, caching them as gzipped pickles."""
    pickle_fname_ext = '%d.pkl.gz' % segment_id
    if os.path.isfile('data.' + pickle_fname_ext):
        sys.stdout.write('Found pickle for %d; unpickling.\n' % segment_id)
        data = cPickle.load(gzip.open('data.' + pickle_fname_ext))
        index = cPickle.load(gzip.open('index.' + pickle_fname_ext))
    else:
        sys.stdout.write('No pickle for %d; reading raw.\n' % segment_id)
        dataAndIndex = prepare_data.loadData(segment_id)
        index, data = dataAndIndex.asDataFrames()
        cPickle.dump(data, gzip.open('data.' + pickle_fname_ext, 'w'))
        cPickle.dump(index, gzip.open('index.' + pickle_fname_ext, 'w'))
    return (index, data)


def getNiceAxes(xlabel, ylabel, title, fig, sub_plot_args=(1, 1, 1)):
    """Add a subplot with a transparent background, no top/right spines and outward ticks."""
    ax = fig.add_subplot(*sub_plot_args)
    ax.patch.set_alpha(0)
    ax.set_title(title, fontsize=18, color='black', y=1.05)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.get_xaxis().set_tick_params(direction='out')
    ax.get_yaxis().set_tick_params(direction='out')
    ax.spines['left'].set_position(('outward', 15))
    ax.spines['bottom'].set_position(('outward', 15))
    ax.set_xlabel(xlabel, fontsize=16, color='black', labelpad=20)
    ax.set_ylabel(ylabel, fontsize=16, color='black', labelpad=10)
    return ax
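# For example, getNiceAxes('Gradient', 'Speed (kph)', title, fig, (1, 2, 2)) gives the
# right-hand panel of a 1x2 layout, as in the commented-out histogram/scatter code below.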


def makeCharts(returned_series, segment_id, color, title, pctile_lb, pctile_ub, index, data):
    """Uncomment for just one iteration to generate histograms and scatter plots.
    fig = plt.figure()
    fig.set_size_inches(16.0, 6.0)
    ax = getNiceAxes('Gradient', 'Frequency (normalised)', title, fig, (1, 2, 1))
    ax.hist(data.grade_smooth,
            bins=N.arange(-5, 25, 0.3),
            linewidth=0,
            color=color,
            normed=True)
    ax = getNiceAxes('Gradient', 'Speed (kph)', title, fig, (1, 2, 2))
    ax.set_xlim([-5, 25])
    ax.set_ylim([0, 50])
    ax.scatter(data.grade_smooth,
               data.velocity_smooth * 3.6,
               s=1,
               color=color)
    plt.tight_layout()
    fig.savefig('histogram_and_scatter.%d.png' % segment_id, transparent=True, dpi=100)"""

    # Smooth speed as a function of gradient for riders in the [pctile_lb, pctile_ub) band.
    xs = N.arange(-5, 25, 0.2)
    a = index.start_row[len(index) * pctile_lb / 100]
    b = index.start_row[len(index) * pctile_ub / 100]
    ys = [nadaraya_watson(x,
                          data.grade_smooth[a:b],
                          data.velocity_smooth[a:b] * 3.6,
                          1.0) for x in xs]
    returned_series.extend([xs, ys])
    fig = plt.figure()
    fig.set_size_inches(8.0, 6.0)
    ax = getNiceAxes('Gradient', 'Speed (kph)',
                     title + ' (top %d-%d%% of riders)' % (pctile_lb, pctile_ub),
                     fig)
    ax.set_xlim([0, 20])
    ax.set_ylim([0, 30])
    ax.plot(xs, ys, color=color, linewidth=2)
    plt.tight_layout()
    fig.savefig('grade_velocity_line.%d-%d.%d.png' % (pctile_lb, pctile_ub, segment_id),
                transparent=True, dpi=100)

color_from_segment_id = {3538533 : '#FF6961',
                         665229  : '#C23B22',
                         4629741 : '#FFB347'}
index_data_from_segment_id = {segment_id : loadMaybePickledData(segment_id)
                              for segment_id in color_from_segment_id.keys()}

for (pctile_lb, pctile_ub) in zip(N.arange(0, 90, 10), N.arange(10, 100, 10)):
    joint_chart_series = []
    for (segment_id, title) in sorted([(3538533, "Stocking Lane"),
                                       (665229, "Col du Tourmalet"),
                                       (4629741, "L'Alpe d'Huez")]):
        makeCharts(joint_chart_series,
                   segment_id,
                   color_from_segment_id[segment_id],
                   title,
                   pctile_lb, pctile_ub,
                   *index_data_from_segment_id[segment_id])

    fig = plt.figure()
    fig.set_size_inches(8.0, 6.0)
    ax = getNiceAxes('Gradient', 'Speed (kph)',
                     "Stocking, Tourmalet, d'Huez (top %d-%d%% of riders)" % (pctile_lb, pctile_ub),
                     fig)
    ax.set_xlim([0, 20])
    ax.set_ylim([0, 30])
    ax.set_color_cycle(map(nth(1), sorted(color_from_segment_id.items())))
    ax.plot(*joint_chart_series, linewidth=2)
    plt.tight_layout()
    fig.savefig('grade_velocity_line.%d-%d.all.png' % (pctile_lb, pctile_ub),
                transparent=True, dpi=100)
46 changes: 46 additions & 0 deletions fetch.py
@@ -0,0 +1,46 @@
import requests, cPickle, time, sys, os

access_token = sys.argv[1]

extra_headers = {'Authorization' : 'Bearer %s' % access_token}

api_base_url = 'https://www.strava.com/api/v3/'
api_segment_url = api_base_url + 'segments/%d'
api_segment_all_efforts_url = api_segment_url + '/all_efforts'
api_segment_effort_stream_url = api_base_url + 'segment_efforts/%d/streams/latlng,time,grade_smooth,velocity_smooth'
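# Each streams request returns a list of JSON objects, one per requested stream type,
# each with 'type' and 'data' keys; prepare_data.py relies on this shape.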

per_page = 200 # Strava max

#segment_id = 3538533 # "Stocking Lane Roundabout to Viewpoint"
#segment_id = 665229 # "Tourmalet - Eastern Approach"
segment_id = 4629741 # "L'Alpe d'Huez (Full)"
all_efforts_fname = 'all_efforts.%d' % segment_id

if os.path.isfile(all_efforts_fname):
    sys.stdout.write('Unpickling all efforts from %s\n' % all_efforts_fname)
    all_efforts = cPickle.load(open(all_efforts_fname))
else:
    all_efforts = []
    r = requests.get(api_segment_url % segment_id, headers=extra_headers)
    n_efforts = r.json()['effort_count']
    sys.stdout.write('Fetching all %d effort summaries for segment %d\n' % (n_efforts, segment_id))
    for i in range(1, 2 + n_efforts / per_page):
        sys.stdout.write('Making summary request %d\n' % i)
        r = requests.get(api_segment_all_efforts_url % segment_id, headers=extra_headers, params={'per_page' : per_page, 'page' : i})
        if r.status_code != 200:
            sys.stderr.write('Error, received code %d for summary request %d\n' % (r.status_code, i))
        else:
            all_efforts.extend(r.json())
        time.sleep(2)  # Make sure we do not hit Strava's rate limiting.
    cPickle.dump(all_efforts, open(all_efforts_fname, 'w'))

effort_ids = map(lambda d: d['id'], all_efforts)
sys.stdout.write('Fetching all %d effort streams for segment %d\n' % (len(effort_ids), segment_id))
for (i, effort_id) in enumerate(effort_ids):
    sys.stdout.write('Making stream request %d (%d of %d)\n' % (effort_id, i+1, len(effort_ids)))
    r = requests.get(api_segment_effort_stream_url % effort_id, headers=extra_headers)
    if r.status_code != 200:
        sys.stderr.write('Error, received code %d for stream request %d\n' % (r.status_code, i+1))
    else:
        cPickle.dump(r.json(), open('effort_stream.%d.%d' % (segment_id, effort_id), 'w'))  # Persist to the file system right away for safety.
    time.sleep(2)  # Make sure we do not hit Strava's rate limiting.
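# Note: prepare_data.loadData expects these pickles under 'effort_streams/<segment_id>/',
# so the files written above (in the working directory) need to be moved there.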
194 changes: 194 additions & 0 deletions prepare_data.py
@@ -0,0 +1,194 @@
import cPickle
import numpy as N
import copy
from pandas import DataFrame
from collections import namedtuple
from operator import itemgetter as nth

class StravaSegmentEffortData(object):
    """Validated, censored effort streams for a single segment, plus an index of efforts."""
    IndexRow = namedtuple('IndexRow', ['athlete_id', 'effort_id', 'start_row', 'n_rows'])
    ESType = namedtuple('ESType', ['name', 'delta_from_start'])

    es_types = (ESType('lat', False),
                ESType('lng', False),
                ESType('time', True),
                ESType('distance', True),
                ESType('grade_smooth', False),
                ESType('velocity_smooth', False))
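    # delta_from_start=True means the column is rebased so each effort starts at zero
    # (the raw 'time' and 'distance' streams are cumulative); see genIthRow below.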

    def __init__(self, segment_characteristics):
        self.segment_characteristics = segment_characteristics
        es_types_names = [es_type.name for es_type in self.es_types]
        self.time_col = es_types_names.index('time')
        self.distance_col = es_types_names.index('distance')
        self.grade_smooth_col = es_types_names.index('grade_smooth')
        self.velocity_smooth_col = es_types_names.index('velocity_smooth')
        self.index = []
        self.data = []

    @classmethod
    def genIthRow(cls, effort_streams, i):
        res = []
        for (j, es_type) in enumerate(cls.es_types):
            offset = effort_streams[j]['data'][0] if es_type.delta_from_start else 0
            res.append(effort_streams[j]['data'][i] - offset)
        return res

    @staticmethod
    def fixColumns(effort_streams):
        # Special-case hack for Strava's annoying 2-valued 'latlng' column.
        assert effort_streams[0]['type'] == 'latlng'
        def fix(i, new_type):
            l = copy.copy(effort_streams[0])
            l['type'] = new_type
            l['data'] = map(nth(i), l['data'])
            return l
        effort_streams = [fix(0, 'lat'), fix(1, 'lng')] + effort_streams[1:]
        # Fix ordering.
        local_es_types = [effort_stream['type'] for effort_stream in effort_streams]
        return map(lambda es_type : effort_streams[local_es_types.index(es_type.name)], StravaSegmentEffortData.es_types)

    def isTooFewRows(self, n_rows, effort_streams):
        if n_rows < self.segment_characteristics.min_effort_rows:
            return True
        for stream in effort_streams:
            if len(stream['data']) != n_rows:
                return True
        return False

    def isInconsistentTimes(self, effort_summary, effort_streams):
        ts = effort_streams[self.time_col]['data']
        return effort_summary['elapsed_time'] != ts[-1] - ts[0]

    def isNotAlwaysMoving(self, effort_summary):
        # Drop efforts where the rider stopped (elapsed time exceeds moving time).
        return effort_summary['elapsed_time'] != effort_summary['moving_time']

    def isInconsistentDistances(self, effort_summary, effort_streams):
        ds = effort_streams[self.distance_col]['data']
        abs_diff_p = lambda x, y : abs(x / y - 1)
        if abs_diff_p(ds[-1] - ds[0], effort_summary['distance']) > self.segment_characteristics.distance_slippage or\
           abs_diff_p(effort_summary['distance'], effort_summary['segment']['distance']) > self.segment_characteristics.distance_slippage:
            return True
        return False

    def isInadmissableAvgSpeed(self, effort_summary):
        avg_speed = (effort_summary['distance'] / 1000.0) / (effort_summary['elapsed_time'] / 3600.0)
        if avg_speed < self.segment_characteristics.min_avg_speed_kph or\
           avg_speed > self.segment_characteristics.max_avg_speed_kph:
            return True
        return False

    def getRowsToCensor(self, effort_summary, effort_streams):
        def xs_dxs_from_col(col):
            xs = N.array(effort_streams[col]['data'])
            return (xs, xs[1:] - xs[:-1])
        ts, dts = xs_dxs_from_col(self.time_col)
        ds, dds = xs_dxs_from_col(self.distance_col)
        vs, dvs = xs_dxs_from_col(self.velocity_smooth_col)
        gs, dgs = xs_dxs_from_col(self.grade_smooth_col)
        if min(dts) < 0 or min(dds) < 0:
            return N.ones_like(ts, dtype=bool)  # Censor all points if distance or time ever runs backwards.
        dvdts = dvs/dts
        dgdds = dgs/dds
        # The below is a pretty bad way to try to deal with extremely noisy data points (these are fairly
        # common in 'grade_smooth' and also quite common in 'velocity_smooth' for the Tourmalet segment ---
        # I wonder if the GPS signal is worse there?). With the below we lose two data points for every bad outlier.
        # The proper way to do it would be to use some simple ideas from signal filtering.
        # Also note that previously I was filtering grade_smooth using:
        # (abs(gs) > (1 + self.segment_characteristics.grade_slippage) * effort_summary['segment']['maximum_grade'])
        # but having the cutoff vary by segment made various charts look ugly so I decided to go for a static value.
        return (vs > self.segment_characteristics.max_abs_speed_kph * 1000 / 3600.0) |\
               (abs(gs) > 25.0) |\
               N.append([True], (abs(dvdts) > self.segment_characteristics.max_accel_mpsps) |\
                                (abs(dgdds) > self.segment_characteristics.max_grade_delta_pm))
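
    @staticmethod
    def despikeSketch(xs, half_window=2):
        # A sketch of the 'simple ideas from signal filtering' alluded to above (not used by
        # the pipeline; the name and window size are illustrative): replacing each point with
        # a sliding-window median suppresses isolated spikes without discarding two data
        # points per outlier as getRowsToCensor does.
        xs = N.array(xs, dtype=float)
        return N.array([N.median(xs[max(0, i - half_window):i + half_window + 1])
                        for i in range(len(xs))])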

    def maybeConsumeEffort(self, effort_summary, effort_streams):
        effort_streams = self.fixColumns(effort_streams)
        # Double-check column ordering as it is critically important.
        assert len(effort_streams) == len(self.es_types)
        for (i, es_type) in enumerate(self.es_types):
            assert es_type.name == effort_streams[i]['type']

        n_rows = effort_summary['end_index'] - effort_summary['start_index'] + 1

        if self.isTooFewRows(n_rows, effort_streams) or\
           self.isInconsistentTimes(effort_summary, effort_streams) or\
           self.isNotAlwaysMoving(effort_summary) or\
           self.isInconsistentDistances(effort_summary, effort_streams) or\
           self.isInadmissableAvgSpeed(effort_summary):
            return

        censor_f = self.getRowsToCensor(effort_summary, effort_streams)
        if float(sum(censor_f)) / n_rows > self.segment_characteristics.max_censoring:
            return
        for (i, censor) in enumerate(censor_f):
            if not censor:
                self.data.append(self.genIthRow(effort_streams, i))
        n_rows -= sum(censor_f)
        self.index.append(self.IndexRow(effort_summary['athlete']['id'],
                                        effort_summary['id'],
                                        len(self.data) - n_rows,
                                        n_rows))
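    # In summary: an effort is either consumed whole (minus censored rows) or rejected
    # outright; 'index' records where each surviving effort's rows live in 'data'.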

    def getIthTotalTime(self, i):
        if i < 0 or i >= len(self.index):
            return 0
        indexRow = self.index[i]
        r1 = indexRow.start_row
        r2 = indexRow.start_row + indexRow.n_rows
        return self.data[r2-1][self.time_col] - self.data[r1][self.time_col]

    def sortByTotalTime(self):
        perm = map(nth(1), sorted([(self.getIthTotalTime(i), i) for i in range(len(self.index))]))
        new_data = []
        new_index = []
        r_tot = 0
        for i in perm:
            indexRow = self.index[i]
            r1 = indexRow.start_row
            r2 = indexRow.start_row + indexRow.n_rows
            new_data.extend(self.data[r1:r2])
            new_index.append(self.IndexRow(indexRow.athlete_id,
                                           indexRow.effort_id,
                                           r_tot,
                                           indexRow.n_rows))
            r_tot += indexRow.n_rows
        self.data = new_data
        self.index = new_index

    def asDataFrames(self):
        return (DataFrame(N.array(self.index), columns=self.IndexRow._fields),
                DataFrame(N.array(self.data), columns=[es_type.name for es_type in self.es_types]))

SegmentCharacteristics = namedtuple('SegmentCharacteristics', ['min_effort_rows',
                                                               'min_avg_speed_kph',
                                                               'max_avg_speed_kph',
                                                               'distance_slippage',
                                                               'grade_slippage',
                                                               'max_abs_speed_kph',
                                                               'max_accel_mpsps',
                                                               'max_grade_delta_pm',
                                                               'max_censoring'])
segment_characteristics_from_id = {3538533 : SegmentCharacteristics(min_effort_rows=50,
                                                                    min_avg_speed_kph=6.0,
                                                                    max_avg_speed_kph=25.0,
                                                                    distance_slippage=0.02,
                                                                    grade_slippage=0.2,
                                                                    max_abs_speed_kph=60.0,
                                                                    max_accel_mpsps=5.0,
                                                                    max_grade_delta_pm=5.0,
                                                                    max_censoring=0.05)}
segment_characteristics_from_id[4629741] = segment_characteristics_from_id[3538533]
segment_characteristics_from_id[665229] = segment_characteristics_from_id[3538533]

def loadData(segment_id, max_efforts=99999):
    path = 'effort_streams/%d/' % segment_id
    all_efforts_fname = path + 'all_efforts.%d'
    effort_stream_fname = path + 'effort_stream.%d.%d'

    effortsData = StravaSegmentEffortData(segment_characteristics_from_id[segment_id])
    for effort_summary in cPickle.load(open(all_efforts_fname % segment_id))[:max_efforts]:
        effortsData.maybeConsumeEffort(effort_summary,
                                       cPickle.load(open(effort_stream_fname % (segment_id, effort_summary['id']))))
    effortsData.sortByTotalTime()
    return effortsData
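
# Typical use (as in charts.py): index, data = loadData(3538533).asDataFrames(),
# where the rows of 'index' are sorted fastest-first so that percentile bands of
# riders can be sliced directly via 'start_row'.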
