Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Scripts to collect, prepare and plot cyclists' data from Strava
- Loading branch information
Oliver Nash
committed
Feb 28, 2015
1 parent
8cb105e
commit 49fb901
Showing
3 changed files
with
360 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
""" | ||
I generated the animated gif by using ImageMagick's 'convert' utility according to the following recipe: | ||
convert -dispose previous $(for a in grade_velocity*all.png; do printf -- "-delay 50 %s " $a; done; ) grade_velocity.all.all.gif | ||
""" | ||
import cPickle, sys, os, prepare_data, gzip | ||
import matplotlib.pyplot as plt | ||
import numpy as N | ||
from operator import itemgetter as nth | ||
|
||
def nadaraya_watson(xx, xs, ys, sigma):
    """Nadaraya-Watson kernel regression estimate at the point xx.

    Uses a Gaussian kernel of bandwidth sigma over the sample points
    (xs, ys) and returns the weighted average of the ys.
    """
    weights = [N.exp(-0.5*((x - xx)/sigma)**2) for x in xs]
    numerator = sum(w*y for (w, y) in zip(weights, ys))
    denominator = sum(weights)
    return numerator/denominator
|
||
def loadMaybePickledData(segment_id):
    """Return (index, data) DataFrames for segment_id, caching via gzip pickles.

    On a cache miss the raw effort streams are loaded through
    prepare_data.loadData and the resulting frames are written to
    data.<id>.pkl.gz and index.<id>.pkl.gz; later calls just unpickle them.
    """
    pickle_fname_ext = '%d.pkl.gz' % segment_id
    if not os.path.isfile('data.' + pickle_fname_ext):
        sys.stdout.write('No pickle for %d; reading raw.\n' % segment_id)
        dataAndIndex = prepare_data.loadData(segment_id)
        index, data = dataAndIndex.asDataFrames()
        cPickle.dump(data, gzip.open('data.' + pickle_fname_ext, 'w'))
        cPickle.dump(index, gzip.open('index.' + pickle_fname_ext, 'w'))
    else:
        sys.stdout.write('Found pickle for %d; unpickling.\n' % segment_id)
        data = cPickle.load(gzip.open('data.' + pickle_fname_ext))
        index = cPickle.load(gzip.open('index.' + pickle_fname_ext))
    return (index, data)
|
||
|
||
def getNiceAxes(xlabel, ylabel, title, fig, sub_plot_args=(1, 1, 1)):
    """Add a subplot to fig with a clean, minimal look and return its axes.

    Hides the top/right spines, pushes the remaining spines outward, draws
    ticks outward on the bottom/left only, and sets title and axis labels.
    """
    axes = fig.add_subplot(*sub_plot_args)
    axes.patch.set_alpha(0)
    axes.set_title(title, fontsize=18, color='black', y=1.05)
    for side in ('top', 'right'):
        axes.spines[side].set_visible(False)
    xaxis = axes.get_xaxis()
    yaxis = axes.get_yaxis()
    xaxis.tick_bottom()
    yaxis.tick_left()
    xaxis.set_tick_params(direction='out')
    yaxis.set_tick_params(direction='out')
    for side in ('left', 'bottom'):
        axes.spines[side].set_position(('outward', 15))
    axes.set_xlabel(xlabel, fontsize=16, color='black', labelpad=20)
    axes.set_ylabel(ylabel, fontsize=16, color='black', labelpad=10)
    return axes
|
||
|
||
def makeCharts(returned_series, segment_id, color, title, pctile_lb, pctile_ub, index, data):
    """Plot a kernel-smoothed speed-vs-gradient curve for one segment,
    restricted to the riders in the [pctile_lb, pctile_ub) percentile band,
    appending the (xs, ys) series to returned_series (for the combined chart
    drawn by the caller) and saving a per-segment PNG to disk.

    Uncomment for just one iteration to generate histograms and scatter plots.
    fig = plt.figure()
    fig.set_size_inches(16.0, 6.0)
    ax = getNiceAxes('Gradient', 'Frequency (normalised)', title, fig, (1, 2, 1))
    ax.hist(data.grade_smooth,
            bins=N.arange(-5, 25, 0.3),
            linewidth=0,
            color=color,
            normed=True)
    ax = getNiceAxes('Gradient', 'Speed (kph)', title, fig, (1, 2, 2))
    ax.set_xlim([-5, 25])
    ax.set_ylim([0, 50])
    ax.scatter(data.grade_smooth,
               data.velocity_smooth * 3.6,
               s=1,
               color=color)
    plt.tight_layout()
    fig.savefig('histogram_and_scatter.%d.png' % segment_id, transparent=True, dpi=100)"""

    # Common gradient grid so every segment's curve shares the same x values.
    xs = N.arange(-5, 25, 0.2)
    # Row offsets bounding the requested percentile band of riders.  Relies on
    # Python 2 integer division, and assumes the index is sorted fastest rider
    # first (see prepare_data's sortByTotalTime) -- so 0-10% is the top decile.
    a = index.start_row[len(index) * pctile_lb / 100]
    b = index.start_row[len(index) * pctile_ub / 100]
    # velocity_smooth is m/s; * 3.6 converts to kph.  Bandwidth 1.0 is in
    # gradient (percent) units.
    ys = [nadaraya_watson(x,
                          data.grade_smooth[a:b],
                          data.velocity_smooth[a:b] * 3.6,
                          1.0) for x in xs]
    returned_series.extend([xs, ys])
    fig = plt.figure()
    fig.set_size_inches(8.0, 6.0)
    ax = getNiceAxes('Gradient', 'Speed (kph)',
                     title + ' (top %d-%d%% of riders)' % (pctile_lb, pctile_ub),
                     fig)
    ax.set_xlim([0, 20])
    ax.set_ylim([0, 30])
    ax.plot(xs, ys, color=color, linewidth=2)
    plt.tight_layout()
    fig.savefig('grade_velocity_line.%d-%d.%d.png' % (pctile_lb, pctile_ub, segment_id),
                transparent=True, dpi=100)
|
||
# One colour per Strava segment id.
color_from_segment_id = { 3538533 : '#FF6961',
                          665229 : '#C23B22',
                          4629741 : '#FFB347' }
# Load (index, data) DataFrames for every segment up front (disk-cached).
index_data_from_segment_id = { segment_id : loadMaybePickledData(segment_id) for\
                               segment_id in color_from_segment_id.keys()}

# For each decile band of riders (0-10%, 10-20%, ..., 80-90%): draw each
# segment's curve on its own chart, then one combined chart with all three.
for (pctile_lb, pctile_ub) in zip(N.arange(0, 90, 10), N.arange(10, 100, 10)):
    joint_chart_series = []
    # Sorted by segment id so the series order matches the colour cycle
    # set below (which is also sorted by segment id).
    for (segment_id, title) in sorted([(3538533, "Stocking Lane"),
                                       (665229, "Col du Tourmalet"),
                                       (4629741, "L'Alpe d'Huez")]):
        makeCharts(joint_chart_series,
                   segment_id,
                   color_from_segment_id[segment_id],
                   title,
                   pctile_lb, pctile_ub,
                   *index_data_from_segment_id[segment_id])

    fig = plt.figure()
    fig.set_size_inches(8.0, 6.0)
    ax = getNiceAxes('Gradient', 'Speed (kph)',
                     "Stocking, Tourmalet, d'Huez (top %d-%d%% of riders)" % (pctile_lb, pctile_ub),
                     fig)
    ax.set_xlim([0, 20])
    ax.set_ylim([0, 30])
    # Colour cycle in segment-id order, matching the inner loop above.
    ax.set_color_cycle(map(nth(1), sorted(color_from_segment_id.items())))
    ax.plot(*joint_chart_series, linewidth=2)
    plt.tight_layout()
    fig.savefig('grade_velocity_line.%d-%d.all.png' % (pctile_lb, pctile_ub),
                transparent=True, dpi=100)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import requests, cPickle, time, sys, os | ||
|
||
# Strava OAuth bearer token, supplied on the command line.
access_token = sys.argv[1]

extra_headers = {'Authorization' : 'Bearer %s' % access_token}

# Strava API v3 endpoint templates (filled in with ids via %).
api_base_url = 'https://www.strava.com/api/v3/'
api_segment_url = api_base_url + 'segments/%d'
api_segment_all_efforts_url = api_segment_url + '/all_efforts'
api_segment_effort_stream_url = api_base_url + 'segment_efforts/%d/streams/latlng,time,grade_smooth,velocity_smooth'

per_page = 200 # Strava max

# Pick the segment to collect by (un)commenting one of these.
#segment_id = 3538533 # "Stocking Lane Roundabout to Viewpoint"
#segment_id = 665229 # "Tourmalet - Eastern Approach"
segment_id = 4629741 # "L'Alpe d'Huez (Full)"
all_efforts_fname = 'all_efforts.%d' % segment_id

# Phase 1: gather the summary of every effort on the segment, paging through
# the all_efforts endpoint.  Cached via pickle so reruns skip the fetch.
if os.path.isfile(all_efforts_fname):
    sys.stdout.write('Unpickling all efforts from %s\n' % all_efforts_fname)
    all_efforts = cPickle.load(open(all_efforts_fname))
else:
    all_efforts = []
    r = requests.get(api_segment_url % segment_id, headers=extra_headers)
    n_efforts = r.json()['effort_count']
    sys.stdout.write('Fetching all %d effort summaries for segment %d\n' % (n_efforts, segment_id))
    # Python 2 integer division; the "2 +" rounds the page count up (and may
    # request one trailing empty page, which is harmless).
    for i in range(1, 2 + n_efforts / per_page):
        sys.stdout.write('Making summary request %d\n' % i)
        r = requests.get(api_segment_all_efforts_url % segment_id, headers=extra_headers, params={'per_page' : per_page, 'page' : i})
        if r.status_code != 200:
            # NOTE(review): failed pages are logged but not retried, so the
            # cached summary list may be incomplete.
            sys.stderr.write('Error, received code %d for summary request %d\n' % (r.status_code, i))
        else:
            all_efforts.extend(r.json())
        time.sleep(2) # Make sure do not hit Strava rate limiting
    cPickle.dump(all_efforts, open(all_efforts_fname, 'w'))

# Phase 2: fetch the per-effort GPS streams, one request per effort.
effort_ids = map(lambda d: d['id'], all_efforts)
sys.stdout.write('Fetching all %d effort streams for segment %d\n' % (len(effort_ids), segment_id))
for (i, effort_id) in enumerate(effort_ids):
    sys.stdout.write('Making stream request %d (%d of %d)\n' % (effort_id, i+1, len(effort_ids)))
    r = requests.get(api_segment_effort_stream_url % effort_id, headers=extra_headers)
    if r.status_code != 200:
        sys.stderr.write('Error, received code %d for stream request %d\n' % (r.status_code, i+1))
    else:
        cPickle.dump(r.json(), open('effort_stream.%d.%d' % (segment_id, effort_id), 'w')) # Persist to file system right away for safety.
    time.sleep(2) # Make sure do not hit Strava rate limiting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
import cPickle | ||
import numpy as N | ||
import copy | ||
from pandas import DataFrame | ||
from collections import namedtuple | ||
from operator import itemgetter as nth | ||
|
||
class StravaSegmentEffortData(object):
    """Cleaned per-row GPS data for all efforts on one Strava segment.

    Effort streams are consumed one at a time via maybeConsumeEffort, which
    rejects whole efforts that fail sanity checks (too short, inconsistent
    times/distances, implausible average speed) and censors individual noisy
    rows.  Surviving rows accumulate in self.data, with per-effort
    bookkeeping rows in self.index.
    """

    # One entry per accepted effort: rider, effort id, and the row range
    # [start_row, start_row + n_rows) its data occupies in self.data.
    IndexRow = namedtuple('IndexRow', ['athlete_id', 'effort_id', 'start_row', 'n_rows'])
    # Column descriptor; delta_from_start marks columns rebased so an
    # effort's first row reads zero (time and distance).
    ESType = namedtuple('ESType', ['name', 'delta_from_start'])

    # Canonical column order for self.data rows.
    es_types = (ESType('lat', False),
                ESType('lng', False),
                ESType('time', True),
                ESType('distance', True),
                ESType('grade_smooth', False),
                ESType('velocity_smooth', False))

    def __init__(self, segment_characteristics):
        """segment_characteristics: a SegmentCharacteristics namedtuple of
        the cleaning thresholds to apply to this segment's efforts."""
        self.segment_characteristics = segment_characteristics
        es_types_names = [es_type.name for es_type in self.es_types]
        # Cache the column positions referred to repeatedly below.
        self.time_col = es_types_names.index('time')
        self.distance_col = es_types_names.index('distance')
        self.grade_smooth_col = es_types_names.index('grade_smooth')
        self.velocity_smooth_col = es_types_names.index('velocity_smooth')
        self.index = []
        self.data = []

    @classmethod
    def genIthRow(cls, effort_streams, i):
        """Return row i of the effort as a list ordered like es_types,
        rebasing delta_from_start columns to the effort's first value."""
        res = []
        for (j, es_type) in enumerate(cls.es_types):
            offset = effort_streams[j]['data'][0] if es_type.delta_from_start else 0
            res.append(effort_streams[j]['data'][i] - offset)
        return res

    @staticmethod
    def fixColumns(effort_streams):
        """Split Strava's 2-valued 'latlng' stream into separate 'lat' and
        'lng' streams and reorder all streams to match es_types."""
        # Special case hack for Strava's annoying 2-valued 'latlng' column
        assert effort_streams[0]['type'] == 'latlng'
        def fix(i, new_type):
            l = copy.copy(effort_streams[0])
            l['type'] = new_type
            l['data'] = map(nth(i), l['data'])
            return l
        effort_streams = [fix(0, 'lat'), fix(1, 'lng')] + effort_streams[1:]
        # Fix ordering
        local_es_types = [effort_stream['type'] for effort_stream in effort_streams]
        return map(lambda es_type : effort_streams[local_es_types.index(es_type.name)], StravaSegmentEffortData.es_types)

    def isTooFewRows(self, n_rows, effort_streams):
        """True if the effort has fewer than min_effort_rows rows, or any
        stream's length disagrees with n_rows."""
        if n_rows < self.segment_characteristics.min_effort_rows:
            # Bug fix: this used to be a bare 'return' (None, falsy), so
            # too-short efforts slipped past the filter in maybeConsumeEffort.
            return True
        for stream in effort_streams:
            if len(stream['data']) != n_rows:
                return True
        return False

    def isInconsistentTimes(self, effort_summary, effort_streams):
        """True if the summary's elapsed_time disagrees with the time stream."""
        ts = effort_streams[self.time_col]['data']
        return effort_summary['elapsed_time'] != ts[-1] - ts[0]

    def isNotAlwaysMoving(self, effort_summary):
        """True if the rider was ever stationary (elapsed != moving time)."""
        return effort_summary['elapsed_time'] != effort_summary['moving_time']

    def isInconsistentDistances(self, effort_summary, effort_streams):
        """True if the stream, effort-summary and segment distances disagree
        by more than the configured relative slippage."""
        ds = effort_streams[self.distance_col]['data']
        abs_diff_p = lambda x, y : abs(x / y - 1)
        if abs_diff_p(ds[-1] - ds[0], effort_summary['distance']) > self.segment_characteristics.distance_slippage or\
           abs_diff_p(effort_summary['distance'], effort_summary['segment']['distance']) > self.segment_characteristics.distance_slippage:
            return True
        return False

    def isInadmissableAvgSpeed(self, effort_summary):
        """True if the effort's average speed (kph) falls outside the
        configured [min_avg_speed_kph, max_avg_speed_kph] band."""
        avg_speed = (effort_summary['distance'] / 1000.0) / (effort_summary['elapsed_time'] / 3600.0)
        if avg_speed < self.segment_characteristics.min_avg_speed_kph or\
           avg_speed > self.segment_characteristics.max_avg_speed_kph:
            return True
        return False

    def getRowsToCensor(self, effort_summary, effort_streams):
        """Return a boolean array (one element per row) marking noisy rows to
        drop: implausible speed or gradient, or outlier-sized jumps in
        acceleration / gradient-change.  The first row is always censored
        because the delta-based tests cannot apply to it."""
        def xs_dxs_from_col(col):
            xs = N.array(effort_streams[col]['data'])
            return (xs, xs[1:] - xs[:-1])
        ts, dts = xs_dxs_from_col(self.time_col)
        ds, dds = xs_dxs_from_col(self.distance_col)
        vs, dvs = xs_dxs_from_col(self.velocity_smooth_col)
        gs, dgs = xs_dxs_from_col(self.grade_smooth_col)
        if min(dts) < 0 or min(dds) < 0:
            return N.ones_like(ts, dtype=bool) # Censor all points if distance or time ever run backwards.
        dvdts = dvs/dts
        dgdds = dgs/dds
        # The below is a pretty bad way to try and deal with extremely-noisy data points (these are fairly
        # common in 'grade_smooth' and also quite common in the 'velocity_smooth' for the Tourmalet segment ---
        # I wonder if GPS signal is worse there?). With the below we lose two data-points for every bad outlier.
        # The proper way to do it would be to use some simple ideas from signal filtering.
        # Also note that previously I was filtering the grade_smooth using:
        # (abs(gs) > (1 + self.segment_characteristics.grade_slippage) * effort_summary['segment']['maximum_grade'])
        # but having the cutoff vary by segment made various charts look ugly so I decided to for for a static value.
        return (vs > self.segment_characteristics.max_abs_speed_kph * 1000 / 3600.0) |\
               (abs(gs) > 25.0) |\
               N.append([True], (abs(dvdts) > self.segment_characteristics.max_accel_mpsps) |\
                                (abs(dgdds) > self.segment_characteristics.max_grade_delta_pm))

    def maybeConsumeEffort(self, effort_summary, effort_streams):
        """Validate one effort and, if it passes all filters, append its
        uncensored rows to self.data and a bookkeeping row to self.index.
        Rejected efforts are silently dropped."""
        effort_streams = self.fixColumns(effort_streams)
        # Double check column ordering as critically important
        assert len(effort_streams) == len(self.es_types)
        for (i, es_type) in enumerate(self.es_types):
            assert es_type.name == effort_streams[i]['type']

        n_rows = effort_summary['end_index'] - effort_summary['start_index'] + 1

        if self.isTooFewRows(n_rows, effort_streams) or\
           self.isInconsistentTimes(effort_summary, effort_streams) or\
           self.isNotAlwaysMoving(effort_summary) or\
           self.isInconsistentDistances(effort_summary, effort_streams) or\
           self.isInadmissableAvgSpeed(effort_summary):
            return

        censor_f = self.getRowsToCensor(effort_summary, effort_streams)
        # Drop the whole effort if too large a fraction of its rows is noisy.
        if float(sum(censor_f)) / n_rows > self.segment_characteristics.max_censoring:
            return
        for (i, censor) in enumerate(censor_f):
            if not censor:
                self.data.append(self.genIthRow(effort_streams, i))
        n_rows -= sum(censor_f)
        self.index.append(self.IndexRow(effort_summary['athlete']['id'],
                                        effort_summary['id'],
                                        len(self.data) - n_rows,
                                        n_rows))

    def getIthTotalTime(self, i):
        """Total elapsed time (last minus first time value) of the i-th
        indexed effort, or 0 if i is out of range."""
        # Bug fix: the upper-bound check used to be 'i > len(self.index)',
        # so i == len(self.index) passed the guard and raised IndexError.
        if i < 0 or i >= len(self.index):
            return 0
        indexRow = self.index[i]
        r1 = indexRow.start_row
        r2 = indexRow.start_row + indexRow.n_rows
        return self.data[r2-1][self.time_col] - self.data[r1][self.time_col]

    def sortByTotalTime(self):
        """Reorder self.data / self.index so efforts appear in increasing
        order of total time (fastest rider first), rewriting start_row."""
        perm = map(nth(1), sorted([(self.getIthTotalTime(i), i) for i in range(len(self.index))]))
        new_data = []
        new_index = []
        r_tot = 0
        for i in perm:
            indexRow = self.index[i]
            r1 = indexRow.start_row
            r2 = indexRow.start_row + indexRow.n_rows
            new_data.extend(self.data[r1:r2])
            new_index.append(self.IndexRow(indexRow.athlete_id,
                                           indexRow.effort_id,
                                           r_tot,
                                           indexRow.n_rows))
            r_tot += indexRow.n_rows
        self.data = new_data
        self.index = new_index

    def asDataFrames(self):
        """Return (index, data) as pandas DataFrames with named columns."""
        return (DataFrame(N.array(self.index), columns=self.IndexRow._fields),
                DataFrame(N.array(self.data), columns=[es_type.name for es_type in self.es_types]))
|
||
# Per-segment cleaning thresholds used by StravaSegmentEffortData.
SegmentCharacteristics = namedtuple('SegmentCharacteristics', ['min_effort_rows',
                                                               'min_avg_speed_kph',
                                                               'max_avg_speed_kph',
                                                               'distance_slippage',
                                                               'grade_slippage',
                                                               'max_abs_speed_kph',
                                                               'max_accel_mpsps',
                                                               'max_grade_delta_pm',
                                                               'max_censoring'])
# All three segments currently share a single set of thresholds.
_shared_characteristics = SegmentCharacteristics(min_effort_rows=50,
                                                 min_avg_speed_kph=6.0,
                                                 max_avg_speed_kph=25.0,
                                                 distance_slippage=0.02,
                                                 grade_slippage=0.2,
                                                 max_abs_speed_kph=60.0,
                                                 max_accel_mpsps=5.0,
                                                 max_grade_delta_pm=5.0,
                                                 max_censoring=0.05)
segment_characteristics_from_id = { segment_id : _shared_characteristics
                                    for segment_id in (3538533, 4629741, 665229) }
|
||
def loadData(segment_id, max_efforts = 99999):
    """Load and clean all cached effort data for segment_id.

    Reads the pickled effort summaries and per-effort streams previously
    fetched from the Strava API, feeds them through StravaSegmentEffortData's
    filters, sorts by total effort time and returns the resulting object.

    max_efforts caps how many effort summaries are processed (useful for
    quick test runs).
    """
    path = 'effort_streams/%d/' % segment_id
    all_efforts_fname = path + 'all_efforts.%d'
    effort_stream_fname = path + 'effort_stream.%d.%d'

    effortsData = StravaSegmentEffortData(segment_characteristics_from_id[segment_id])
    # Use 'with' so pickle file handles are closed promptly instead of
    # relying on garbage collection (the original leaked open handles).
    with open(all_efforts_fname % segment_id) as f:
        all_efforts = cPickle.load(f)
    for effort_summary in all_efforts[:max_efforts]:
        with open(effort_stream_fname % (segment_id, effort_summary['id'])) as f:
            effortsData.maybeConsumeEffort(effort_summary, cPickle.load(f))
    effortsData.sortByTotalTime()
    return effortsData