In [1]:
import pandas as pd
import numpy as np
from scipy import stats

Beginning the analysis by reading in the three dataframes created previously in the EDAV notebook for each route

In [2]:
# Load the LES loop trackpoints (CSV produced by the earlier EDAV notebook) and preview the first rows.
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns in the preview look like index columns
# written out by earlier to_csv calls - confirm, and consider index_col / index=False upstream.
LESloop_df = pd.read_csv("../data/LESloop_gdf.csv")

LESloop_df.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,elevation,time,heart_rate,cadence,geometry
0,0,40.688497,-73.970898,25.5,2022-03-03 11:58:46+00:00,103,83,POINT (-73.970898 40.688497)
1,1,40.688514,-73.970898,25.4,2022-03-03 11:58:51+00:00,99,0,POINT (-73.970898 40.688514)
2,2,40.688518,-73.970909,25.4,2022-03-03 11:58:52+00:00,99,0,POINT (-73.970909 40.688518)
3,3,40.688531,-73.970922,25.4,2022-03-03 11:58:53+00:00,100,0,POINT (-73.970922 40.688531)
4,4,40.688545,-73.970934,25.4,2022-03-03 11:58:54+00:00,101,0,POINT (-73.970934 40.688545)


In [3]:
# Load the Prospect Park counter-clockwise loop trackpoints and preview the first rows.
PPccw_df = pd.read_csv("../data/PPccw_gdf.csv")

PPccw_df.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,elevation,time,heart_rate,cadence,geometry
0,0,40.688268,-73.971265,25.9,2022-02-28 11:53:49+00:00,97,83,POINT (-73.971265 40.688268)
1,1,40.688264,-73.971261,25.9,2022-02-28 11:53:50+00:00,96,83,POINT (-73.971261 40.688264)
2,2,40.688236,-73.97123,26.0,2022-02-28 11:53:51+00:00,96,0,POINT (-73.97123 40.688236)
3,3,40.688222,-73.971218,26.0,2022-02-28 11:53:52+00:00,98,0,POINT (-73.971218 40.688222)
4,4,40.688209,-73.971201,26.1,2022-02-28 11:53:53+00:00,98,0,POINT (-73.971201 40.688209)


In [4]:
# Load the Prospect Park clockwise loop trackpoints and preview the first rows.
PPcw_df = pd.read_csv("../data/PPcw_gdf.csv")

PPcw_df.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,elevation,time,heart_rate,cadence,geometry
0,0,40.688572,-73.970831,25.2,2022-02-10 12:02:06+00:00,101,82,POINT (-73.970831 40.688572)
1,1,40.688582,-73.970768,25.3,2022-02-10 12:02:07+00:00,101,82,POINT (-73.970768 40.688582)
2,2,40.688543,-73.970779,25.3,2022-02-10 12:02:08+00:00,102,0,POINT (-73.970779 40.688543)
3,3,40.688462,-73.970733,25.4,2022-02-10 12:02:09+00:00,103,0,POINT (-73.970733 40.688462)
4,4,40.688483,-73.970752,25.4,2022-02-10 12:02:10+00:00,104,0,POINT (-73.970752 40.688483)


Extracting the heart rate data from each route and storing as a numpy array.

In [5]:
# Pull the heart-rate column out of each route's frame as a NumPy array,
# in the order LES loop, Prospect Park counter-clockwise, Prospect Park clockwise.
LES_hr, PPccw_hr, PPcw_hr = (
    route_df["heart_rate"].to_numpy()
    for route_df in (LESloop_df, PPccw_df, PPcw_df)
)

Calculating the mean heart rate for each route as a starting point

In [6]:
LES_hr.mean()

156.53628808864266

In [7]:
PPccw_hr.mean()

150.5704945992041

In [8]:
PPcw_hr.mean()

154.50350058343057

Upon inspecting the means, it appears that the Prospect Park Counter-Clockwise loop is the lowest, the LES loop is the highest, and the Prospect Park Clockwise loop is in between. To make a more conclusive assessment, I'll compare different routes using a statistical test. First, I'll need to test each distribution for normality to determine which statistical test is appropriate.

In [9]:
stats.normaltest(LES_hr)

NormaltestResult(statistic=1687.053646845702, pvalue=0.0)

In [10]:
stats.normaltest(PPccw_hr)

NormaltestResult(statistic=1545.0429760179593, pvalue=0.0)

In [11]:
stats.normaltest(PPcw_hr)

NormaltestResult(statistic=1668.4523293705388, pvalue=0.0)

Based on the p-values for each route, I can safely reject the null hypothesis that the heart rates are normally distributed. Because of this, I'll use a Mann-Whitney U test to compare the difficulty of each route.

First, I want to compare the LES Loop to the PP Counter-Clockwise Loop, as those are the most common routes that I run. The null hypothesis in this test is that both routes are equally difficult, and any variation in effort between the two is due to chance. My alternative hypothesis is that the LES loop is easier.

In [12]:
stats.mannwhitneyu(LES_hr, PPccw_hr, alternative='less')

MannwhitneyuResult(statistic=9277256.0, pvalue=1.0)

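Based on the p-value of 1.0, I cannot reject the null hypothesis in favor of the alternative that the LES loop is easier. Note that a one-sided p-value this close to 1 does not show that the heart rates observed in each route came from the same distribution; it indicates that the data lean in the opposite direction (heart rates on the LES loop tend to be higher than on the PP Counter-Clockwise loop), which is consistent with the means computed above. Re-running the test with `alternative='greater'` or `alternative='two-sided'` would be needed to assess that difference.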

As a secondary test, I want to compare the two Prospect Park loops to see if there is a statistically significant difference in the effort required for each direction (counter-clockwise vs clockwise). The null hypothesis in this test is that both directions are equally difficult. My alternative hypothesis is that the loop is easier when run counter-clockwise.

In [13]:
stats.mannwhitneyu(PPccw_hr, PPcw_hr, alternative='less')

MannwhitneyuResult(statistic=4120046.0, pvalue=3.371749048162538e-116)

Based on the p-value obtained from the Mann-Whitney U test, I can safely reject the null hypothesis in favor of the alternative. Running around Prospect Park in a clockwise direction is harder than running it counter-clockwise.

## Feature Engineering

I want to create a column to summarize the effort experienced at each trackpoint. First, I will calculate the mean and standard deviation of the heart rates from the LES Loop activity. I will then create a function that will categorize each trackpoint as "easy", "medium", or "hard" based on the heart rate measurement.

In [18]:
def create_effort_zones(x, mean=None, std=None):
    """Label a heart-rate reading as "easy", "medium", or "hard".

    The bands are centred on a reference mean and extend one standard
    deviation to each side:

    * ``x < mean - std``                -> "easy"
    * ``mean - std <= x < mean + std``  -> "medium"
    * ``x >= mean + std``               -> "hard"

    Parameters
    ----------
    x : number
        Heart-rate value for a single trackpoint.
    mean, std : number, optional
        Reference mean and standard deviation. When omitted, the
        notebook-level ``hr_mean`` / ``hr_std`` variables are used, so those
        must be defined before any call that relies on the defaults.

    Returns
    -------
    str or None
        The zone label, or ``None`` when ``x`` satisfies none of the
        comparisons (for example NaN, which compares false to everything).
    """
    # Fall back to the notebook globals only when the caller gave no value,
    # so existing one-argument calls keep working unchanged.
    if mean is None:
        mean = hr_mean
    if std is None:
        std = hr_std

    lower = mean - std
    upper = mean + std

    if x < lower:
        return "easy"
    if lower <= x < upper:
        return "medium"
    if x >= upper:
        return "hard"
    # No band matched (e.g. NaN input): preserve the original implicit None.
    return None

In [19]:
# Reference mean for the effort zones: average of the LES loop heart rates.
hr_mean = LES_hr.mean()

In [20]:
# Reference spread for the effort zones. LES_hr is a NumPy array, so .std() uses
# NumPy's default ddof=0 (population standard deviation), not the sample estimate.
hr_std = LES_hr.std()

In [21]:
LESloop_df.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,elevation,time,heart_rate,cadence,geometry
0,0,40.688497,-73.970898,25.5,2022-03-03 11:58:46+00:00,103,83,POINT (-73.970898 40.688497)
1,1,40.688514,-73.970898,25.4,2022-03-03 11:58:51+00:00,99,0,POINT (-73.970898 40.688514)
2,2,40.688518,-73.970909,25.4,2022-03-03 11:58:52+00:00,99,0,POINT (-73.970909 40.688518)
3,3,40.688531,-73.970922,25.4,2022-03-03 11:58:53+00:00,100,0,POINT (-73.970922 40.688531)
4,4,40.688545,-73.970934,25.4,2022-03-03 11:58:54+00:00,101,0,POINT (-73.970934 40.688545)


In [22]:
# Tag every trackpoint with its effort label; the function is passed directly,
# which is equivalent to wrapping it in a one-argument lambda.
LESloop_df['effort_zone'] = LESloop_df['heart_rate'].apply(create_effort_zones)

In [23]:
LESloop_df

Unnamed: 0.1,Unnamed: 0,latitude,longitude,elevation,time,heart_rate,cadence,geometry,effort_zone
0,0,40.688497,-73.970898,25.5,2022-03-03 11:58:46+00:00,103,83,POINT (-73.970898 40.688497),easy
1,1,40.688514,-73.970898,25.4,2022-03-03 11:58:51+00:00,99,0,POINT (-73.970898 40.688514),easy
2,2,40.688518,-73.970909,25.4,2022-03-03 11:58:52+00:00,99,0,POINT (-73.970909 40.688518),easy
3,3,40.688531,-73.970922,25.4,2022-03-03 11:58:53+00:00,100,0,POINT (-73.970922 40.688531),easy
4,4,40.688545,-73.970934,25.4,2022-03-03 11:58:54+00:00,101,0,POINT (-73.970934 40.688545),easy
...,...,...,...,...,...,...,...,...,...
3605,3605,40.688159,-73.970896,26.2,2022-03-03 12:59:14+00:00,168,86,POINT (-73.970896 40.688159),hard
3606,3606,40.688134,-73.970880,26.3,2022-03-03 12:59:15+00:00,168,86,POINT (-73.97088 40.688134),hard
3607,3607,40.688112,-73.970860,26.3,2022-03-03 12:59:16+00:00,167,85,POINT (-73.97086 40.688112),hard
3608,3608,40.688089,-73.970846,26.4,2022-03-03 12:59:17+00:00,167,85,POINT (-73.970846 40.688089),hard


In [24]:
# Persist the LES loop frame, now including the effort_zone column.
# NOTE(review): to_csv writes the index by default, which shows up as another unnamed
# column when the file is read back; pass index=False if downstream readers do not need it.
LESloop_df.to_csv('../data/LESloop_effort_gdf.csv')

In [26]:
def create_effort_zones_int(x, mean=None, std=None):
    """Label a heart-rate reading with a numeric-looking zone code.

    Uses bands one standard deviation wide on each side of a reference mean:

    * ``x < mean - std``                -> "1"  (easy)
    * ``mean - std <= x < mean + std``  -> "2"  (medium)
    * ``x >= mean + std``               -> "3"  (hard)

    Note that, despite the ``_int`` suffix, the codes are returned as
    one-character strings; that return type is kept so existing callers see
    no change.

    Parameters
    ----------
    x : number
        Heart-rate value for a single trackpoint.
    mean, std : number, optional
        Reference mean and standard deviation. When omitted, the
        notebook-level ``hr_mean`` / ``hr_std`` variables are used, so those
        must be defined before any call that relies on the defaults.

    Returns
    -------
    str or None
        "1", "2", or "3", or ``None`` when ``x`` satisfies none of the
        comparisons (for example NaN, which compares false to everything).
    """
    # Fall back to the notebook globals only when the caller gave no value,
    # so existing one-argument calls keep working unchanged.
    if mean is None:
        mean = hr_mean
    if std is None:
        std = hr_std

    lower = mean - std
    upper = mean + std

    if x < lower:
        return "1"
    if lower <= x < upper:
        return "2"
    if x >= upper:
        return "3"
    # No band matched (e.g. NaN input): preserve the original implicit None.
    return None

In [27]:
# Tag every trackpoint with its zone code; the function is passed directly,
# which is equivalent to wrapping it in a one-argument lambda.
LESloop_df['effort_zone_int'] = LESloop_df['heart_rate'].apply(create_effort_zones_int)

In [30]:
LESloop_df.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,elevation,time,heart_rate,cadence,geometry,effort_zone,effort_zone_int
0,0,40.688497,-73.970898,25.5,2022-03-03 11:58:46+00:00,103,83,POINT (-73.970898 40.688497),easy,1
1,1,40.688514,-73.970898,25.4,2022-03-03 11:58:51+00:00,99,0,POINT (-73.970898 40.688514),easy,1
2,2,40.688518,-73.970909,25.4,2022-03-03 11:58:52+00:00,99,0,POINT (-73.970909 40.688518),easy,1
3,3,40.688531,-73.970922,25.4,2022-03-03 11:58:53+00:00,100,0,POINT (-73.970922 40.688531),easy,1
4,4,40.688545,-73.970934,25.4,2022-03-03 11:58:54+00:00,101,0,POINT (-73.970934 40.688545),easy,1


In [31]:
# Persist the LES loop frame, now carrying both effort_zone and effort_zone_int.
# NOTE(review): to_csv writes the index by default, which shows up as another unnamed
# column when the file is read back; pass index=False if downstream readers do not need it.
LESloop_df.to_csv('../data/LESloop_effort_int_gdf.csv')