## Load Libraries

In [1]:
import pandas as pd
import numpy as np

## Load total CSV after downloading from STRAVA API

In [11]:
# Load it into a Dataframe using pandas
path = 'python_data/strava_angelos_20170812122309.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,act_startDate,timestamp,time,act_id,act_name,distance,altitude,velocity_smooth,moving,grade_smooth,heartrate,lat,long
0,2017-07-05 13:35:37+00:00,2017-07-05 13:35:37+00:00,0,1068864142,Afternoon Ride,0.0,24.3,0.0,False,-1.4,85,59.442823,24.623962
1,2017-07-05 13:35:37+00:00,2017-07-05 13:35:38+00:00,1,1068864142,Afternoon Ride,0.0,24.3,0.0,False,-1.9,85,59.442821,24.623959
2,2017-07-05 13:35:37+00:00,2017-07-05 13:35:46+00:00,9,1068864142,Afternoon Ride,20.8,24.0,2.3,True,-2.2,85,59.442884,24.623628
3,2017-07-05 13:35:37+00:00,2017-07-05 13:35:47+00:00,10,1068864142,Afternoon Ride,20.8,23.9,2.3,False,-2.3,83,59.442828,24.623551
4,2017-07-05 13:35:37+00:00,2017-07-05 13:35:50+00:00,13,1068864142,Afternoon Ride,27.5,23.7,2.3,True,-3.1,80,59.44292,24.623511


In [4]:
# Comparing with test data to spot any difference
path_test = 'python_data/strava_angelos_test.csv'
df_test = pd.read_csv(path_test)
df_test.head()

Unnamed: 0,act_startDate,timestamp,act_id,act_name,altitude,distance,grade_smooth,heartrate,moving,time,velocity_smooth,lat,long
0,2017-03-27 06:52:47+00:00,2017-03-27 06:52:47+00:00,916298883,Morning Ride,8.0,0.0,3.2,,False,0,0.0,55.674874,12.592714
1,2017-03-27 06:52:47+00:00,2017-03-27 06:52:57+00:00,916298883,Morning Ride,8.0,4.4,5.1,,True,10,0.4,55.67491,12.592685
2,2017-03-27 06:52:47+00:00,2017-03-27 06:52:59+00:00,916298883,Morning Ride,8.3,9.3,5.4,,True,12,0.8,55.674938,12.592625
3,2017-03-27 06:52:47+00:00,2017-03-27 06:53:01+00:00,916298883,Morning Ride,8.8,15.6,6.3,,True,14,1.1,55.674979,12.592555
4,2017-03-27 06:52:47+00:00,2017-03-27 06:53:02+00:00,916298883,Morning Ride,9.0,18.6,6.0,,True,15,2.8,55.674997,12.592519


## Useful Functions

In [5]:
# Trim data per specific time threshold
def trimmer(time_df, thresh):
    cur = None
    for i, item in time_df.iteritems():
        if (cur is None) or (item - cur >= thresh):
            yield i
            cur = item

In [6]:
# Create a string format for the Day column
def dayConverter(s):
    # Set date formats
    time_format = "%Y-%m-%d"

    # Convert from period to string
    converted = s.strftime(time_format)
    
    return converted

In [7]:
# Add distance to all activities
def distanceEnhancer(added,x):
    return x + added
    

In [8]:
# Transforms seconds to hours
def secToHours(d):
    # Create the rule
    seconds = d
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    
    #Convert all to string
    seconds = str(seconds)
    minutes = str(minutes)
    hours = str(hours)
    
    #Add 0(zeros) for single digit numbers
    if len(seconds)==1:
        seconds = '0'+seconds
    if len(minutes)==1:
        minutes = '0'+minutes
    if len(hours)==1:
        hours = '0'+hours
    return hours+':'+minutes+':'+seconds

## Modify the data

In [12]:
# Convert to act_StartDate to datetime
df['act_startDate'] = pd.to_datetime(df['act_startDate'])
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Keep only date as a period in a seperate column
df['day'] = pd.DatetimeIndex(df['act_startDate']).to_period('D')

# Convert days to strings
df['day'] = df['day'].apply(lambda x: dayConverter(x))

# Sort df by ascending day and timestamp
df = df.sort_values(by=['day','timestamp'],ascending=[True,True])

# Check main df
df.head()

Unnamed: 0,act_startDate,timestamp,time,act_id,act_name,distance,altitude,velocity_smooth,moving,grade_smooth,heartrate,lat,long,day
0,2017-07-05 13:35:37,2017-07-05 13:35:37,0,1068864142,Afternoon Ride,0.0,24.3,0.0,False,-1.4,85,59.442823,24.623962,2017-07-05
1,2017-07-05 13:35:37,2017-07-05 13:35:38,1,1068864142,Afternoon Ride,0.0,24.3,0.0,False,-1.9,85,59.442821,24.623959,2017-07-05
2,2017-07-05 13:35:37,2017-07-05 13:35:46,9,1068864142,Afternoon Ride,20.8,24.0,2.3,True,-2.2,85,59.442884,24.623628,2017-07-05
3,2017-07-05 13:35:37,2017-07-05 13:35:47,10,1068864142,Afternoon Ride,20.8,23.9,2.3,False,-2.3,83,59.442828,24.623551,2017-07-05
4,2017-07-05 13:35:37,2017-07-05 13:35:50,13,1068864142,Afternoon Ride,27.5,23.7,2.3,True,-3.1,80,59.44292,24.623511,2017-07-05


## Create a day number flag for keeping track of the trip days    
## Create an iteration number flag for keeping track of activities per day

In [13]:
# Create helper dataframe with unique days
df_helper = df.groupby(by=['day','act_id']).count().iloc[:,0].reset_index().filter(items=['day','act_id'])

# Create columns for iter_no(per activity) and day_no(per day)
days = list(set(df_helper['day']))
days.sort()
day_no = list()
iter_no = list()
for index,day in enumerate(days):
    counter=1
    for dfday in df_helper['day']:
        if dfday == day:
            iter_no.append(counter)
            day_no.append(index+1)
            counter+=1

df_helper['day_no'] = pd.Series(day_no).values
df_helper['iter_no'] = pd.Series(iter_no).values      

df_helper

Unnamed: 0,day,act_id,day_no,iter_no
0,2017-07-05,1068864142,1,1
1,2017-07-05,1069096779,1,2
2,2017-07-06,1070399442,2,1
3,2017-07-06,1070439262,2,2
4,2017-07-06,1071012380,2,3
5,2017-07-07,1071696894,3,1
6,2017-07-07,1071957381,3,2
7,2017-07-07,1072056824,3,3
8,2017-07-07,1072169052,3,4
9,2017-07-07,1072858110,3,5


In [14]:
# Merge flagger with the main dataframe
df = pd.merge(df,df_helper,on=['day','act_id'])

# Check main df
df.head()

Unnamed: 0,act_startDate,timestamp,time,act_id,act_name,distance,altitude,velocity_smooth,moving,grade_smooth,heartrate,lat,long,day,day_no,iter_no
0,2017-07-05 13:35:37,2017-07-05 13:35:37,0,1068864142,Afternoon Ride,0.0,24.3,0.0,False,-1.4,85,59.442823,24.623962,2017-07-05,1,1
1,2017-07-05 13:35:37,2017-07-05 13:35:38,1,1068864142,Afternoon Ride,0.0,24.3,0.0,False,-1.9,85,59.442821,24.623959,2017-07-05,1,1
2,2017-07-05 13:35:37,2017-07-05 13:35:46,9,1068864142,Afternoon Ride,20.8,24.0,2.3,True,-2.2,85,59.442884,24.623628,2017-07-05,1,1
3,2017-07-05 13:35:37,2017-07-05 13:35:47,10,1068864142,Afternoon Ride,20.8,23.9,2.3,False,-2.3,83,59.442828,24.623551,2017-07-05,1,1
4,2017-07-05 13:35:37,2017-07-05 13:35:50,13,1068864142,Afternoon Ride,27.5,23.7,2.3,True,-3.1,80,59.44292,24.623511,2017-07-05,1,1


## Create cumulative distance/time when changing activity

Keep the last row of each iteration of the same day and add its number to all the rest of the next day and do the same for the next activity

In [15]:
# Transform distance to cumulative distance when changing activity (per day)
for day in days:
    act_day = list(df_helper[df_helper['day']==day]['act_id'])
    if len(act_day)>1:
        last_dist = []
        last_time = []
        for i,act in enumerate(act_day):
            adding_dist = float(df[(df['day']==day) & (df['act_id']==act)]['distance'].iloc[-1:]) # +1
            adding_time = int(df[(df['day']==day) & (df['act_id']==act)]['time'].iloc[-1:]) # +1 if problem with division
            if i == 0:
                last_dist.append(adding_dist)
                last_time.append(adding_time)
            else:
                adding_dist_plus = adding_dist+last_dist[i-1]
                adding_time_plus = adding_time+last_time[i-1]
                last_dist.append(adding_dist_plus)
                last_time.append(adding_time_plus)
                df.loc[(df['day']==day) & (df['act_id']==act),'distance'] = df[(df['day']==day) & (df['act_id']==act)]['distance'].apply(
                    lambda x: distanceEnhancer(last_dist[i-1],x))
                df.loc[(df['day']==day) & (df['act_id']==act),'time'] = df[(df['day']==day) & (df['act_id']==act)]['time'].apply(
                    lambda x: distanceEnhancer(last_time[i-1],x))
        
#new = df.groupby(by=['day','act_id']).count().iloc[:,0].reset_index().filter(items=['day','act_id'])

## Create total elevation per day 

Create total elevation per day by summing the absolute difference between each consecutive point way before trimming.

In [18]:
# Create total elevation per day dataframe
result = {}
for day in days:
    temp_alt_df = df[df['day'] == day]['altitude']
    deltas = []
    for i in range(len(temp_alt_df)):
        if i>0:
            delta = temp_alt_df.iloc[i]-temp_alt_df.iloc[i-1]
            if delta>0:
                deltas.append(delta)
    result[day] = sum(deltas)
    
# Create dataframe from result dictionary
alt_ttl_df = pd.DataFrame(result.items(), columns=['day', 'elevation_gain'])
alt_ttl_df

Unnamed: 0,day,elevation_gain
0,2017-07-07,665.8
1,2017-07-06,552.5
2,2017-07-05,341.3
3,2017-08-01,1543.5
4,2017-07-29,1192.1
5,2017-07-28,1069.2
6,2017-07-25,643.8
7,2017-07-24,1289.7
8,2017-07-27,711.9
9,2017-07-26,373.2


## Create total distance in KM per day 

Create total distance in km per day by picking the last value of distance column for each day

In [19]:
temp = {}
for day in days:
    temp_dist_df = df[df['day'] == day]['distance']
    temp[day]=round(temp_dist_df.iloc[-1]/1000,1)

# Create dataframe from temp dictionary
dist_ttl_df = pd.DataFrame(temp.items(), columns=['day', 'ttl_distance'])
dist_ttl_df        

Unnamed: 0,day,ttl_distance
0,2017-07-07,127.7
1,2017-07-06,115.1
2,2017-07-05,52.0
3,2017-08-01,107.8
4,2017-07-29,106.2
5,2017-07-28,114.5
6,2017-07-25,95.7
7,2017-07-24,103.1
8,2017-07-27,95.2
9,2017-07-26,92.5


## Create a speed column km/h

In [20]:
# Create an empty dataframe
speed_df = pd.DataFrame(columns = ['time','distance','speed'])

# Calculate the speed by measuring the m/s times 3.6 for km/h
for day in days:
    temp_speed_df = df[df['day'] == day].filter(items = ['time','distance'])
    speed_list = []
    for i in range(len(temp_speed_df)):
        if i==0:
            speed_list.append(0)
        else:
            dist_delta = float(temp_speed_df['distance'].iloc[i]-temp_speed_df['distance'].iloc[i-1])
            time_delta = int(temp_speed_df['time'].iloc[i]-temp_speed_df['time'].iloc[i-1])
            if time_delta == 0: # handles division error
                speed = speed_list[i-1]
                speed_list.append(speed)
            else:
                speed = (dist_delta/time_delta) * 3.6
                speed_list.append(speed)
    # Add new speed column to temp_speed_df
    temp_speed_df['speed'] = np.asarray(speed_list)
    speed_df = pd.concat([temp_speed_df,speed_df])

Check if the lengths match each other

In [21]:
print len(speed_df)
print len(df)

151130
151130


In [22]:
# Merge speed_df with the main dataframe on indexes
df = df.join(speed_df['speed'], how='outer')

Make sure that the speed makes sense

In [23]:
df[df['speed']>100]

Unnamed: 0,act_startDate,timestamp,time,act_id,act_name,distance,altitude,velocity_smooth,moving,grade_smooth,heartrate,lat,long,day,day_no,iter_no,speed
21947,2017-07-08 17:37:04,2017-07-08 17:37:24,16039,1073979848,Evening Ride,95960.8,43.0,5.3,True,-0.1,132,56.973557,24.633693,2017-07-08,4,5,200.16
138612,2017-07-30 17:19:09,2017-07-30 17:27:53,20607,1110450382,Evening Activity,84522.8,578.2,18.9,True,-5.0,74,41.971482,27.407865,2017-07-30,26,4,114.84
138613,2017-07-30 17:19:09,2017-07-30 17:27:54,20608,1110450382,Evening Activity,84550.7,577.4,18.9,True,-3.6,74,41.971302,27.407609,2017-07-30,26,4,100.44


In [24]:
df.iloc[21945:21950]

Unnamed: 0,act_startDate,timestamp,time,act_id,act_name,distance,altitude,velocity_smooth,moving,grade_smooth,heartrate,lat,long,day,day_no,iter_no,speed
21945,2017-07-08 17:37:04,2017-07-08 17:37:20,16035,1073979848,Evening Ride,95886.6,43.1,5.5,True,-0.4,132,56.973675,24.633108,2017-07-08,4,5,16.92
21946,2017-07-08 17:37:04,2017-07-08 17:37:23,16038,1073979848,Evening Ride,95905.2,43.0,5.3,True,-0.2,132,56.973674,24.632801,2017-07-08,4,5,22.32
21947,2017-07-08 17:37:04,2017-07-08 17:37:24,16039,1073979848,Evening Ride,95960.8,43.0,5.3,True,-0.1,132,56.973557,24.633693,2017-07-08,4,5,200.16
21948,2017-07-08 17:37:04,2017-07-08 17:37:25,16040,1073979848,Evening Ride,95964.6,43.0,5.3,True,0.0,132,56.973568,24.633634,2017-07-08,4,5,13.68
21949,2017-07-08 17:37:04,2017-07-08 17:37:30,16045,1073979848,Evening Ride,95983.4,43.0,3.8,True,0.0,132,56.97362,24.63334,2017-07-08,4,5,13.536


In [25]:
# Remove those lines with extremely high speed. There must be inaccurate tracking
df = df[df['speed']<100]

## Create max and average speed per day and heartbeat

In [26]:
# Create table with avg_speed
avg_speed_df = df.groupby('day').mean().reset_index().sort_values(by='day', ascending=1).filter(items=['day','speed'])

# Change column name
avg_speed_df.rename(columns={'speed': 'avg_speed'}, inplace=True)

# Create table with avg_speed
max_speed_df = df.groupby('day').max().reset_index().sort_values(by='day', ascending=1).filter(items=['day','speed'])

# Change column name
max_speed_df.rename(columns={'speed': 'max_speed'}, inplace=True)

# Merge two dfs
ttl_speed_df = pd.merge(avg_speed_df,max_speed_df,on='day')

# Create table with avg_speed
avg_heartbeat_df = df.groupby('day').mean().reset_index().sort_values(by='day', ascending=1).filter(items=['day','heartrate'])

# Change column name
avg_heartbeat_df.rename(columns={'heartrate': 'avg_active_HR'}, inplace=True)

# Merge two dfs
ttl_speed_df = pd.merge(ttl_speed_df,avg_heartbeat_df,on='day')

ttl_speed_df

Unnamed: 0,day,avg_speed,max_speed,avg_active_HR
0,2017-07-05,17.910134,48.42,122.153682
1,2017-07-06,21.048788,33.0,128.956203
2,2017-07-07,21.055257,76.2,128.630937
3,2017-07-08,22.059354,76.86,134.259271
4,2017-07-09,22.032012,54.36,124.89118
5,2017-07-10,20.316166,43.74,118.91888
6,2017-07-11,20.163304,70.92,127.992278
7,2017-07-12,14.481316,37.62,114.137436
8,2017-07-13,16.303031,99.12,114.023519
9,2017-07-14,19.069724,46.26,109.974267


## Create cumulative distance in string format (hours:minutes:seconds)

In [27]:
# Create column with converted seconds to hours:minutes:seconds format (string)
df['time_form'] = df['time'].apply(lambda x: secToHours(x))

## Keep only selected columns

In [28]:
df = df.filter(items=['day','act_startDate','timestamp','day_no','iter_no','altitude','distance','heartrate','time','time_form','speed','long','lat'])

In [29]:
df.head()

Unnamed: 0,day,act_startDate,timestamp,day_no,iter_no,altitude,distance,heartrate,time,time_form,speed,long,lat
0,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:35:37,1,1,24.3,0.0,85,0,00:00:00,0.0,24.623962,59.442823
1,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:35:38,1,1,24.3,0.0,85,1,00:00:01,0.0,24.623959,59.442821
2,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:35:46,1,1,24.0,20.8,85,9,00:00:09,9.36,24.623628,59.442884
3,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:35:47,1,1,23.9,20.8,83,10,00:00:10,0.0,24.623551,59.442828
4,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:35:50,1,1,23.7,27.5,80,13,00:00:13,8.04,24.623511,59.44292


## Extract data to a json to check D3

In [25]:
# Exclude this filter. It is just to test NaN functionality at D3
#df = df[(df['day']=='2017-04-11') | (df['day']=='2017-05-01')]

In [31]:
def create_json(df):
    init_list = []
    for i,day in enumerate(days):
        temp = {}
        temp['day'] = day
        temp['ttl_dist'] = dist_ttl_df[dist_ttl_df['day']==day]['ttl_distance'].iloc[0]
        temp['day_no'] = i+1
        temp['path'] = df[df['day'] == day].ix[:,['lat','long']].values.tolist()
        
        #.loc[:,df.columns.isin(['lat','long'])].values.tolist()
        temp['distance'] = [i for i in df[df['day'] == day]['distance']]
        temp['elevation'] = [i for i in df[df['day'] == day]['altitude']]
        temp['elev_gain'] = alt_ttl_df[alt_ttl_df['day']==day]['elevation_gain'].iloc[0]
        temp['speed'] = [i for i in df[df['day'] == day]['speed']]
        temp['avg_speed'] = ttl_speed_df[ttl_speed_df['day']==day]['avg_speed'].iloc[0]
        temp['max_speed'] = ttl_speed_df[ttl_speed_df['day']==day]['max_speed'].iloc[0]
        temp['heartrate'] = [i for i in df[df['day'] == day]['heartrate']]
        temp['avg_active_HR'] = ttl_speed_df[ttl_speed_df['day']==day]['avg_active_HR'].iloc[0]
        temp['time_form'] = [i for i in df[df['day'] == day]['time_form']]

        init_list.append(temp)
    return init_list

In [33]:
# Transform the list of dicts into a json file
with open('../app/interactive_map/original_active.json', 'w') as outfile:  
    json.dump(create_json(df), outfile)

## Trim data points if they are too many to be handled by D3

### If the json from above is too big too be parsed by the browser, trim the df and run the json creation after the trim

In [34]:
df_trim = df.loc[list(trimmer(df['timestamp'], pd.to_timedelta(30, 's')))]

In [35]:
# Check if the trimming is correct
df_trim.head()

Unnamed: 0,day,act_startDate,timestamp,day_no,iter_no,altitude,distance,heartrate,time,time_form,speed,long,lat
0,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:35:37,1,1,24.3,0.0,85,0,00:00:00,0.0,24.623962,59.442823
14,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:36:12,1,1,22.3,105.9,109,35,00:00:35,12.66,24.622399,59.443344
27,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:36:42,1,1,21.9,210.5,120,65,00:01:05,7.68,24.621295,59.444071
34,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:37:15,1,1,20.9,326.2,117,98,00:01:38,12.42,24.619629,59.444661
39,2017-07-05,2017-07-05 13:35:37,2017-07-05 13:37:45,1,1,19.4,439.2,116,128,00:02:08,13.86,24.618034,59.445264


In [36]:
# Transform the list of dicts into a json file
with open('../app/interactive_map/trimmed_active.json', 'w') as outfile:  
    json.dump(create_json(df_trim), outfile)

## Correct data issues with day#25


In [37]:
df_trim[df_trim['day_no']==25]

Unnamed: 0,day,act_startDate,timestamp,day_no,iter_no,altitude,distance,heartrate,time,time_form,speed,long,lat
128226,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:43:01,25,2,61.9,22117.2,109,4519,01:15:19,0.000000,27.682136,42.968049
128240,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:43:31,25,2,65.1,22201.2,128,4549,01:15:49,9.360000,27.681912,42.967371
128248,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:44:05,25,2,69.2,22302.3,132,4583,01:16:23,10.851429,27.681585,42.966496
128254,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:44:41,25,2,73.4,22418.5,132,4619,01:16:59,11.622857,27.681130,42.965508
128259,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:45:13,25,2,76.8,22521.7,133,4651,01:17:31,11.931429,27.680817,42.964610
128264,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:45:45,25,2,80.0,22629.7,133,4683,01:18:03,11.777143,27.680559,42.963663
128270,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:46:19,25,2,82.6,22752.1,129,4717,01:18:37,14.040000,27.680646,42.962564
128276,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:46:52,25,2,82.8,22879.0,127,4750,01:19:10,13.140000,27.680557,42.961425
128285,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:47:24,25,2,80.0,23033.8,117,4782,01:19:42,23.160000,27.680513,42.960037
128296,2017-07-29,2017-07-29 06:43:01,2017-07-29 06:47:55,25,2,70.4,23282.4,110,4813,01:20:13,34.200000,27.680497,42.957802


# Corrections

There has been an issue after transforming commulative distances. Day#25 (29/07/2017) had a duplicate activity and the graphs mismatch. The duplicates have been manually excluded from the trimmed json file.

# Discussion

The idea from now on is to have two different dataframe, one for the totals and one for the activities.    
Concerning the **totals**:

* Merge ttl_speed_df and alt_ttl_df on key day in order to have the total elevation gain, speed, max speed and active HR per day
* Rename each column of the new dataframe from above to blabla_angelos
* Then merge Andreas and Angelos dataframes to a single CSV with all data included

Concerning the **activities**:

* Make sure what are the columns we actually need for the D3 and minimize it(with the trimmer above) if needed in order to have the lowest possible size for javascript to not lag.
* Identify the columns that will be andreas and angelos different (probably active HR and speed)