In [139]:
%run config

Code from file 'file:///c%3A/Users/nicholas.radich/Documents/Strava_Lakehouse/config.py':
 client_id = dbutils.secrets.get(scope = "key_vault_secrets", key = "clientid") 
client_secret = dbutils.secrets.get(scope = "key_vault_secrets", key = "clientsecret") 
new_refresh_token = dbutils.secrets.get(scope = "key_vault_secrets", key = "newrefreshtoken")
activity_id_path = dbutils.secrets.get(scope = "key_vault_secrets", key = "activityidpath") 
historical_activity_id_path = dbutils.secrets.get(scope = "key_vault_secrets", key = "historicalactivitydfpath") 
segment_effort_path = dbutils.secrets.get(scope = "key_vault_secrets", key = "segmenteffortpath") 
segment_details_path = dbutils.secrets.get(scope = "key_vault_secrets", key = "segmentdetailspath") 



import requests
import urllib3

auth_url = "https://www.strava.com/oauth/token"
activites_url = "https://www.strava.com/api/v3/athlete/activities"


payload = {
    'client_id':  client_id,
    'client_secret': client_



In [140]:
from pyspark.sql.functions import * 
import pandas as pd
import utils



### Logic to orchestrate querying of Strava API

1. Raw query of the API to obtain all activities currently showing
2. Query DBFS for all activities written to storage
3. Compare the two, and return those activities not written to storage
    - 3a. If all are written to storage, continue to the next comparison
4. For those not written to storage, query the API and write them to storage
    - Repeat step 3 to ensure all activities are written

5. Will now need to compare activities and their associated segments, i.e. each activity may have a double-digit number of segments

6. Looking at the query segment notebook for that comparison - checking that all activity IDs have been queried
    - 6a. If no, continue the queries; if yes, continue to segment details

7. Segment details will take the longest. Ensure that all segments and their associated details have been queried.

### Step 1

In [142]:
# Query the Strava API for the activities visible to this access token.
my_dataset = utils.activity_api_call(access_token)

# Split the response into an id-only DataFrame and a DataFrame with the
# remaining activity information.
activity_id_DF, activity_df = utils.extract_activities(my_dataset)

# Activity ids already persisted in Delta storage; the ids returned by the API
# are compared against these.
stored_activity_ids = spark.read.format("delta").load(activity_id_path)

# Pull the distinct activity ids out of each DataFrame as plain Python lists.
activity_id_list_in_storage = [
    row[0]
    for row in stored_activity_ids.select('activity_id').distinct().collect()
]
activity_ids_from_API = [
    row[0]
    for row in activity_id_DF.select('activity_id').distinct().collect()
]

# Work out which activities have not been written to storage yet.
activity_ids_not_in_storage = utils.list_comparison(
    activity_id_list_in_storage,
    activity_ids_from_API,
)

# Keep only the rows of activity_df whose id is missing from storage.
is_missing_from_storage = activity_df["activity_ids"].isin(activity_ids_not_in_storage)
new_activities = activity_df.filter(is_missing_from_storage)





In [146]:
# Number of distinct activity ids read back from the Delta table at activity_id_path.
len(activity_id_list_in_storage)

10

In [143]:
# Sanity check: the number of ids flagged as missing from storage is expected
# to equal the number of rows selected into new_activities.
missing_id_count = len(activity_ids_not_in_storage)
new_activity_count = new_activities.count()  # triggers a Spark job; computed once

print(missing_id_count)
print(new_activity_count)

# Fail loudly instead of relying on a visual comparison of the two numbers.
assert missing_id_count == new_activity_count, (
    f"{missing_id_count} activity ids are missing from storage but "
    f"new_activities holds {new_activity_count} rows"
)

137
137

In [144]:
# Show the rows of activity_df whose ids were not found in storage.
new_activities.display()

activity_ids,start_date,activity_name,distance,moving_time,elapsed_time,sport_type,total_elevation_gain,ingest_file_name,ingested_at
9235757648,2023-06-10T00:00:26Z,Friday ride,15217.7,3046,6362,Ride,110.8,activity_information,2023-08-25T17:58:43.579+0000
9217399958,2023-06-07T00:56:35Z,evening trek,4219.1,2978,3544,Hike,208.8,activity_information,2023-08-25T17:58:43.579+0000
9197541512,2023-06-03T16:45:32Z,San Miguel mountain,7085.7,5257,5727,Hike,328.7,activity_information,2023-08-25T17:58:43.579+0000
9153279740,2023-05-27T21:44:15Z,nine,2408.5,1796,8829,Golf,20.7,activity_information,2023-08-25T17:58:43.579+0000
9122668027,2023-05-23T00:45:26Z,out and about,5457.9,1891,1913,Run,49.0,activity_information,2023-08-25T17:58:43.579+0000
9097857822,2023-05-19T01:21:31Z,Thursday rumble,20672.3,3683,3920,Ride,161.3,activity_information,2023-08-25T17:58:43.579+0000
9018676993,2023-05-06T03:12:28Z,full moon ride,16797.0,3642,8784,Ride,147.8,activity_information,2023-08-25T17:58:43.579+0000
9012852102,2023-05-05T01:55:09Z,evening jog,3823.4,1429,1450,Run,31.1,activity_information,2023-08-25T17:58:43.579+0000
8987499152,2023-04-30T21:38:39Z,Torrey pines south,7945.5,5441,16829,Golf,67.8,activity_information,2023-08-25T17:58:43.579+0000
8969740420,2023-04-28T00:30:53Z,rolling,19708.2,3817,4027,Ride,171.3,activity_information,2023-08-25T17:58:43.579+0000


In [None]:
# Find the activities whose segments have not been queried yet and query the
# next batch of them.

# Activity ids that already have segment efforts written to storage.
segments_in_storage = spark.read.format("delta").load(segment_effort_path)
activity_ids_with_queried_segments = segments_in_storage.select('activity_id').distinct().rdd.flatMap(lambda x: x).collect()

# A set makes each membership test O(1) instead of scanning the list per id.
queried_segment_lookup = set(activity_ids_with_queried_segments)

# NOTE(review): the original iterated over `full_activity_ids`, which is not
# defined anywhere in this notebook. The distinct ids returned by the API in
# Step 1 (activity_ids_from_API) are used instead - confirm this is the intended list.
activity_ids_without_queried_segments = [
    x for x in activity_ids_from_API if x not in queried_segment_lookup
]

# Only take the first 99 ids per run so as not to overload the API.
MAX_ACTIVITIES_PER_RUN = 99
eligible_activities = activity_ids_without_queried_segments[:MAX_ACTIVITIES_PER_RUN]

segment_id_df = utils.query_segments(eligible_activities, access_token)

In [None]:
# Hand-written sample of activity ids for trying out the comparison logic.
activity_id_list = [
    9663381569,
    9656452945,
    9635250821,
    9578982519,
    9559341308,
    9515669005,
    9408871895,
    9298361043,
    9248492217,
    9235757648,
]

# The first three sample ids, kept as a separate list.
activity_id_subset = activity_id_list[:3]

In [None]:
# Ids that appear in activity_id_list but not in activity_id_subset,
# in their original order.
activity_ids_not_written_to_storage = [
    activity_id
    for activity_id in activity_id_list
    if activity_id not in activity_id_subset
]

In [None]:
# Report whether any activity ids are still waiting to be written to storage.
if activity_ids_not_written_to_storage:
    print(f"Need to query {len(activity_ids_not_written_to_storage)}, activities")
else:
    print("All activities in storage")

In [None]:
# Draft of the end-to-end orchestration described in the planning notes at the
# top of this notebook. Steps that are not implemented yet are kept as TODO
# comments so that the cell is valid Python and can be run.

# make api call to strava API
# from query_activities notebook
my_dataset = utils.activity_api_call(access_token)

# extract the activities (called through utils, as in Step 1 - only `import utils` is in scope)
activity_id_DF, activity_df = utils.extract_activities(my_dataset)

# read in historical activities
stored_activity_ids = spark.read.format("delta").load(activity_id_path)
activity_id_list = stored_activity_ids.select('activity_id').distinct().rdd.flatMap(lambda x: x).collect()

# convert activity_id_DF to a list of ids so it can be compared with storage
activity_ids_from_API = activity_id_DF.select('activity_id').distinct().rdd.flatMap(lambda x: x).collect()

# make comparison between stored and queried ids
# NOTE(review): the draft compared the hand-written `activity_id_subset` sample
# against storage; the ids returned by the API are used here instead, as the
# draft's own comment asked for - confirm.
stored_id_lookup = set(activity_id_list)  # O(1) membership tests
activity_ids_not_written_to_storage = [
    x for x in activity_ids_from_API if x not in stored_id_lookup
]

# if we do not have all activities written to storage, take those and query
if activity_ids_not_written_to_storage:
    # same filter as Step 1: keep only the rows whose id is missing from storage
    new_activities = activity_df.filter(
        activity_df.activity_ids.isin(activity_ids_not_written_to_storage)
    )
    # TODO: write new_activities to storage (the draft called a
    # `write_to_storage(new_activities)` helper that is not defined in this
    # notebook), then run the check again until
    # len(activity_ids_not_written_to_storage) == 0

# Then compare activities with segments to those written in storage

# now looking at 'Query_Segment' notebook
# will need to compare the activity ids that have already been queried for their segments
segments_in_storage = spark.read.format("delta").load(segment_effort_path)
activity_ids_with_queried_segments = segments_in_storage.select('activity_id').distinct().rdd.flatMap(lambda x: x).collect()

# NOTE(review): the draft iterated over `full_activity_ids`, which is not
# defined anywhere in this notebook; the API id list is used instead - confirm.
queried_segment_lookup = set(activity_ids_with_queried_segments)
activity_ids_without_queried_segments = [
    x for x in activity_ids_from_API if x not in queried_segment_lookup
]

# will also need to incorporate activities without segments into the results

# querying historical segments
# TODO: make that comparison (segments_in_storage is already loaded above,
# so the draft's second identical read was dropped)

# will now need to get the segment details, from 'segment_exploration' notebook
# TODO: segment_df = query_segment_details(segment_list)
#       neither `query_segment_details` nor `segment_list` is defined in this notebook yet

# again making comparison

#again making comparison






