In [4]:
%run config

Code from file 'file:///c%3A/Users/nicholas.radich/Documents/Strava_Lakehouse/config.py':
 client_id = dbutils.secrets.get(scope = "key_vault_secrets", key = "clientid") 
client_secret = dbutils.secrets.get(scope = "key_vault_secrets", key = "clientsecret") 
new_refresh_token = dbutils.secrets.get(scope = "key_vault_secrets", key = "newrefreshtoken")
activity_id_path = dbutils.secrets.get(scope = "key_vault_secrets", key = "activityidpath") 
historical_activity_id_path = dbutils.secrets.get(scope = "key_vault_secrets", key = "historicalactivitydfpath") 
segment_effort_path = dbutils.secrets.get(scope = "key_vault_secrets", key = "segmenteffortpath") 
segment_details_path = dbutils.secrets.get(scope = "key_vault_secrets", key = "segmentdetailspath") 



import requests
import urllib3

auth_url = "https://www.strava.com/oauth/token"
activites_url = "https://www.strava.com/api/v3/athlete/activities"


payload = {
    'client_id':  client_id,
    'client_secret': client_



In [5]:
from pyspark.sql.functions import * 
import pandas as pd
import utils




### Logic to orchestrate querying of Strava API

1. Raw query of the API to obtain all activities currently showing
2. Query DBFS for all activities written to storage
3. Make the comparison, and return those activities not written to storage
    - 3a. If all are written to storage, continue to the next comparison
4. For those not written to storage, query the API and write them to storage
    - Repeat step 3 to ensure all activities are written

5. Will now need to compare activities and their associated segments, i.e. each activity may have a double-digit number of segments

6. Look at the query-segment notebook for that comparison - check that all activity IDs have been queried
    - 6a. If no, continue the queries; if yes, continue to segment details

7. Segment details will take the longest. Ensure that all segments and their associated details have been queried.

### Step 1

#### Makes 1 API Call

In [6]:
# Request budget this notebook starts from; a later cell subtracts the number
# of activities that were missing segment data and uses the remainder to cap
# the segment-detail queries.
# NOTE(review): 99 presumably reflects the Strava rate limit minus the single
# activity call made in Step 1 -- confirm against the current API limits.
api_starting_limit = 99



In [8]:
# Step 1: pull the activities the API currently returns, compare them with the
# ids already written to storage, and append whatever is missing.
# NOTE(review): access_token is not assigned in any visible cell; presumably
# `%run config` defines it -- confirm before running on a fresh kernel.
my_dataset = utils.activity_api_call(access_token)

# extract_activities hands back two Spark DataFrames: the activity ids and the
# wider activity details.
activity_id_DF, activity_df = utils.extract_activities(my_dataset)

# Ids already persisted; this is the baseline the API result is compared with.
stored_activity_ids = spark.read.format("delta").load(activity_id_path)

# Collect the distinct ids through the DataFrame API. The previous
# `.rdd.flatMap(...)` round trip produced the same list but depends on the RDD
# API, which is not available on every cluster type.
activity_id_list_in_storage = [
    row[0] for row in stored_activity_ids.select("activity_id").distinct().collect()
]
activity_ids_from_API = [
    row[0] for row in activity_id_DF.select("activity_id").distinct().collect()
]

# Activities returned by the API that are not written to storage yet.
# presumably list_comparison returns the items of the second list that are
# absent from the first -- verify against utils.
activity_ids_not_in_storage = utils.list_comparison(
    activity_id_list_in_storage, activity_ids_from_API
)

# Only write when there is something new; otherwise fall through to the next step.
if len(activity_ids_not_in_storage) > 0:

    # Keep only the rows for ids that are not in storage yet.
    # NOTE(review): this filter reads column `activity_ids` (plural) while every
    # other reference in this notebook uses `activity_id` -- confirm the column
    # name produced by utils.extract_activities.
    new_activities = activity_df.filter(
        activity_df.activity_ids.isin(activity_ids_not_in_storage)
    )
    new_ids = activity_id_DF.filter(
        activity_id_DF.activity_id.isin(activity_ids_not_in_storage)
    )

    # Append the new rows; duplicates are avoided by the filter above, not by
    # the write itself.
    utils.write_dataframe_to_storage(
        new_activities, historical_activity_id_path, "mergeSchema", "append"
    )
    utils.write_dataframe_to_storage(new_ids, activity_id_path, "mergeSchema", "append")





In [14]:
# Find the stored activities that have no segment rows yet, query their
# segments from the API, and append the result to segment_effort_path.

# Activity ids in storage after the comparison step above.
stored_activity_ids = spark.read.format("delta").load(activity_id_path)

# Segment rows already in storage; each row carries the activity_id it belongs to.
segments_in_storage = spark.read.format("delta").load(segment_effort_path)

# Collect the distinct activity ids from both tables through the DataFrame API
# (same values as the previous `.rdd.flatMap(...)` calls, without needing the
# RDD API).
activities_with_segments_in_storage = [
    row[0] for row in segments_in_storage.select("activity_id").distinct().collect()
]
activity_ids_all = [
    row[0] for row in stored_activity_ids.select("activity_id").distinct().collect()
]

# Activities without segment information.
# presumably list_comparison returns the items of the second list that are
# absent from the first -- verify against utils.
activities_without_segments = utils.list_comparison(
    activities_with_segments_in_storage, activity_ids_all
)
# Querying segment info for these will probably hit the 99-request limit.

if len(activities_without_segments) > 0:
    # Only query when there is something to query; otherwise continue on.

    # Activities together with their segments, as returned by the API.
    segment_id_df = utils.query_segments(activities_without_segments, access_token)

    # Activities that came back with no segments still need to be added, so
    # work out which of the queried ids are missing from the response.
    returned_activity_ids = [
        row[0] for row in segment_id_df.select("activity_id").distinct().collect()
    ]
    activities_no_segments = utils.list_comparison(
        returned_activity_ids, activities_without_segments
    )
    all_activities_with_segments = utils.append_activities_without_segments(
        segment_id_df, activities_no_segments
    )

    # Write all queried segments to storage.
    utils.write_dataframe_to_storage(
        all_activities_with_segments, segment_effort_path, "mergeSchema", "append"
    )




In [16]:
# Request budget left for the segment-detail queries: the starting budget minus
# one unit per activity that was missing segment data.
# NOTE(review): assumes utils.query_segments spends one request per activity
# id -- confirm. The result can be negative; the segment-detail cell only
# proceeds when it is > 0.
post_activity_api_limit = api_starting_limit - len(activities_without_segments)



## Section for querying segment information

In [21]:
# Drop the None placeholders from the list of segments that still need details.
# NOTE(review): segment_details_to_query is only assigned further down in the
# visible notebook, so this cell raises NameError on a fresh
# Restart-and-Run-All unless it is moved below that cell.
segment_details_to_query_filtered = [
    segment
    for segment in segment_details_to_query
    if segment is not None
]



In [25]:
# Preview the segment-effort table. The format is spelled out as Delta so this
# read matches the other read of segment_effort_path in this notebook instead
# of depending on the session's default source, and the preview is bounded to
# a handful of rows rather than rendering the whole table.
all_segments = spark.read.format("delta").load(segment_effort_path)
all_segments.limit(10).display()

Activity_Segment_JointID,segment_id,activity_id,ingest_file_name,ingested_at
31267101453397970929656452945,3.126710145339797e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000
31267101453402988529656452945,3.1267101453402993e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000
31267101453341384689656452945,3.1267101453341384e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000
31267101453406920689656452945,3.126710145340692e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000
31267101453372934129656452945,3.1267101453372933e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000
31267101453366185969656452945,3.1267101453366185e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000
31267101453369288689656452945,3.126710145336929e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000
31267101453371592689656452945,3.126710145337159e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000
31267101453417805809656452945,3.1267101453417805e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000
31267101453385119729656452945,3.126710145338512e+18,9656452945,segment_efforts_ids,2023-08-29T22:43:01.062+0000


In [26]:
# Work out which segments still lack details, query as many as the remaining
# request budget allows, and append the results to segment_details_path.

# Segments linked to activities (read explicitly as Delta, matching the other
# read of segment_effort_path in this notebook).
all_segments = spark.read.format("delta").load(segment_effort_path)
# Distinct segment ids, collected through the DataFrame API instead of an RDD
# round trip.
# NOTE(review): the saved display output of this table shows segment_id in
# scientific notation (e.g. 3.1267101453402993e+18), which suggests a
# floating-point column; ids of that size may not survive as exact values --
# verify what is handed to the API.
all_segment_ids = [
    row[0] for row in all_segments.select("segment_id").distinct().collect()
]

# Segment details already in storage. The format is left to the session
# default because no other visible cell shows how this path is stored.
segment_details = spark.read.load(segment_details_path)
segment_details_ids = [
    row[0] for row in segment_details.select("returned_segment").distinct().collect()
]

# Segments whose details have not been queried yet.
# presumably list_comparison returns the items of the second list that are
# absent from the first -- verify against utils.
segment_details_to_query = utils.list_comparison(segment_details_ids, all_segment_ids)

# Remove None placeholders BEFORE applying the budget. Slicing first let the
# placeholders occupy budget slots, so fewer real segments were queried than
# the remaining budget allowed.
segment_details_to_query_filtered = [
    segment for segment in segment_details_to_query if segment is not None
]

# Query at most as many segments as the remaining API budget permits.
if len(segment_details_to_query_filtered) > 0 and post_activity_api_limit > 0:
    api_limit_segments_filtered = segment_details_to_query_filtered[:post_activity_api_limit]
    # Kept under its original name as well in case another cell reads it.
    api_limit_segments = api_limit_segments_filtered
    returned_segment_details = utils.query_segment_details(
        api_limit_segments_filtered, access_token
    )

    # Write to storage.
    utils.write_dataframe_to_storage(
        returned_segment_details, segment_details_path, "mergeSchema", "append"
    )




#### Scratch Work

In [None]:
# Small set of activity ids used to overwrite the data in storage while testing.
# Two of these have no segments, so they need to be appended in afterwards.
activity_id_subset = [
    9663381569,
    9656452945,
    9635250821,
]

In [None]:
# Show the segment details returned by the most recent query.
# NOTE(review): returned_segment_details is only assigned inside the `if`
# branch of the segment-details cell, so this fails with NameError when that
# branch did not run.
returned_segment_details.display()

In [None]:
# Scratch data: a sample of activity ids, and the leading three of them that
# are treated as the "already handled" subset.
activity_id_list = [
    9663381569,
    9656452945,
    9635250821,
    9578982519,
    9559341308,
    9515669005,
    9408871895,
    9298361043,
    9248492217,
    9235757648,
]

activity_id_subset = [
    9663381569,
    9656452945,
    9635250821,
]

In [None]:
# Ids from the full scratch list that are not part of the subset, kept in
# their original order.
activity_ids_not_written_to_storage = [
    activity_id
    for activity_id in activity_id_list
    if activity_id not in activity_id_subset
]