In [None]:
%run config

In [None]:
from pyspark.sql.functions import * 
import pandas as pd
import utils
from ratelimiter import RateLimiter

### Logic to orchestrate querying of Strava API

1. Raw query of the API to obtain all activities currently showing
2. Query DBFS for all activities written to storage
3. Make the comparison, and return those activities not written to storage
    - 3a. If all are written to storage, continue to the next comparison
4. For those not written to storage, query the API and write to storage
    - Repeat 3 to ensure all activities are written

5.  Will now need to compare activities and their associated segments, i.e. each activity may have a double-digit number of segments

6. Looking at the query segment notebook for that comparison - checking that all activity IDs have been queried
    - 6a. If no, continue querying; if yes, continue to segment details

7. Segment details will take the longest. Ensure that all segments and their associated details have been queried.

### Step 1

#### Makes 1 API Call

In [None]:
# Step 1: fetch the current activities from the API, compare them with the
# activity IDs already in storage, and append only the ones that are missing.

# query the API for the current activities (the single API call of this step)
my_dataset = utils.activity_api_call(access_token)
# grab the activity ids, and the more detailed information about the activities
activity_id_DF, activity_df = utils.extract_activities(my_dataset)

# stored activity ids are what we compare the API result against
stored_activity_ids = spark.read.format("delta").load(activity_id_path)

# isolate the distinct activity ids from each dataframe as plain Python lists
activity_id_list_in_storage = [
    row[0] for row in stored_activity_ids.select('activity_id').distinct().collect()
]
activity_ids_from_API = [
    row[0] for row in activity_id_DF.select('activity_id').distinct().collect()
]

# find activities not written to storage
activity_ids_not_in_storage = utils.list_comparison(activity_id_list_in_storage, activity_ids_from_API)

# keep only the rows for ids that are not yet in storage
# NOTE(review): this filters on a column named `activity_ids`, while the id
# dataframe uses `activity_id` - confirm the column name produced by
# utils.extract_activities.
new_activities = activity_df.filter(activity_df.activity_ids.isin(activity_ids_not_in_storage))
new_ids = activity_id_DF.filter(activity_id_DF.activity_id.isin(activity_ids_not_in_storage))

# Step 3a: when everything is already in storage there is nothing to append,
# so skip the writes (same guard style as the segment step further down).
if len(activity_ids_not_in_storage) > 0:
    # only unseen ids are appended, so no duplicates are introduced
    utils.write_dataframe_to_storage(new_activities, historical_activity_id_path, "mergeSchema", "append")
    utils.write_dataframe_to_storage(new_ids, activity_id_path, "mergeSchema", "append")



#### Variable number of API calls; could do the math, i.e. subtract from the rate counter at the top
#### 30 in this particular call

In [None]:
# Number of activity ids that still have no segment rows in storage.
# NOTE: within this notebook `activities_without_segments` is only assigned in
# the following cell, so on a fresh kernel this cell raises NameError until
# that cell has been run.
len(activities_without_segments)

In [None]:
# Re-read the stored activity ids now that the previous step may have appended to them.
stored_activity_ids = spark.read.format("delta").load(activity_id_path)

# Segment rows already in storage; each row carries the activity id it belongs to.
segments_in_storage = spark.read.format("delta").load(segment_effort_path)

# Distinct activity ids on each side, as plain Python lists.
activities_with_segments_in_storage = [
    row[0] for row in segments_in_storage.select('activity_id').distinct().collect()
]
activity_ids_all = [
    row[0] for row in stored_activity_ids.select('activity_id').distinct().collect()
]

# Activities that have no segment information in storage yet.
activities_without_segments = utils.list_comparison(activities_with_segments_in_storage, activity_ids_all)
# NOTE(review): the original comment expects this to approach the 99-request
# limit; no cap is applied to the list in this cell - confirm that
# utils.query_segments handles the limit.

# Only call the API when there is something to query; otherwise continue on.
if len(activities_without_segments) > 0:
    # activities together with their segments
    segment_id_df = utils.query_segments(activities_without_segments, access_token)

    # Activities that returned no segment rows still need to be added, so work
    # out which of the requested ids are absent from the response.
    returned_activity_ids = [
        row[0] for row in segment_id_df.select("activity_id").distinct().collect()
    ]
    activities_no_segments = utils.list_comparison(returned_activity_ids, activities_without_segments)
    all_activities_with_segments = utils.append_activities_without_segments(segment_id_df, activities_no_segments)

    # write all queried segments to storage
    utils.write_dataframe_to_storage(all_activities_with_segments, segment_effort_path, "mergeSchema", "append")

## Section for querying segment information

In [None]:
# Segment rows (with their activities) already in storage. The format is named
# explicitly, matching every other read of segment_effort_path in this
# notebook, instead of relying on the session's default data source.
all_segments = spark.read.format("delta").load(segment_effort_path)
# isolate the distinct segment ids
# NOTE(review): activities without segments are appended to this table in the
# previous step; if those rows carry a null segment_id it will show up here and
# be sent to the API - confirm nulls are handled downstream.
all_segment_ids = [row[0] for row in all_segments.select("segment_id").distinct().collect()]

# Segment details already in storage. This path is written with the same
# utils.write_dataframe_to_storage call as the delta tables read above.
segment_details = spark.read.format("delta").load(segment_details_path)
# extract the segment id values that already have details
segment_details_ids = [
    row[0] for row in segment_details.select("returned_segment").distinct().collect()
]

# segment ids that still need their details queried
segment_details_to_query = utils.list_comparison(segment_details_ids, all_segment_ids)

# NOTE(review): only max_calls is given, so the limiter's period is the library
# default - confirm that this matches the API quota window.
rate_limiter = RateLimiter(max_calls=20)
# The result is kept in test_df; nothing is written to storage in this cell.
test_df = utils.query_segment_details_with_limits(segment_details_to_query, access_token, rate_limiter)



#### Scratch Work

In [None]:
# Scratch: run the rate-limited query against a 15-id slice of the pending segments.
subset = segment_details_to_query[30:45]
rate_limiter = RateLimiter(max_calls=25)
# Call the helper through `utils` and hand it the limiter created above, the
# same way the main segment-details cell calls it.
test_df_SHOULD_BE5 = utils.query_segment_details_with_limits(subset, access_token, rate_limiter)

In [None]:
# Row count of the scratch query result, to compare against the expected number.
test_df_SHOULD_BE5.count()

In [None]:
# Print each id in the current scratch subset on its own line.
for segment_id in subset:
    print(segment_id)

In [None]:

# Scratch: recompute the segment ids that still need details
# (inputs come from the segment-details cell above).
segment_details_to_query = utils.list_comparison(segment_details_ids, all_segment_ids)
# Only the first 25 pending ids are queried in this run.
subset = segment_details_to_query[:25]

# Query the details for that slice; no rate limiter is passed to this helper.
recent_segment_details = utils.query_segment_details( subset, access_token)

# Append these newly queried segment details to storage.
utils.write_dataframe_to_storage(recent_segment_details, segment_details_path, "mergeSchema", "append")

In [None]:
# Print every segment id that is still waiting for its details to be queried.
for segment_id in segment_details_to_query:
    print(segment_id)

In [None]:
# Scratch: show the 10 stored segment rows with the highest activity_id.
segments_in_storage = spark.read.format("delta").load(segment_effort_path)
# The original line did not parse; sort descending, keep 10 rows, and render
# them with the notebook's display().
display(segments_in_storage.orderBy('activity_id', ascending=False).limit(10))

In [None]:
# Distinct activity ids that already have segment rows in storage, as a Python list.
# Relies on `segments_in_storage` having been loaded by an earlier cell.
activity_ids_with_queried_segments = segments_in_storage.select('activity_id').distinct().rdd.flatMap(lambda x: x).collect()


In [None]:
# Scratch: find the activity ids whose segments have not been queried yet and
# query the first batch of them.
segments_in_storage = spark.read.format("delta").load(segment_effort_path)
activity_ids_with_queried_segments = segments_in_storage.select('activity_id').distinct().rdd.flatMap(lambda x: x).collect()

# Membership is tested against a set so the filter stays linear in the number
# of ids; order and duplicates of full_activity_ids are preserved.
# NOTE(review): full_activity_ids is not assigned in any cell shown in this
# notebook - confirm where it is defined before running.
_queried_activity_id_set = set(activity_ids_with_queried_segments)
activity_ids_without_queried_segments = [x for x in full_activity_ids if x not in _queried_activity_id_set]

# grab the first 99 ids so as not to overload the API
eligible_activities = activity_ids_without_queried_segments[:99]

segment_id_df = utils.query_segments(eligible_activities, access_token)

In [None]:
# Hard-coded sample of activity ids used by the scratch comparison cells below.
activity_id_list = [
    9663381569,
    9656452945,
    9635250821,
    9578982519,
    9559341308,
    9515669005,
    9408871895,
    9298361043,
    9248492217,
    9235757648,
]

# The first three sample ids, standing in for "already written to storage".
activity_id_subset = [
    9663381569,
    9656452945,
    9635250821,
]

In [None]:
# Sample ids from activity_id_list that are absent from activity_id_subset, in list order.
activity_ids_not_written_to_storage = [
    activity_id
    for activity_id in activity_id_list
    if activity_id not in activity_id_subset
]

In [None]:
# Report how many activity ids still have to be queried, or that none are left.
if len(activity_ids_not_written_to_storage) != 0:
    print( f"Need to query {len(activity_ids_not_written_to_storage)}, activities")
else:
    print ("All activities in storage")

In [None]:
# Scratch outline of the whole orchestration. Steps that were only sketched as
# pseudocode are kept as comments/TODOs so that the cell parses.

# make api call to strava API
# from query_activities notebook
my_dataset = utils.activity_api_call(access_token)

# extract the activities
activity_id_DF, activity_df = utils.extract_activities(my_dataset)

# read in historical activities
stored_activity_ids = spark.read.format("delta").load(activity_id_path)
activity_id_list = stored_activity_ids.select('activity_id').distinct().rdd.flatMap(lambda x: x).collect()


# make comparison between stored and queried
# need to convert activity_id_DF to a list of IDs
# activity_ids_not_written_to_storage = [x for x in activity_id_list if x not in historical_activity_id_list ]

# NOTE(review): this compares the hard-coded activity_id_subset from an earlier
# scratch cell with storage, not the ids held in activity_id_DF.
activity_ids_not_written_to_storage = [x for x in activity_id_subset if x not in activity_id_list]


# if everything is already written to storage, continue on;
# otherwise take the missing ids, query them and write them to storage
if len(activity_ids_not_written_to_storage) > 0:
    # TODO (pseudocode in the original):
    #   new_activities = extract_activities(activity_ids_not_written_to_storage)
    #   write_to_storage(new_activities)
    pass

# done:

# Run check again ie if len(activity_ids_not_written_to_storage) == 0:
# Then compare activities with segments to those written in storage

# now looking at 'Query_Segment' notebook
# will need to compare the activity ids that have already been queried for their segments
segments_in_storage = spark.read.format("delta").load(segment_effort_path)
activity_ids_with_queried_segments = segments_in_storage.select('activity_id').distinct().rdd.flatMap(lambda x: x).collect()


# NOTE(review): full_activity_ids is not assigned in any cell shown in this
# notebook - confirm where it is defined before running.
activity_ids_without_queried_segments = [x for x in full_activity_ids if x not in activity_ids_with_queried_segments]

# will also need to incorporate activities without segments into the results

# querying historical segments
# will need to make sure that
segments_in_storage = spark.read.format("delta").load(segment_effort_path)

# make that comparison

# will now need to get the segment details, from 'segment_exploration' notebook
# NOTE(review): segment_list is not assigned in any cell shown in this notebook.
# The helper is called through utils with the access token, as in the earlier
# scratch cell that queries segment details.
segment_df = utils.query_segment_details(segment_list, access_token)

#again making comparison






