### Import and config

In [1]:
# Imports
import os
import logging
from datetime import timezone, timedelta

import polyline

from dotenv import load_dotenv

import pandas as pd
import numpy as np

from sqlalchemy import create_engine, text, Integer, Float, String, Boolean, DateTime, Interval, Text, BigInteger, Date

# Geopy - Nominatim
import json, time
from tqdm import tqdm
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Configuration
load_dotenv()

# DB
DB_URI = os.getenv('DB_URI')

# Bronze tables
TARGET_B_SCHEMA = os.getenv('TARGET_B_SCHEMA')
ACTIVITIES_B_TABLE = os.getenv('ACTIVITIES_B_TABLE')
DETAILS_B_TABLE = os.getenv('DETAILS_B_TABLE')
KUDOS_B_TABLE = os.getenv('KUDOS_B_TABLE')
ZONES_B_TABLE = os.getenv('ZONES_B_TABLE')
SEGMENTS_DETAILS_B_TABLE = os.getenv('SEGMENTS_DETAILS_B_TABLE')

# Silver tables
TARGET_S_SCHEMA = os.getenv('TARGET_S_SCHEMA')
ACTIVITIES_S_TABLE = os.getenv('ACTIVITIES_S_TABLE')
BEST_EFFORTS_S_TABLE = os.getenv('BEST_EFFORTS_S_TABLE')
GEAR_S_TABLE = os.getenv('GEAR_S_TABLE')
LAPS_S_TABLE = os.getenv('LAPS_S_TABLE')
MAPS_S_TABLE = os.getenv('MAPS_S_TABLE')
SEGMENTS_S_TABLE = os.getenv('SEGMENTS_S_TABLE')
SEGMENTS_EFFORTS_S_TABLE = os.getenv('SEGMENTS_EFFORTS_S_TABLE')
LOCATIONS_S_TABLE = os.getenv('LOCATIONS_S_TABLE')
KUDOS_S_TABLE = os.getenv('KUDOS_S_TABLE')
ZONES_S_TABLE = os.getenv('ZONES_S_TABLE')
RELATIVE_EFFORT_S_TABLE = os.getenv('RELATIVE_EFFORT_S_TABLE')

# Other
LOG_LEVEL = os.getenv('LOG_LEVEL')
NO_GEAR_ID = 'x00000000'

# Geopy - Nominatim
PRECISION = int(os.getenv('PRECISION'))
CACHE_PATH = os.getenv('CACHE_PATH')
USER_AGENT = os.getenv('USER_AGENT')

logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
    format="%(asctime)s | %(levelname)s | %(message)s"
)

pd.set_option('display.max_columns', None)

### DB names validation

In [2]:
REQUIRED_DB_ENV = ['DB_URI', 'TARGET_B_SCHEMA', 'ACTIVITIES_B_TABLE', 'DETAILS_B_TABLE','KUDOS_B_TABLE', 'ZONES_B_TABLE', 'SEGMENTS_DETAILS_B_TABLE', 'TARGET_S_SCHEMA', 'ACTIVITIES_S_TABLE', 'BEST_EFFORTS_S_TABLE', 'GEAR_S_TABLE', 'LAPS_S_TABLE', 'MAPS_S_TABLE', 'SEGMENTS_S_TABLE', 'SEGMENTS_EFFORTS_S_TABLE', 'ZONES_S_TABLE', 'RELATIVE_EFFORT_S_TABLE']
missing_db_env = [env for env in REQUIRED_DB_ENV if not os.getenv(env)]
if missing_db_env:
  raise RuntimeError(f"Missing env variables: {', '.join(missing_db_env)}.")

### Request data from `bronze.activities_details` and `bronze.kudos`

In [3]:
engine = create_engine(
  DB_URI, 
  pool_pre_ping=True, 
  pool_size=5, 
  max_overflow=10
)
logging.info("Connection established")

2025-12-09 14:54:47,707 | INFO | Connection established


In [4]:
with engine.begin() as conn:
  activities_details_df = pd.read_sql(text(f"SELECT * FROM {TARGET_B_SCHEMA}.{DETAILS_B_TABLE}"), conn)
  kudos_df = pd.read_sql(text(f"SELECT * FROM {TARGET_B_SCHEMA}.{KUDOS_B_TABLE}"), conn)
  activities_zones_df = pd.read_sql(text(f"SELECT * FROM {TARGET_B_SCHEMA}.{ZONES_B_TABLE}"), conn)
  segment_detailed_df = pd.read_sql(text(f"SELECT * FROM {TARGET_B_SCHEMA}.{SEGMENTS_DETAILS_B_TABLE}"), conn)
logging.info(f"Data from {TARGET_B_SCHEMA}.{DETAILS_B_TABLE}, {TARGET_B_SCHEMA}.{KUDOS_B_TABLE}, {TARGET_B_SCHEMA}.{ZONES_B_TABLE} and {TARGET_B_SCHEMA}.{SEGMENTS_DETAILS_B_TABLE} downloaded.")

2025-12-09 14:54:48,518 | INFO | Data from bronze.activities_details, bronze.kudos, bronze.activities_zones and bronze.segments_details downloaded.


In [5]:
activities_details_df.head()

Unnamed: 0,resource_state,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id,start_date,start_date_local,timezone,utc_offset,location_city,location_state,location_country,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,private,visibility,flagged,gear_id,start_latlng,end_latlng,average_speed,max_speed,average_cadence,average_watts,max_watts,weighted_average_watts,device_watts,kilojoules,has_heartrate,average_heartrate,max_heartrate,heartrate_opt_out,display_hide_heartrate_option,elev_high,elev_low,upload_id,upload_id_str,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,suffer_score,description,calories,perceived_exertion,prefer_perceived_exertion,segment_efforts,splits_metric,splits_standard,laps,best_efforts,stats_visibility,hide_from_home,device_name,embed_token,available_zones,athlete_id,athlete_resource_state,map_id,map_polyline,map_resource_state,map_summary_polyline,gear_primary,gear_name,gear_nickname,gear_resource_state,gear_retired,gear_distance,gear_converted_distance,photos_primary,photos_count,similar_activities_effort_count,similar_activities_average_speed,similar_activities_min_average_speed,similar_activities_mid_average_speed,similar_activities_max_average_speed,similar_activities_pr_rank,similar_activities_frequency_milestone,similar_activities_trend_speeds,similar_activities_trend_current_activity_index,similar_activities_trend_min_speed,similar_activities_trend_mid_speed,similar_activities_trend_max_speed,similar_activities_trend_direction,similar_activities_resource_state,average_temp,private_note,photos_primary_unique_id,photos_primary_urls_600,photos_primary_urls_100,photos_primary_source,photos_primary_media_type,photos_use_primary_photo
0,3,Afternoon Weight Training,0.0,3673,3673,0.0,Workout,WeightTraining,,15865360447,2025-09-19T13:02:12Z,2025-09-19T15:02:12Z,(GMT+02:00) Africa/Blantyre,7200.0,,,,0,10,0,1,0,True,False,False,False,followers_only,False,,[],[],0.0,0.0,,,,,,,True,101.1,153.0,False,True,0.0,0.0,16944040000.0,16944036400,garmin_ping_481978870072,False,0,0,False,9.0,Reska8Ô∏è‚É£9Ô∏è‚É£,330.0,,,[],,,"[{'id': 56505060057, 'name': 'Lap 1', 'split':...",,"[{'type': 'heart_rate', 'visibility': 'everyon...",False,Garmin Forerunner 970,724f76a68cf3fd012b9362a1d999973c389851ca,[heartrate],81055898,1,a15865360447,,3,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,
1,3,Evening Ride,19347.2,4273,4375,75.0,Ride,Ride,10.0,15879687027,2025-09-20T17:03:29Z,2025-09-20T19:03:29Z,(GMT+01:00) Europe/Warsaw,7200.0,,,,0,12,0,2,0,False,False,False,False,followers_only,False,b12572672,"[51.107321, 17.124391]","[51.107749, 17.124288]",4.528,11.5,,97.3,,,False,415.9,True,91.3,134.0,False,True,127.8,114.4,16958750000.0,16958751274,garmin_ping_482396211132,False,0,0,False,8.0,Recovery patrolüöî,295.0,,,"[{'id': 3404548530909393958, 'name': 'Odcinek ...","[{'split': 1, 'distance': 1001.5, 'pace_zone':...","[{'split': 1, 'distance': 1610.2, 'pace_zone':...","[{'id': 56559813306, 'name': 'Lap 1', 'split':...",,"[{'type': 'heart_rate', 'visibility': 'everyon...",False,Garmin Edge 840,8c93ccea663b0b91cf00b346720a5f2698083970,"[heartrate, power]",81055898,1,a15879687027,w{|vHmrogBDGVWp@}@HCDBf@fAJHTADBF^d@x@\dAHDJAP...,3,oh|vHsuogBVn@XNDNAFiBjDy@hAi@f@qAxBcAvAgAdBSN[...,False,Cube Nuroad Pro,Cube Nuroad Pro,2.0,False,3788632.0,3788.6,,0,,,,,,,,,,,,,,,25.0,,,,,,,
2,3,Lunch Weight Training,0.0,6066,6066,0.0,WeightTraining,WeightTraining,,14086094444,2025-04-05T09:39:40Z,2025-04-05T11:39:40Z,(GMT+02:00) Africa/Blantyre,7200.0,,,,0,9,0,1,0,True,False,False,False,followers_only,False,,[],[],0.0,0.0,,,,,,,True,106.9,149.0,False,True,0.0,0.0,15035940000.0,15035936606,garmin_ping_426122504181,False,0,0,False,17.0,,552.0,,False,[],,,"[{'id': 49980673377, 'name': 'Lap 1', 'split':...",,"[{'type': 'heart_rate', 'visibility': 'everyon...",False,Garmin Forerunner 945,82075ce6443833456de2feeb79ad874381da038b,[heartrate],81055898,1,a14086094444,,3,,,,,,,,,,0,,,,,,,,,,,,,,,27.0,,,,,,,
3,3,Evening Walk,5035.5,3637,3829,17.0,Walk,Walk,,14080030310,2025-04-04T16:37:40Z,2025-04-04T18:37:40Z,(GMT+01:00) Europe/Warsaw,7200.0,,,,0,8,0,2,0,False,False,False,False,followers_only,False,g19800575,"[51.107619, 17.124085]","[51.108098, 17.124984]",1.385,3.4,52.2,,,,,,True,94.3,107.0,False,True,125.6,114.6,15029560000.0,15029564885,garmin_ping_425899048795,False,0,0,False,7.0,Dokrƒôcanie krok√≥wüôÇ‚Äç‚ÜîÔ∏è,344.0,,False,"[{'id': 3343296484852327920, 'name': 'BƒÖczek c...","[{'split': 1, 'distance': 1000.4, 'pace_zone':...","[{'split': 1, 'distance': 1615.1, 'pace_zone':...","[{'id': 49956432295, 'name': 'Lap 1', 'split':...",,"[{'type': 'heart_rate', 'visibility': 'everyon...",False,Garmin Forerunner 945,33a9d220712243f8e49e182755025746b35e4118,[heartrate],81055898,1,a14080030310,q}|vHopogB?BH@VJFGD]\|@H\?HBDDAJJJd@DJFANJ@HJL...,3,mh|vH{togBP`@F\G|@GTSVGAKd@IRKJ?BOZQVMDIHCJUHG...,False,Nike Invincible Run 3 White,White,2.0,True,430102.0,430.1,,0,,,,,,,,,,,,,,,25.0,,,,,,,
4,3,Morning Run,4478.9,1826,1880,0.0,Run,Run,,4961015103,2021-01-19T08:02:09Z,2021-01-19T09:02:09Z,(GMT+01:00) Europe/Warsaw,3600.0,,,,0,1,0,1,0,False,False,False,False,everyone,False,,"[51.099947, 17.119148]","[51.097558, 17.111651]",2.453,4.3,,,,,,,False,,,False,False,121.4,120.2,5291260000.0,5291259722,2021-01-19-090209.gpx,False,0,0,False,,,438.0,,,[],"[{'split': 1, 'distance': 1005.5, 'pace_zone':...","[{'split': 1, 'distance': 1611.3, 'pace_zone':...","[{'id': 16205241523, 'name': 'Lap 1', 'split':...","[{'id': 31466450585, 'name': '400m', 'athlete'...","[{'type': 'heart_rate', 'visibility': 'everyon...",False,,3b8e2f9d57d32789fb02b92cab2a2852ad93e9c9,[pace],81055898,1,a4961015103,sm{vHsqngBPFZzBJZH~@rAlGB^b@nA^xATx@Xr@H\d@hA^...,3,sm{vHsqngBPFZzBJZH~@rAlGB^b@nA^xATx@Xr@H\d@hA^...,,,,,,,,,0,1.0,2.452848,2.452848,2.452848,2.452848,,,[2.452847754654983],0.0,2.452848,2.452848,2.452848,0.0,2.0,,,,,,,,


In [6]:
kudos_df.head()

Unnamed: 0,resource_state,firstname,lastname,activity_id,kudos_id,id
0,2,Kacper,G.,15716821076,0,15716821076-0
1,2,Jan,K.,15716821076,1,15716821076-1
2,2,Jacek,S.,15716821076,2,15716821076-2
3,2,Ola,≈Å.,15716821076,3,15716821076-3
4,2,Kacper,K.,15716821076,4,15716821076-4


In [7]:
activities_zones_df.head()

Unnamed: 0,score,distribution_buckets,type,resource_state,sensor_based,points,custom_zones,activity_id,id
0,39.0,"[{'max': 133, 'min': 0, 'time': 73.0}, {'max':...",heartrate,3,True,0.0,True,15923268347,15923268347-heartrate
1,7.0,"[{'max': 2.542, 'min': 0, 'time': 0.0}, {'max'...",pace,3,True,,,15923268347,15923268347-pace
2,,"[{'max': 0, 'min': 0, 'time': 0.0}, {'max': 50...",power,3,True,,,15923268347,15923268347-power
3,17.0,"[{'max': 138, 'min': 0, 'time': 173.0}, {'max'...",heartrate,3,True,0.0,True,8254517069,8254517069-heartrate
4,4.0,"[{'max': 2.386, 'min': 0, 'time': 2470}, {'max...",pace,3,True,,,8254517069,8254517069-pace


In [8]:
segment_detailed_df.head()

Unnamed: 0,id,resource_state,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,start_latlng,end_latlng,elevation_profile,climb_category,city,state,country,private,hazardous,starred,created_at,updated_at,total_elevation_gain,effort_count,athlete_count,star_count,elevation_profiles_light_url,elevation_profiles_dark_url,map_id,map_polyline,map_resource_state,athlete_segment_stats_pr_elapsed_time,athlete_segment_stats_pr_date,athlete_segment_stats_pr_visibility,athlete_segment_stats_pr_activity_id,athlete_segment_stats_pr_activity_visibility,athlete_segment_stats_effort_count,xoms_kom,xoms_qom,xoms_overall,xoms_destination_href,xoms_destination_type,xoms_destination_name,local_legend_athlete_id,local_legend_title,local_legend_profile,local_legend_effort_description,local_legend_effort_count,local_legend_effort_counts_overall,local_legend_effort_counts_female,local_legend_destination,local_legend
0,19517444,3,BRZE≈πNO-SOPOT,Run,3550.0,0.1,6.1,8.4,1.2,"[54.409473, 18.636221]","[54.428292, 18.593357]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Gda≈Ñsk,pomorskie,Poland,False,False,False,2018-12-20T11:44:06Z,2021-05-19T08:04:48Z,10.2,14917,4655,1,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s19517444,ezakIk{vpBe@~BWnBYlAe@jCs@tCc@rBy@hE{@bGYhDOj@...,3,1222,2024-11-23,everyone,12964095995,everyone,1,11:32,14:48,11:32,strava://segments/19517444/leaderboard,overall,All-Time,152263214.0,Igor Kania,https://dgalywyr863hv.cloudfront.net/pictures/...,24 efforts in the last 90 days,24.0,24 efforts,13 efforts,strava://segments/19517444/local_legend?catego...,
1,34463750,3,Aleja Jana Paw≈Ça II / John Paul II Avenue,Run,897.0,-0.0,1.3,115.8,114.7,"[52.235861, 20.997782]","[52.228274, 21.002288]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Warsaw,Masovian Voivodeship,Poland,False,False,False,2023-05-23T10:03:44Z,2023-05-23T10:20:35Z,0.0,21810,14546,0,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s34463750,ciy}Hccd_CxBUj@PnA_@f@Gd@Ob@W`@a@J@hAi@Vk@Hs@B...,3,246,2025-11-11,everyone,16426332819,everyone,1,1:53,2:56,1:53,strava://segments/34463750/leaderboard,overall,All-Time,64841306.0,Bartek Ryczek,https://dgalywyr863hv.cloudfront.net/pictures/...,6 efforts in the last 90 days,6.0,6 efforts,4 efforts,strava://segments/34463750/local_legend?catego...,
2,17455116,3,Folwarczna ≈õluza do swojczyckiego,Run,1533.3,0.1,0.8,115.0,112.9,"[51.104281, 17.124089]","[51.114371, 17.109618]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Wroc≈Çaw,Wojew√≥dztwo dolno≈õlƒÖskie,Poland,False,False,False,2018-04-22T13:45:42Z,2021-05-15T02:20:56Z,2.1,26055,3021,3,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s17455116,wh|vHopogBGX[b@]v@[\IVuAfBeCjD}ApCs@z@oBbDi@n@...,3,498,2021-10-25,everyone,6165241355,everyone,38,5:03,6:10,5:03,strava://segments/17455116/leaderboard,overall,All-Time,105315738.0,Marcin Przepi√≥rkowski,https://lh3.googleusercontent.com/a/ACg8ocJUZd...,26 efforts in the last 90 days,26.0,26 efforts,14 efforts,strava://segments/17455116/local_legend?catego...,
3,8544270,3,Wschodni ~2.8k Lap,Run,2784.3,0.0,11.0,101.2,99.0,"[51.085002, 17.075916]","[51.084648, 17.075653]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Wroc≈Çaw,Wojew√≥dztwo dolno≈õlƒÖskie,Poland,False,False,False,2014-11-22T16:03:40Z,2021-05-19T08:03:05Z,2.0,15012,1442,30,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s8544270,gpxvHmcfgBcA_ES{A]qAS_AMw@Gu@a@wBUgBEo@Y}AKyBI...,3,979,2023-01-11,everyone,8373136865,everyone,4,9:24,13:04,9:24,strava://segments/8544270/leaderboard,overall,All-Time,85087334.0,Pawe≈Ç Rakowiecki,https://graph.facebook.com/10225563450261374/p...,42 efforts in the last 90 days,42.0,42 efforts,20 efforts,strava://segments/8544270/local_legend?categor...,
4,6625302,3,Eisenbahnbr√ºcke - Garstnerbad,Run,939.4,-0.7,3.0,310.2,301.2,"[48.025566, 14.414919]","[48.019632, 14.413532]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Steyr,O√ñ,Austria,False,False,False,2014-02-09T11:06:16Z,2021-05-18T08:03:27Z,4.0,10409,954,15,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s6625302,w~bdHel~vA`@SxAe@|@a@dBe@fAe@hC{@bAWx@Kp@Ez@OZ...,3,372,2024-04-21,only_me,11232504363,everyone,1,3:01,3:31,2:51,strava://segments/6625302/leaderboard,overall,All-Time,189413086.0,Medina Hanic,avatar/athlete/large.png,29 efforts in the last 90 days,29.0,29 efforts,29 efforts,strava://segments/6625302/local_legend?categor...,


### Separate tables setup

In [9]:
dataframe_columns = {
  'activities' : [
    'id',
    'name',
    'distance',
    'moving_time',
    'elapsed_time',
    'total_elevation_gain',
    'type',
    'sport_type',
    'workout_type',
    'start_date',
    'start_date_local',
    'timezone',
    'achievement_count',
    'kudos_count',
    'comment_count',
    'athlete_count',
    'photo_count',
    'trainer',
    'commute',
    'manual',
    'visibility',
    'start_latlng',
    'end_latlng',
    'average_speed',
    'max_speed',
    'average_cadence',
    'average_watts',
    'max_watts',
    'weighted_average_watts',
    'has_heartrate',
    'average_heartrate',
    'max_heartrate',
    'elev_high',
    'elev_low',
    'pr_count',
    'total_photo_count',
    'suffer_score',
    'description',
    'calories',
    'device_name',
    'map_id',
    'gear_id'],
  'maps' : [
    'map_id',
    'map_polyline',
    'map_summary_polyline'],
  'gear' : [
    'gear_id',
    'gear_name',
    'gear_distance',
    'gear_converted_distance',
    'start_date',
    'start_date_local'],
  'segment_efforts' : [
    'id',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'start_index',
    'end_index',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'pr_rank',
    'achievements',
    'visibility',
    'kom_rank',
    'hidden',
    'activity_id',
    'segment_id'],
  'segments' : [
    'segment_id',
    'segment_name',
    'segment_activity_type',
    'segment_distance',
    'segment_average_grade',
    'segment_maximum_grade',
    'segment_elevation_high',
    'segment_elevation_low',
    'segment_start_latlng',
    'segment_end_latlng',
    'segment_climb_category',
    'segment_private',
    'segment_hazardous'],
  'laps' : [
    'id',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'average_speed',
    'max_speed',
    'lap_index',
    'split',
    'start_index',
    'end_index',
    'total_elevation_gain',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'pace_zone',
    'activity_id'],
  'best_efforts' : [
    'id',
    'activity_id',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'pr_rank',
    'achievements',
    'start_index',
    'end_index']
}

### Spliting data into tables

In [10]:
def select_cols(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
  """
  Select only the specified columns from a DataFrame if they exist.

  Parameters
  ----------
  df : pd.DataFrame
      The input DataFrame.
  cols : list of str
      List of column names to select.

  Returns
  -------
  pd.DataFrame
      A new DataFrame containing only the specified columns that exist 
      in the input DataFrame. If none of the columns exist, 
      an empty DataFrame is returned.
  """
  
  existing = [c for c in cols if c in df.columns]
  
  return df[existing].copy() if existing else pd.DataFrame()

def explode_normalize_json(df: pd.DataFrame, col: str, id_col: str | None = None, id_name: str | None = None) -> pd.DataFrame:
  """
  Explode a list-like column into multiple rows and normalize nested JSON/dict objects 
  into a flat tabular structure.

  This function is useful for columns containing arrays of JSON objects 
  (e.g. laps, segment efforts). Each element of the list becomes a separate row, 
  and nested fields are flattened into individual columns. Optionally, 
  a parent identifier column can be retained/renamed to act as a foreign key.

  Parameters
  ----------
  df : pd.DataFrame
      Input DataFrame containing the column to explode.
  col : str
      Name of the column with list- or dict-like values to explode and normalize.
  id_col : str, optional
      Name of the column in the input DataFrame to keep as a parent identifier.
      If provided, it will be included in the output.
  id_name : str, optional
      If provided together with `id_col`, renames the identifier column 
      in the result (e.g. from "id" to "activity_id").

  Returns
  -------
  pd.DataFrame
      A new DataFrame where:
        * each list element from `col` is a separate row,
        * nested JSON/dict objects are flattened into columns with names joined by "_",
        * the parent identifier (`id_col`) is preserved and optionally renamed.
      If `col` is missing or contains only null/empty values, 
      an empty DataFrame is returned.
  """

  if col not in df.columns:
    return pd.DataFrame()
  
  base_cols = [col]

  if id_col and id_col in df.columns:
    base_cols.insert(0, id_col)

  base = df[base_cols].copy()
  exploded = base.explode(col, ignore_index=True)
  values = exploded[col].dropna()

  if values.empty:
    return pd.DataFrame()
  
  norm = pd.json_normalize(values, sep='_')
  out = exploded.loc[values.index].drop(columns=[col]).reset_index(drop=True)
  res = pd.concat([out.reset_index(drop=True), norm.reset_index(drop=True)], axis=1)
  
  if id_col and id_name and id_col in df.columns:
    res = res.rename(columns={id_col: id_name})
  return res

In [11]:
# Activities
activities_cols = dataframe_columns['activities']
activities_df = select_cols(activities_details_df, activities_cols)
logging.info("DataFrame 'activities_df' created.")

# Maps
maps_cols = dataframe_columns['maps']
maps_df = select_cols(activities_details_df, maps_cols)
logging.info("DataFrame 'maps_df' created.")

# Gear
gear_cols = dataframe_columns['gear']
gear_df = select_cols(activities_details_df, gear_cols)
logging.info("DataFrame 'gear_df' created.")

# Segment efforts
seg_eff_cols = dataframe_columns['segment_efforts']
segments_eff_df = explode_normalize_json(activities_details_df, 'segment_efforts')
segments_eff_df = select_cols(segments_eff_df, seg_eff_cols)
logging.info("DataFrame 'segments_eff_df' created.")

# Segments
seg_cols = dataframe_columns['segments']
segments_df = explode_normalize_json(activities_details_df, 'segment_efforts')
segments_df = select_cols(segments_df, seg_cols)
logging.info("DataFrame 'segments_df' created.")

# Laps
lap_cols = dataframe_columns['laps']
laps_df = explode_normalize_json(activities_details_df, 'laps')
laps_df = select_cols(laps_df, lap_cols)
logging.info("DataFrame 'laps_df' created.")

# Best efforts
best_eff_cols = dataframe_columns['best_efforts']
best_eff_df = explode_normalize_json(activities_details_df, 'best_efforts')
best_eff_df = select_cols(best_eff_df, best_eff_cols)
logging.info("DataFrame 'best_eff_df' created.")

# All dataframes in dictionary
dataframes = {
    "activities": activities_df,
    "maps": maps_df,
    "gear": gear_df,
    "segment_efforts": segments_eff_df,
    "segments": segments_df,
    "laps": laps_df,
    "best_efforts": best_eff_df
}

2025-12-09 14:54:48,624 | INFO | DataFrame 'activities_df' created.
2025-12-09 14:54:48,625 | INFO | DataFrame 'maps_df' created.
2025-12-09 14:54:48,627 | INFO | DataFrame 'gear_df' created.
2025-12-09 14:54:48,703 | INFO | DataFrame 'segments_eff_df' created.
2025-12-09 14:54:48,772 | INFO | DataFrame 'segments_df' created.
2025-12-09 14:54:48,819 | INFO | DataFrame 'laps_df' created.
2025-12-09 14:54:48,835 | INFO | DataFrame 'best_eff_df' created.


### Activities Dataframe

In [12]:
def speed_to_pace_str(speed: float) -> str | None:
  """
  Convert speed in meters per second to running pace as a string.

  Parameters
  ----------
  speed : float
      Speed value in meters per second. Must be greater than zero.

  Returns
  -------
  str or None
      Running pace in the format "M:SS" representing minutes per kilometer.
      For example, "5:32" means 5 minutes and 32 seconds per kilometer.
      Returns None if the speed is less than or equal to zero.
  """

  if speed <= 0:
    return None
  
  seconds = 1000/speed
  minutes = int(seconds // 60)
  sec = int(round(seconds % 60))

  if sec == 60:
    minutes += 1
    sec = 0

  return f"{minutes}:{sec:02d}"

def speed_to_pace_float(speed: float) -> float | None:
  """
  Convert speed in meters per second to running pace as a float.

  Parameters
  ----------
  speed : float
      Speed value in meters per second. Must be greater than zero.

  Returns
  -------
  float or None
      Running pace in minutes per kilometer, represented as a float.
      For example, 5.53 means approximately 5 minutes and 32 seconds per kilometer.
      Returns None if the speed is less than or equal to zero.
  """

  if speed <= 0:
    return None
  
  return 1000 / speed / 60

def extract_timedelta(time: pd.Series) -> pd.Series:
  """
  Convert a Series of numeric values (seconds) into timedeltas.

  Parameters
  ----------
  time : pd.Series
      Series containing durations expressed in seconds (int/float). 
      Null values are preserved as None.

  Returns
  -------
  pd.Series
      Series of Python ``datetime.timedelta`` objects. 
      Each element corresponds to the given number of seconds or None if missing.
  """

  return pd.Series([(timedelta(seconds=int(t)) if pd.notnull(t) else None) for t in time], dtype="object")

def extract_latlng(latlng: pd.Series) -> pd.DataFrame:
  """
  Split a Series of latitude/longitude pairs into a DataFrame with separate columns.

  Parameters
  ----------
  latlng : pd.Series
      Series where each element is expected to be a list or tuple of length 2 
      (latitude, longitude). If the element is not a valid pair, it is replaced 
      with [None, None].

  Returns
  -------
  pd.DataFrame
      DataFrame with two columns:
        * first column: latitude
        * second column: longitude
      The index is preserved from the input Series.
  """
  latlng = latlng.apply(
    lambda row: row if isinstance(row, (list, tuple)) and len(row) == 2 else [None, None]
  )
  return pd.DataFrame(latlng.tolist(), index=latlng.index)

def etc_gmt_from_offset(minutes: int) -> str:
    """
    Convert a UTC offset (in minutes) to an IANA fixed-offset zone name ``Etc/GMT¬±N``.

    Parameters
    ----------
    minutes : int
        Offset from UTC in minutes.
        Positive values mean UTC+ (east of Greenwich), negative mean UTC‚àí (west).

    Returns
    -------
    str
        IANA timezone name. For whole-hour offsets the format is ``Etc/GMT¬±H``,
        e.g. ``Etc/GMT-2`` for +120 minutes and ``Etc/GMT+5`` for ‚àí300 minutes.
        For non-hour offsets, minutes are included, e.g. ``Etc/GMT-2:30``.

    Notes
    -----
    - The ``Etc/GMT`` naming convention uses an **inverted sign** relative to ISO 8601:
      ``UTC+02:00 ‚Üí Etc/GMT-2`` and ``UTC-05:00 ‚Üí Etc/GMT+5``.
    - ``Etc/GMT`` zones are fixed-offset and **do not observe DST**.

    Examples
    --------
    >>> etc_gmt_from_offset(120)
    'Etc/GMT-2'
    >>> etc_gmt_from_offset(-300)
    'Etc/GMT+5'
    >>> etc_gmt_from_offset(150)
    'Etc/GMT-2:30'
    """

    sign = '-' if minutes > 0 else '+'
    h, m = divmod(abs(minutes), 60)
    return f"Etc/GMT{sign}{h}" if m == 0 else f"Etc/GMT{sign}{h}:{m:02d}"

def create_datetime_tz_cols(df: pd.DataFrame, date_col: str, date_col_local: str) -> pd.DataFrame:
  """
  Derive a UTC timestamp and a fixed-offset timezone name from UTC and local datetimes.

  The function parses a UTC datetime column and a corresponding local datetime column
  (both representing the same instant), computes the offset in minutes
  ``local - utc``, and maps that offset to a fixed-offset IANA zone name
  using ``Etc/GMT¬±H[:MM]``. It returns a DataFrame with the UTC timestamp and
  the derived timezone name.

  Parameters
  ----------
  df : pd.DataFrame
      Input DataFrame containing the UTC and local datetime columns.
  date_col : str
      Name of the column with UTC datetimes (string or datetime-like).
      Values are parsed to a tz-aware UTC dtype.
  date_col_local : str
      Name of the column with local datetimes (string or datetime-like).
      Values are used only to infer the UTC offset.

  Returns
  -------
  pd.DataFrame
      A DataFrame with two columns:
        * ``start_date_utc_dt`` ‚Äî tz-aware UTC timestamp (dtype ``datetime64[ns, UTC]``),
        * ``tz`` ‚Äî fixed-offset IANA zone name in the ``Etc/GMT`` family
          (e.g., ``"Etc/GMT-2"`` for UTC+02:00, ``"Etc/GMT+5"`` for UTC‚àí05:00).

  Notes
  -----
  - ``Etc/GMT`` zones are fixed offsets and **do not observe DST**. The sign is
    intentionally inverted by IANA naming convention: UTC+02:00 ‚Üí ``Etc/GMT-2``.
  - Both columns must refer to the same moment in time; otherwise the inferred
    offset (and thus ``tz``) will be incorrect.

  Raises
  ------
  KeyError
      If ``date_col`` or ``date_col_local`` is missing in ``df``.
  ValueError
      If datetime parsing fails.
  """
  
  if date_col not in df.columns or date_col_local not in df.columns:
        raise KeyError(f"Missing required columns: {date_col}, {date_col_local}")

  temp_df = pd.DataFrame()
  temp_df["start_date_utc_dt"] = pd.to_datetime(df[date_col])
  temp_df["start_date_local_dt"] = pd.to_datetime(df[date_col_local])
  temp_df["utc_offset"] = (temp_df["start_date_local_dt"] - temp_df["start_date_utc_dt"]).dt.total_seconds() / 60

  temp_df["tz"] = temp_df["utc_offset"].apply(etc_gmt_from_offset)
  
  return temp_df[["start_date_utc_dt", "start_date_local_dt", "tz"]]

In [13]:
activities_df.loc[:, "moving_time_td"]  = extract_timedelta(activities_df["moving_time"])
activities_df.loc[:, "elapsed_time_td"] = extract_timedelta(activities_df["elapsed_time"])

activities_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(activities_df, "start_date", "start_date_local")
activities_df["tz_name"] = activities_df["timezone"].astype(str).str.split(" ").str[-1].str.strip()
activities_df["start_date_local_dt"] = activities_df.apply(lambda row: row["start_date_utc_dt"].tz_convert(row["tz_name"]), axis=1)

activities_df[["start_lat", "start_lng"]]  = extract_latlng(activities_df["start_latlng"])
activities_df[["end_lat", "end_lng"]] = extract_latlng(activities_df["end_latlng"])

is_run = activities_df['type'] == 'Run'

activities_df.loc[is_run, 'average_cadence'] = activities_df['average_cadence'].apply(lambda x: x * 2)

activities_df.loc[is_run, 'avg_pace_str'] = activities_df['average_speed'].apply(speed_to_pace_str)
activities_df.loc[is_run, 'avg_pace_float'] = activities_df['average_speed'].apply(speed_to_pace_float)

activities_df.loc[is_run, 'max_pace_str'] = activities_df['max_speed'].apply(speed_to_pace_str)
activities_df.loc[is_run, 'max_pace_float'] = activities_df['max_speed'].apply(speed_to_pace_float)

activities_df['gear_id'] = activities_df['gear_id'].fillna(NO_GEAR_ID)


In [14]:
activities_df['gear_id'] = np.where(
    (activities_df['start_date_utc_dt'] < '2020-01-01') &
    (activities_df['gear_id'] == 'g9239745'),
    NO_GEAR_ID,
    activities_df['gear_id']
)

In [15]:
activities_cols_clean = [
    'id',
    'name',
    'start_date_utc_dt',
    'start_date_local_dt',
    'local_timezone',
    'distance',
    'moving_time',
    'moving_time_td',
    'elapsed_time',
    'elapsed_time_td',
    'total_elevation_gain',
    'elev_low',
    'elev_high',
    'type',
    'sport_type',
    'workout_type',
    'achievement_count',
    'kudos_count',
    'comment_count',
    'athlete_count',
    'photo_count',
    'trainer',
    'commute',
    'manual',
    'visibility',
    'average_speed',
    'avg_pace_str',
    'avg_pace_float',
    'max_speed',
    'max_pace_str',
    'max_pace_float',
    'average_cadence',
    'average_watts',
    'max_watts',
    'weighted_average_watts',
    'has_heartrate',
    'average_heartrate',
    'max_heartrate',
    'pr_count',
    'total_photo_count',
    'suffer_score',
    'description',
    'calories',
    'device_name',
    'start_lat',
    'start_lng',
    'map_id',
    'gear_id'
]
activities_df = activities_df[activities_cols_clean]
activities_df = activities_df.sort_values(by='start_date_utc_dt', ascending=False)

In [16]:
activities_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,elev_low,elev_high,type,sport_type,workout_type,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,total_photo_count,suffer_score,description,calories,device_name,start_lat,start_lng,map_id,gear_id
1173,16691408683,Morning Yoga,2025-12-09 07:02:45+00:00,2025-12-09 08:02:45+01:00,Etc/GMT-1.0,0.0,4537,1:15:37,4537,1:15:37,0.0,0.0,0.0,Yoga,Yoga,31.0,0,1,0,1,0,True,False,False,everyone,0.0,,,0.0,,,,,,,True,77.4,123.0,0,0,6.0,,233.0,Garmin Forerunner 970,,,a16691408683,x00000000
1172,16684277150,6.5km Easy RunüôÇ‚Äç‚ÜîÔ∏è,2025-12-08 12:45:47+00:00,2025-12-08 13:45:47+01:00,Etc/GMT-1.0,6565.0,2317,0:38:37,2355,0:39:15,62.0,21.6,60.4,Run,Run,,4,8,0,1,0,False,False,False,everyone,2.833,5:53,5.883045,3.8,4:23,4.385965,168.6,351.5,536.0,347.0,True,142.8,155.0,3,0,25.0,Easy z g√≥rkamiüóª\n\n6.5km easy run at a convers...,510.0,Garmin Forerunner 970,52.755966,15.226957,a16684277150,g24134620
1171,16678081919,Drop Set Hill Repsüè°,2025-12-07 16:57:40+00:00,2025-12-07 17:57:40+01:00,Etc/GMT-1.0,9222.0,3286,0:54:46,3292,0:54:52,194.0,23.4,53.0,Run,Run,3.0,1,9,0,1,0,False,False,False,everyone,2.806,5:56,5.939653,4.5,3:42,3.703704,164.4,352.8,765.0,369.0,True,148.6,173.0,0,0,51.0,Gorzowskie g√≥ry > Wroc≈Çawskie pag√≥rkiüôÇ‚Äç‚ÜîÔ∏è\n\n2...,745.0,Garmin Forerunner 970,52.755944,15.226775,a16678081919,g24134620
1170,16663625820,24km Long Runüá™üá∏,2025-12-06 07:19:02+00:00,2025-12-06 08:19:02+01:00,Etc/GMT-1.0,24159.5,8178,2:16:18,9074,2:31:14,61.0,112.2,125.0,Run,Run,2.0,15,8,0,1,0,False,False,False,everyone,2.954,5:39,5.642067,4.54,3:40,3.671072,171.6,344.4,507.0,327.0,True,143.5,155.0,7,0,92.0,Mini Walencja w domu:\n\n(Trochƒô wiƒôcej stania...,1755.0,Garmin Forerunner 970,51.107712,17.124042,a16663625820,g23642256
1169,16648928719,12km Easy Runüòã,2025-12-04 14:20:16+00:00,2025-12-04 15:20:16+01:00,Etc/GMT-1.0,12328.4,4326,1:12:06,4366,1:12:46,42.0,106.6,120.6,Run,Run,,9,8,0,1,0,False,False,False,everyone,2.85,5:51,5.847953,3.48,4:47,4.789272,170.2,322.6,411.0,321.0,True,142.2,152.0,6,0,43.0,Lu≈∫ne klepanieü§ùüèª\n\n12km easy run at a convers...,960.0,Garmin Forerunner 970,51.107732,17.123874,a16648928719,g24134620


Load to PostgreSQL will be made after extracting location from coordinates

## Gear Dataframe

In [17]:
gear_df.columns = gear_df.columns.str.replace("^gear_", "", regex=True)
gear_df['id'] = gear_df['id'].fillna(NO_GEAR_ID)
gear_df['name'] = gear_df['name'].fillna('No gear')
gear_df = gear_df.fillna(0)
gear_df = gear_df.rename(columns={'distance' : 'distance_m', 'converted_distance' : 'distance_km'})
gear_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(gear_df, "start_date", "start_date_local")
gear_df = (
  gear_df.sort_values(by='start_date_utc_dt', ascending=False)
  .drop_duplicates(subset=["id"], keep="first")
  .reset_index(drop=True)
)
gear_df = gear_df[['id', 'name', 'distance_m', 'distance_km']]

In [18]:
gear_df.head()

Unnamed: 0,id,name,distance_m,distance_km
0,x00000000,No gear,0.0,0.0
1,g24134620,ASICS Novablast 5,579895.0,579.9
2,g23642256,Adidas EVO SL,345476.0,345.5
3,b13100260,Cube Nuroad Pro Wirtualnie,567349.0,567.3
4,g27111424,Nike Vaporfly 3,10036.0,10.0


In [19]:
gear_df_dtype_map = {
    "id": String,
    "name": String,
    "distance_m": Float,
    "distance_km": Float,
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{GEAR_S_TABLE} will be overwritten.")

gear_df.to_sql(
    name=GEAR_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=gear_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

## Segments efforts Dataframe

In [20]:
segments_types_df = segments_df[['segment_id', 'segment_activity_type']].copy()
segments_types_df.drop_duplicates(inplace=True)

segments_eff_df = pd.merge(segments_eff_df, segments_types_df, on='segment_id', how='left')
segments_eff_df = pd.merge(segments_eff_df.drop(columns="achievements"), explode_normalize_json(segments_eff_df, 'achievements', 'id'), on='id', how='left')

segments_eff_df.loc[:, "moving_time_td"]  = extract_timedelta(segments_eff_df["moving_time"])
segments_eff_df.loc[:, "elapsed_time_td"] = extract_timedelta(segments_eff_df["elapsed_time"])

segments_eff_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(segments_eff_df, "start_date", "start_date_local")

is_run = segments_eff_df['segment_activity_type'] == 'Run'

segments_eff_df.loc[is_run, 'average_cadence'] = segments_eff_df['average_cadence'].apply(lambda x: x * 2)

In [21]:
segments_eff_cols_clean = [
  'id',
  'name',
  'start_date_utc_dt',
  'start_date_local_dt',
  'local_timezone',
  'distance',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'average_cadence',
  'device_watts',
  'average_watts',
  'average_heartrate',
  'max_heartrate',
  'pr_rank',
  'visibility',
  'kom_rank',
  'hidden',
  'rank',
  'type',
  'activity_id',
  'segment_id'
]
segments_eff_df = segments_eff_df[segments_eff_cols_clean]

In [22]:
segments_eff_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,pr_rank,visibility,kom_rank,hidden,rank,type,activity_id,segment_id
0,3404548530909393958,Odcinek miƒôdzy kana≈Çami,2025-09-20 17:06:32+00:00,2025-09-20 19:06:32+00:00,Etc/GMT-2.0,1386.8,310,0:05:10,310,0:05:10,,False,,82.5,91.0,,followers_only,,False,,,15879687027,11065825
1,3404548530908871718,Most Swojczycki- zjazd pod mosty Jagielo≈Ñskie,2025-09-20 17:13:13+00:00,2025-09-20 19:13:13+00:00,Etc/GMT-2.0,2310.6,519,0:08:39,519,0:08:39,,False,,87.5,99.0,,followers_only,,False,,,15879687027,11740949
2,3404548530910437414,Kana≈Çowa,2025-09-20 17:23:51+00:00,2025-09-20 19:23:51+00:00,Etc/GMT-2.0,549.4,116,0:01:56,116,0:01:56,,False,,91.9,106.0,,followers_only,,False,,,15879687027,11584293
3,3404548530907957286,Stara Odra do centrum,2025-09-20 17:26:00+00:00,2025-09-20 19:26:00+00:00,Etc/GMT-2.0,1490.4,360,0:06:00,376,0:06:16,,False,,89.8,101.0,,followers_only,,False,,,15879687027,12380127
4,3404548530908100646,Z g√≥rki na pazurki (Odra-wa≈Çy-od Grunwaldzkiej...,2025-09-20 17:34:16+00:00,2025-09-20 19:34:16+00:00,Etc/GMT-2.0,443.8,101,0:01:41,101,0:01:41,,False,,93.6,107.0,,followers_only,,False,,,15879687027,7667882


In [23]:
segments_eff_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "start_date_utc_dt": DateTime(timezone=False),
    "start_date_local_dt": DateTime(timezone=False),
    "distance": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "average_cadence": Float,
    "device_watts": Boolean,
    "average_watts": Float,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_rank": Integer,
    "visibility": String,
    "kom_rank": Integer,
    "hidden": Boolean,
    "rank": Integer,
    "type": String,
    "activity_id": BigInteger,
    "segment_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{SEGMENTS_EFFORTS_S_TABLE} will be overwritten.")

segments_eff_df.to_sql(
    name=SEGMENTS_EFFORTS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=segments_eff_df_dtype_map,
    method="multi",
    chunksize=1000
)



-7

## Segments Dataframe

In [24]:
segments_df.columns = segments_df.columns.str.replace("^segment_", "", regex=True)

segments_df[["start_lat", "start_lng"]]  = extract_latlng(segments_df["start_latlng"])
segments_df[["end_lat", "end_lng"]] = extract_latlng(segments_df["end_latlng"])

segments_df = segments_df.drop(columns=["start_latlng", "end_latlng"])
segments_df = segments_df.drop_duplicates()

In [25]:
segments_df[segments_df['id'] == 13979314]

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,climb_category,private,hazardous,start_lat,start_lng,end_lat,end_lng
3487,13979314,Parkrun Wroc≈Çaw,Run,4988.0,0.1,11.2,132.6,126.2,0,False,False,51.100648,17.120626,51.10091,17.120358


In [26]:
segments_df.head()

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,climb_category,private,hazardous,start_lat,start_lng,end_lat,end_lng
0,11065825,Odcinek miƒôdzy kana≈Çami,Ride,1386.8,0.2,2.3,118.8,114.6,0,False,False,51.104562,17.123725,51.113834,17.110484
1,11740949,Most Swojczycki- zjazd pod mosty Jagielo≈Ñskie,Ride,2310.6,0.0,5.4,124.0,115.1,0,False,False,51.113875,17.107271,51.126718,17.083225
2,11584293,Kana≈Çowa,Ride,549.4,0.0,1.1,115.0,113.4,0,False,False,51.127217,17.075984,51.127622,17.068148
3,12380127,Stara Odra do centrum,Ride,1490.4,0.3,5.8,119.9,113.7,0,False,False,51.127377,17.067455,51.114944,17.072457
4,7667882,Z g√≥rki na pazurki (Odra-wa≈Çy-od Grunwaldzkiej...,Ride,443.8,-0.5,2.6,122.3,117.0,0,False,False,51.11465,17.072658,51.110929,17.071034


Load to PostgreSQL will be made after extracting location from coordinates

## Laps Dataframe

In [27]:
laps_types_df = activities_df[['id', 'type']].copy()
laps_types_df.drop_duplicates(inplace=True)
laps_df = pd.merge(laps_df, laps_types_df, left_on='activity_id', right_on='id', how='left')

In [28]:
laps_df.loc[:, "moving_time_td"]  = extract_timedelta(laps_df["moving_time"])
laps_df.loc[:, "elapsed_time_td"] = extract_timedelta(laps_df["elapsed_time"])

laps_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(laps_df, "start_date", "start_date_local")

is_run = laps_df['type'] == 'Run'

laps_df.loc[is_run, 'average_cadence'] = laps_df['average_cadence'].apply(lambda x: x * 2)

laps_df.loc[is_run, 'avg_pace_str'] = laps_df['average_speed'].apply(speed_to_pace_str)
laps_df.loc[is_run, 'avg_pace_float'] = laps_df['average_speed'].apply(speed_to_pace_float)

laps_df.loc[is_run, 'max_pace_str'] = laps_df['max_speed'].apply(speed_to_pace_str)
laps_df.loc[is_run, 'max_pace_float'] = laps_df['max_speed'].apply(speed_to_pace_float)


In [29]:
laps_cols_df_clean = [
    'id_x',
    'name',
    'lap_index',
    'split',
    'start_date_utc_dt',
    'start_date_local_dt',
    'local_timezone',
    'distance',
    'moving_time',
    'moving_time_td',
    'elapsed_time',
    'elapsed_time_td',
    'total_elevation_gain',
    'type',
    'average_speed',
    'avg_pace_str',
    'avg_pace_float',
    'pace_zone',
    'max_speed',
    'max_pace_str',
    'max_pace_float',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'activity_id'
]

laps_df = laps_df[laps_cols_df_clean]
laps_df = laps_df.rename(columns={'id_x': 'id'})

In [30]:
laps_df.head()

Unnamed: 0,id,name,lap_index,split,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,type,average_speed,avg_pace_str,avg_pace_float,pace_zone,max_speed,max_pace_str,max_pace_float,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,activity_id
0,56505060057,Lap 1,1,1,2025-09-19 13:02:12+00:00,2025-09-19 15:02:12+00:00,Etc/GMT-2.0,0.0,3673,1:01:13,3673,1:01:13,0.0,Workout,0.0,,,,0.0,,,,False,,101.1,153.0,15865360447
1,56559813306,Lap 1,1,1,2025-09-20 17:03:29+00:00,2025-09-20 19:03:29+00:00,Etc/GMT-2.0,5000.0,1153,0:19:13,1153,0:19:13,12.4,Ride,4.34,,,,5.98,,,,False,88.2,87.1,109.0,15879687027
2,56559813315,Lap 2,2,2,2025-09-20 17:22:44+00:00,2025-09-20 19:22:44+00:00,Etc/GMT-2.0,5000.0,1189,0:19:49,1204,0:20:04,15.8,Ride,4.21,,,,8.0,,,,False,88.5,92.9,116.0,15879687027
3,56559813324,Lap 3,3,3,2025-09-20 17:42:48+00:00,2025-09-20 19:42:48+00:00,Etc/GMT-2.0,5000.0,1007,0:16:47,1041,0:17:21,31.2,Ride,4.97,,,,9.98,,,,False,112.0,93.9,114.0,15879687027
4,56559813335,Lap 4,4,4,2025-09-20 18:00:10+00:00,2025-09-20 20:00:10+00:00,Etc/GMT-2.0,4351.3,915,0:15:15,976,0:16:16,10.6,Ride,4.76,,,,11.5,,,,False,104.3,91.8,134.0,15879687027


In [31]:
laps_df_dtype_map = {
"id": BigInteger,
"name": String,
"lap_index": Integer,
"split": Integer,
"start_date_utc_dt": DateTime(timezone=False),
"start_date_local_dt": DateTime(timezone=False),
'local_timezone' : String,
"distance": Float,
"moving_time": Integer,
"moving_time_td": Interval,
"elapsed_time": Integer,
"elapsed_time_td": Interval,
"total_elevation_gain": Float,
"type": String,
"average_speed": Float,
"avg_pace_str": String,
"avg_pace_float": Float,
"pace_zone": Float,
"max_speed": Float,
"max_pace_str": String,
"max_pace_float": Float,
"average_cadence": Float,
"device_watts": Boolean,
"average_watts": Float,
"average_heartrate": Float,
"max_heartrate": Float,
"activity_id": BigInteger,
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{LAPS_S_TABLE} will be overwritten.")

laps_df.to_sql(
    name=LAPS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=laps_df_dtype_map,
    method="multi",
    chunksize=1000
)




-9

## Best efforts Dataframe

In [32]:

best_eff_df.loc[:, "moving_time_td"]  = extract_timedelta(best_eff_df["moving_time"])
best_eff_df.loc[:, "elapsed_time_td"] = extract_timedelta(best_eff_df["elapsed_time"])

best_eff_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(best_eff_df, "start_date", "start_date_local")
best_eff_df = pd.merge(best_eff_df.drop(columns="achievements"), explode_normalize_json(best_eff_df, 'achievements', 'id'), on='id', how='left')

In [33]:
best_eff_df_cols_clean = [
    'id',
    'name',
    'start_date_utc_dt',
    'start_date_local_dt',
    'local_timezone',
    'distance',
    'moving_time',
    'moving_time_td',
    'elapsed_time',
    'elapsed_time_td',
    'rank',
    'type',
    'activity_id'
]
best_eff_df = best_eff_df[best_eff_df_cols_clean]

In [34]:
best_eff_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,rank,type,activity_id
0,31466450585,400m,2021-01-19 08:05:55+00:00,2021-01-19 09:05:55+00:00,Etc/GMT-1.0,400,142,0:02:22,142,0:02:22,,,4961015103
1,31466450824,1/2 mile,2021-01-19 08:03:43+00:00,2021-01-19 09:03:43+00:00,Etc/GMT-1.0,805,286,0:04:46,286,0:04:46,,,4961015103
2,31466451099,1K,2021-01-19 08:02:40+00:00,2021-01-19 09:02:40+00:00,Etc/GMT-1.0,1000,367,0:06:07,367,0:06:07,,,4961015103
3,31466451392,1 mile,2021-01-19 08:03:21+00:00,2021-01-19 09:03:21+00:00,Etc/GMT-1.0,1609,629,0:10:29,629,0:10:29,,,4961015103
4,31466450030,2 mile,2021-01-19 08:03:34+00:00,2021-01-19 09:03:34+00:00,Etc/GMT-1.0,3219,1323,0:22:03,1323,0:22:03,,,4961015103


In [35]:
best_eff_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "start_date_utc_dt": DateTime(timezone=False),
    "start_date_local_dt": DateTime(timezone=False),
    "local_timezone": String,
    "distance": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "rank": Integer,
    "type": String,
    "activity_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE} will be overwritten.")

best_eff_df.to_sql(
    name=BEST_EFFORTS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=best_eff_df_dtype_map,
    method="multi",
    chunksize=1000
)



-4

## Kudos Dataframe

In [36]:
kudos_df['full_name'] = kudos_df[['firstname', 'lastname']].astype('string').agg(' '.join, axis=1)
kudos_df_cols_clean = [
    'firstname',
    'lastname',
    'full_name',
    'activity_id'
]
kudos_df = kudos_df[kudos_df_cols_clean]
kudos_df = kudos_df.rename(columns={'firstname': 'first_name', 'lastname': 'last_name'})

In [37]:
kudos_df.head()

Unnamed: 0,first_name,last_name,full_name,activity_id
0,Kacper,G.,Kacper G.,15716821076
1,Jan,K.,Jan K.,15716821076
2,Jacek,S.,Jacek S.,15716821076
3,Ola,≈Å.,Ola ≈Å.,15716821076
4,Kacper,K.,Kacper K.,15716821076


In [38]:
kudos_dtype_map = {
    "firs_tname": String,
    "last_name": String,
    "full_name": String,
    "activity_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{KUDOS_S_TABLE} will be overwritten.")

kudos_df.to_sql(
    name=KUDOS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=kudos_dtype_map,
    method="multi",
    chunksize=1000
)



-9

## Activities Zones Dataframe

In [39]:
def zones_sort_add_names(bucket: list, zone_type: str) -> list:
  """
  Sorts a list of zone dicts by their 'min' boundary, assigns a 1-based
  'zone_number' to each entry, and sets a human-readable 'zone_name'
  based on the provided zone type.

  Behavior
  --------
  - The list is sorted ascending by the 'min' key.
  - Each zone dict is mutated in place by adding:
      * 'zone_number' (int): 1, 2, 3, ...
      * 'zone_name' (str): depends on `type` and the assigned number.
  - For type='heartrate':
      1‚Üí'Z1 - Recovery', 2‚Üí'Z2 - Endurance', 3‚Üí'Z3 - Tempo',
      4‚Üí'Z4 - Threshold', 5‚Üí'Z5 - Anaerobic'; other numbers ‚Üí ''.
  - For type='pace':
      1‚Üí'Z1 - Recovery', 2‚Üí'Z2 - Endurance', 3‚Üí'Z3 - Tempo',
      4‚Üí'Z4 - Threshold', 5‚Üí'Z5 - VO2 Max', 6‚Üí'Z6 - Anaerobic';
      other numbers ‚Üí ''.
  - Any other `type` produces an empty 'zone_name' for all entries.

  Parameters
  ----------
  bucket : list[dict]
      List of dictionaries representing zones. Each dict must contain
      a numeric 'min' key used for sorting.
      NOTE: The dictionaries are modified in place.
  type : str
      Zone classification to use for naming. Expected values:
      'heartrate' or 'pace'. Others fall back to empty names.

  Returns
  -------
  list[dict]
      A new list with the same dict objects, sorted by 'min' and
      with 'zone_number' and 'zone_name' added.

  Raises
  ------
  KeyError
      If any dict is missing the 'min' key.
  TypeError
      If 'min' values are not comparable (e.g., non-numeric types).
  """

  if zone_type not in ['heartrate', 'pace']:
        raise ValueError(f"Unsupported zone_type: {zone_type!r}")

  sorted_bucket = sorted(bucket, key=lambda d: d['min'])

  for i, zone in enumerate(sorted_bucket):
    zone['zone_number'] = i + 1

    if zone_type == 'heartrate':
      match zone['zone_number']:
        case 1:
          zone['zone_name'] = 'Z1 - Recovery'
        case 2:
          zone['zone_name'] = 'Z2 - Endurance'
        case 3:
          zone['zone_name'] = 'Z3 - Tempo'
        case 4:
          zone['zone_name'] = 'Z4 - Threshold'
        case 5:
          zone['zone_name'] = 'Z5 - Anaerobic'
        case _:
          zone['zone_name'] = ''
    elif zone_type == 'pace':
      match zone['zone_number']:
        case 1:
          zone['zone_name'] = 'Z1 - Recovery'
        case 2:
          zone['zone_name'] = 'Z2 - Endurance'
        case 3:
          zone['zone_name'] = 'Z3 - Tempo'
        case 4:
          zone['zone_name'] = 'Z4 - Threshold'
        case 5:
          zone['zone_name'] = 'Z5 - VO2 Max'
        case 6:
          zone['zone_name'] = 'Z6 - Anaerobic'
        case _:
          zone['zone_name'] = ''
    else:
      zone['zone_name'] = ''
  
  return sorted_bucket

In [40]:
zones_df = activities_zones_df.copy()

In [41]:
zones_df = zones_df[zones_df['type'].isin(['heartrate', 'pace'])]

In [42]:
zones_df['distribution_buckets_clean'] = zones_df.apply(
  lambda row: zones_sort_add_names(row['distribution_buckets'], row['type']), axis=1)
zones_df = pd.merge(zones_df.drop(columns="distribution_buckets_clean"), explode_normalize_json(zones_df, 'distribution_buckets_clean', 'id'), on='id', how='left')
zones_df['id'] = zones_df[['activity_id', 'type', 'zone_number']].astype('string').agg('-'.join, axis=1)


In [43]:
zones_df.head(15)

Unnamed: 0,score,distribution_buckets,type,resource_state,sensor_based,points,custom_zones,activity_id,id,max,min,time,zone_number,zone_name
0,39.0,"[{'max': 133, 'min': 0, 'time': 73.0, 'zone_nu...",heartrate,3,True,0.0,True,15923268347,15923268347-heartrate-1,133.0,0.0,73.0,1,Z1 - Recovery
1,39.0,"[{'max': 133, 'min': 0, 'time': 73.0, 'zone_nu...",heartrate,3,True,0.0,True,15923268347,15923268347-heartrate-2,147.0,134.0,1706.0,2,Z2 - Endurance
2,39.0,"[{'max': 133, 'min': 0, 'time': 73.0, 'zone_nu...",heartrate,3,True,0.0,True,15923268347,15923268347-heartrate-3,160.0,148.0,1290.0,3,Z3 - Tempo
3,39.0,"[{'max': 133, 'min': 0, 'time': 73.0, 'zone_nu...",heartrate,3,True,0.0,True,15923268347,15923268347-heartrate-4,166.0,161.0,0.0,4,Z4 - Threshold
4,39.0,"[{'max': 133, 'min': 0, 'time': 73.0, 'zone_nu...",heartrate,3,True,0.0,True,15923268347,15923268347-heartrate-5,-1.0,167.0,0.0,5,Z5 - Anaerobic
5,7.0,"[{'max': 2.542, 'min': 0, 'time': 0.0, 'zone_n...",pace,3,True,,,15923268347,15923268347-pace-1,2.542,0.0,0.0,1,Z1 - Recovery
6,7.0,"[{'max': 2.542, 'min': 0, 'time': 0.0, 'zone_n...",pace,3,True,,,15923268347,15923268347-pace-2,2.953,2.542,1409.0,2,Z2 - Endurance
7,7.0,"[{'max': 2.542, 'min': 0, 'time': 0.0, 'zone_n...",pace,3,True,,,15923268347,15923268347-pace-3,3.289,2.953,1638.0,3,Z3 - Tempo
8,7.0,"[{'max': 2.542, 'min': 0, 'time': 0.0, 'zone_n...",pace,3,True,,,15923268347,15923268347-pace-4,3.514,3.289,22.0,4,Z4 - Threshold
9,7.0,"[{'max': 2.542, 'min': 0, 'time': 0.0, 'zone_n...",pace,3,True,,,15923268347,15923268347-pace-5,3.738,3.514,0.0,5,Z5 - VO2 Max


In [44]:
zones_df_cols_clean = [
    'id',
    'activity_id',
    'type',
    'zone_number',
    'zone_name',
    'time',
    'min',
    'max'
]
zones_df = zones_df[zones_df_cols_clean]

In [45]:
zones_df.head()

Unnamed: 0,id,activity_id,type,zone_number,zone_name,time,min,max
0,15923268347-heartrate-1,15923268347,heartrate,1,Z1 - Recovery,73.0,0.0,133.0
1,15923268347-heartrate-2,15923268347,heartrate,2,Z2 - Endurance,1706.0,134.0,147.0
2,15923268347-heartrate-3,15923268347,heartrate,3,Z3 - Tempo,1290.0,148.0,160.0
3,15923268347-heartrate-4,15923268347,heartrate,4,Z4 - Threshold,0.0,161.0,166.0
4,15923268347-heartrate-5,15923268347,heartrate,5,Z5 - Anaerobic,0.0,167.0,-1.0


In [46]:
zones_dtype_map = {
    "id": String,
    "activity_id": BigInteger,
    "type": String,
    'zone_number': Integer,
    "zone_name": String,
    "time": Float,
    "min": Float,
    "max": Float
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{ZONES_S_TABLE} will be overwritten.")

zones_df.to_sql(
    name=ZONES_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=zones_dtype_map,
    method="multi",
    chunksize=1000
)



-8

## Relative Effort Dataframe

In [47]:
relative_effort_df = activities_zones_df.copy()

In [48]:
relative_effort_df = relative_effort_df[relative_effort_df['type'] == 'heartrate']
relative_effort_df = relative_effort_df[['activity_id', 'score']]
relative_effort_df = relative_effort_df.rename(columns={'score': 'relative_effort'})

In [49]:
relative_effort_df.head()

Unnamed: 0,activity_id,relative_effort
0,15923268347,39.0
3,8254517069,17.0
5,8252893698,13.0
6,8247520568,23.0
8,8239641985,16.0


In [50]:
relative_effort_dtype_map = {
    "activity_id": BigInteger,
    "relative_effort": Float
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{RELATIVE_EFFORT_S_TABLE} will be overwritten.")

relative_effort_df.to_sql(
    name=RELATIVE_EFFORT_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=relative_effort_dtype_map,
    method="multi",
    chunksize=1000
)



-2

## Segment Details Dataframe

In [51]:
segment_details_df = segment_detailed_df.copy()

In [52]:
segment_detailed_df

Unnamed: 0,id,resource_state,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,start_latlng,end_latlng,elevation_profile,climb_category,city,state,country,private,hazardous,starred,created_at,updated_at,total_elevation_gain,effort_count,athlete_count,star_count,elevation_profiles_light_url,elevation_profiles_dark_url,map_id,map_polyline,map_resource_state,athlete_segment_stats_pr_elapsed_time,athlete_segment_stats_pr_date,athlete_segment_stats_pr_visibility,athlete_segment_stats_pr_activity_id,athlete_segment_stats_pr_activity_visibility,athlete_segment_stats_effort_count,xoms_kom,xoms_qom,xoms_overall,xoms_destination_href,xoms_destination_type,xoms_destination_name,local_legend_athlete_id,local_legend_title,local_legend_profile,local_legend_effort_description,local_legend_effort_count,local_legend_effort_counts_overall,local_legend_effort_counts_female,local_legend_destination,local_legend
0,19517444,3,BRZE≈πNO-SOPOT,Run,3550.0,0.1,6.1,8.4,1.2,"[54.409473, 18.636221]","[54.428292, 18.593357]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Gda≈Ñsk,pomorskie,Poland,False,False,False,2018-12-20T11:44:06Z,2021-05-19T08:04:48Z,10.2,14917,4655,1,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s19517444,ezakIk{vpBe@~BWnBYlAe@jCs@tCc@rBy@hE{@bGYhDOj@...,3,1222,2024-11-23,everyone,12964095995,everyone,1,11:32,14:48,11:32,strava://segments/19517444/leaderboard,overall,All-Time,152263214.0,Igor Kania,https://dgalywyr863hv.cloudfront.net/pictures/...,24 efforts in the last 90 days,24.0,24 efforts,13 efforts,strava://segments/19517444/local_legend?catego...,
1,34463750,3,Aleja Jana Paw≈Ça II / John Paul II Avenue,Run,897.0,-0.0,1.3,115.8,114.7,"[52.235861, 20.997782]","[52.228274, 21.002288]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Warsaw,Masovian Voivodeship,Poland,False,False,False,2023-05-23T10:03:44Z,2023-05-23T10:20:35Z,0.0,21810,14546,0,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s34463750,ciy}Hccd_CxBUj@PnA_@f@Gd@Ob@W`@a@J@hAi@Vk@Hs@B...,3,246,2025-11-11,everyone,16426332819,everyone,1,1:53,2:56,1:53,strava://segments/34463750/leaderboard,overall,All-Time,64841306.0,Bartek Ryczek,https://dgalywyr863hv.cloudfront.net/pictures/...,6 efforts in the last 90 days,6.0,6 efforts,4 efforts,strava://segments/34463750/local_legend?catego...,
2,17455116,3,Folwarczna ≈õluza do swojczyckiego,Run,1533.3,0.1,0.8,115.0,112.9,"[51.104281, 17.124089]","[51.114371, 17.109618]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Wroc≈Çaw,Wojew√≥dztwo dolno≈õlƒÖskie,Poland,False,False,False,2018-04-22T13:45:42Z,2021-05-15T02:20:56Z,2.1,26055,3021,3,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s17455116,wh|vHopogBGX[b@]v@[\IVuAfBeCjD}ApCs@z@oBbDi@n@...,3,498,2021-10-25,everyone,6165241355,everyone,38,5:03,6:10,5:03,strava://segments/17455116/leaderboard,overall,All-Time,105315738.0,Marcin Przepi√≥rkowski,https://lh3.googleusercontent.com/a/ACg8ocJUZd...,26 efforts in the last 90 days,26.0,26 efforts,14 efforts,strava://segments/17455116/local_legend?catego...,
3,8544270,3,Wschodni ~2.8k Lap,Run,2784.3,0.0,11.0,101.2,99.0,"[51.085002, 17.075916]","[51.084648, 17.075653]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Wroc≈Çaw,Wojew√≥dztwo dolno≈õlƒÖskie,Poland,False,False,False,2014-11-22T16:03:40Z,2021-05-19T08:03:05Z,2.0,15012,1442,30,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s8544270,gpxvHmcfgBcA_ES{A]qAS_AMw@Gu@a@wBUgBEo@Y}AKyBI...,3,979,2023-01-11,everyone,8373136865,everyone,4,9:24,13:04,9:24,strava://segments/8544270/leaderboard,overall,All-Time,85087334.0,Pawe≈Ç Rakowiecki,https://graph.facebook.com/10225563450261374/p...,42 efforts in the last 90 days,42.0,42 efforts,20 efforts,strava://segments/8544270/local_legend?categor...,
4,6625302,3,Eisenbahnbr√ºcke - Garstnerbad,Run,939.4,-0.7,3.0,310.2,301.2,"[48.025566, 14.414919]","[48.019632, 14.413532]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Steyr,O√ñ,Austria,False,False,False,2014-02-09T11:06:16Z,2021-05-18T08:03:27Z,4.0,10409,954,15,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s6625302,w~bdHel~vA`@SxAe@|@a@dBe@fAe@hC{@bAWx@Kp@Ez@OZ...,3,372,2024-04-21,only_me,11232504363,everyone,1,3:01,3:31,2:51,strava://segments/6625302/leaderboard,overall,All-Time,189413086.0,Medina Hanic,avatar/athlete/large.png,29 efforts in the last 90 days,29.0,29 efforts,29 efforts,strava://segments/6625302/local_legend?categor...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,20789221,3,Marat√≥n de Valencia 13¬∫ km,Run,1007.5,0.3,5.5,11.6,5.3,"[39.481807, -0.351915]","[39.490245, -0.354608]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Val√®ncia,Comunidad Valenciana,Spain,False,False,False,2019-06-05T10:40:34Z,2021-05-20T08:05:07Z,9.4,104347,68081,11,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s20789221,gh~oFnvcAUKy@k@UKc@Ii@DkBVoA\{AV_BTWLu@RmANc@L...,3,391,2024-12-01,everyone,13025308514,everyone,1,2:47,3:18,2:47,strava://segments/20789221/leaderboard,overall,All-Time,127911140.0,Rafa Herrero Gil,avatar/athlete/large.png,153 efforts in the last 90 days,153.0,153 efforts,10 efforts,strava://segments/20789221/local_legend?catego...,
642,26572793,3,Od ronda do Namys≈Çowskiej,Run,609.0,0.6,8.8,80.8,74.8,"[52.263606, 21.022472]","[52.266444, 21.03009]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Warszawa,Mazowieckie,Poland,False,False,False,2020-11-28T15:38:20Z,2021-05-15T08:06:33Z,6.0,23389,10492,8,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s26572793,ov~}Hm}h_CSCa@oAGg@GU]y@e@{@y@wBmCgJ}@}DG}@Q{@...,3,211,2022-03-27,everyone,6890108798,everyone,1,1:42,2:05,1:42,strava://segments/26572793/leaderboard,overall,All-Time,174323121.0,Man On Skins,https://dgalywyr863hv.cloudfront.net/pictures/...,45 efforts in the last 90 days,45.0,45 efforts,32 efforts,strava://segments/26572793/local_legend?catego...,
643,12382202,3,Calle Peru selva/josep pla,Run,398.2,-0.5,6.5,16.6,11.7,"[41.413611, 2.202658]","[41.416092, 2.206025]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Barcelona,Catalunya,Spain,False,False,False,2016-06-16T18:13:08Z,2021-05-20T08:03:51Z,4.9,64040,47030,8,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s12382202,arw{FqemLi@aAQUIUqA}BQWwA{Ai@}@a@e@iAuBWYq@gAME,3,152,2023-02-19,everyone,8586119117,everyone,1,1:05,1:18,1:04,strava://segments/12382202/leaderboard,overall,All-Time,122677592.0,Fran Aguirre Valverde,https://dgalywyr863hv.cloudfront.net/pictures/...,15 efforts in the last 90 days,15.0,15 efforts,2 efforts,strava://segments/12382202/local_legend?catego...,
644,8548350,3,an der kleinen Weser,Run,1291.0,-0.4,3.3,12.5,5.0,"[53.073067, 8.801243]","[53.063243, 8.810368]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Bremen,Bremen,Germany,False,False,False,2014-11-23T12:26:29Z,2021-05-17T08:05:27Z,4.4,9266,1455,13,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s8548350,sy|bIw~ut@N@TMf@a@\Sz@q@bBgARUAa@^AJ@h@KP?FA~@...,3,501,2019-05-27,everyone,6012655362,everyone,1,4:29,5:34,4:29,strava://segments/8548350/leaderboard,overall,All-Time,188445404.0,Lasse Portillo,https://lh3.googleusercontent.com/a/ACg8ocLwHW...,25 efforts in the last 90 days,25.0,25 efforts,19 efforts,strava://segments/8548350/local_legend?categor...,


In [53]:
segment_details_df

Unnamed: 0,id,resource_state,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,start_latlng,end_latlng,elevation_profile,climb_category,city,state,country,private,hazardous,starred,created_at,updated_at,total_elevation_gain,effort_count,athlete_count,star_count,elevation_profiles_light_url,elevation_profiles_dark_url,map_id,map_polyline,map_resource_state,athlete_segment_stats_pr_elapsed_time,athlete_segment_stats_pr_date,athlete_segment_stats_pr_visibility,athlete_segment_stats_pr_activity_id,athlete_segment_stats_pr_activity_visibility,athlete_segment_stats_effort_count,xoms_kom,xoms_qom,xoms_overall,xoms_destination_href,xoms_destination_type,xoms_destination_name,local_legend_athlete_id,local_legend_title,local_legend_profile,local_legend_effort_description,local_legend_effort_count,local_legend_effort_counts_overall,local_legend_effort_counts_female,local_legend_destination,local_legend
0,19517444,3,BRZE≈πNO-SOPOT,Run,3550.0,0.1,6.1,8.4,1.2,"[54.409473, 18.636221]","[54.428292, 18.593357]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Gda≈Ñsk,pomorskie,Poland,False,False,False,2018-12-20T11:44:06Z,2021-05-19T08:04:48Z,10.2,14917,4655,1,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s19517444,ezakIk{vpBe@~BWnBYlAe@jCs@tCc@rBy@hE{@bGYhDOj@...,3,1222,2024-11-23,everyone,12964095995,everyone,1,11:32,14:48,11:32,strava://segments/19517444/leaderboard,overall,All-Time,152263214.0,Igor Kania,https://dgalywyr863hv.cloudfront.net/pictures/...,24 efforts in the last 90 days,24.0,24 efforts,13 efforts,strava://segments/19517444/local_legend?catego...,
1,34463750,3,Aleja Jana Paw≈Ça II / John Paul II Avenue,Run,897.0,-0.0,1.3,115.8,114.7,"[52.235861, 20.997782]","[52.228274, 21.002288]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Warsaw,Masovian Voivodeship,Poland,False,False,False,2023-05-23T10:03:44Z,2023-05-23T10:20:35Z,0.0,21810,14546,0,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s34463750,ciy}Hccd_CxBUj@PnA_@f@Gd@Ob@W`@a@J@hAi@Vk@Hs@B...,3,246,2025-11-11,everyone,16426332819,everyone,1,1:53,2:56,1:53,strava://segments/34463750/leaderboard,overall,All-Time,64841306.0,Bartek Ryczek,https://dgalywyr863hv.cloudfront.net/pictures/...,6 efforts in the last 90 days,6.0,6 efforts,4 efforts,strava://segments/34463750/local_legend?catego...,
2,17455116,3,Folwarczna ≈õluza do swojczyckiego,Run,1533.3,0.1,0.8,115.0,112.9,"[51.104281, 17.124089]","[51.114371, 17.109618]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Wroc≈Çaw,Wojew√≥dztwo dolno≈õlƒÖskie,Poland,False,False,False,2018-04-22T13:45:42Z,2021-05-15T02:20:56Z,2.1,26055,3021,3,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s17455116,wh|vHopogBGX[b@]v@[\IVuAfBeCjD}ApCs@z@oBbDi@n@...,3,498,2021-10-25,everyone,6165241355,everyone,38,5:03,6:10,5:03,strava://segments/17455116/leaderboard,overall,All-Time,105315738.0,Marcin Przepi√≥rkowski,https://lh3.googleusercontent.com/a/ACg8ocJUZd...,26 efforts in the last 90 days,26.0,26 efforts,14 efforts,strava://segments/17455116/local_legend?catego...,
3,8544270,3,Wschodni ~2.8k Lap,Run,2784.3,0.0,11.0,101.2,99.0,"[51.085002, 17.075916]","[51.084648, 17.075653]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Wroc≈Çaw,Wojew√≥dztwo dolno≈õlƒÖskie,Poland,False,False,False,2014-11-22T16:03:40Z,2021-05-19T08:03:05Z,2.0,15012,1442,30,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s8544270,gpxvHmcfgBcA_ES{A]qAS_AMw@Gu@a@wBUgBEo@Y}AKyBI...,3,979,2023-01-11,everyone,8373136865,everyone,4,9:24,13:04,9:24,strava://segments/8544270/leaderboard,overall,All-Time,85087334.0,Pawe≈Ç Rakowiecki,https://graph.facebook.com/10225563450261374/p...,42 efforts in the last 90 days,42.0,42 efforts,20 efforts,strava://segments/8544270/local_legend?categor...,
4,6625302,3,Eisenbahnbr√ºcke - Garstnerbad,Run,939.4,-0.7,3.0,310.2,301.2,"[48.025566, 14.414919]","[48.019632, 14.413532]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Steyr,O√ñ,Austria,False,False,False,2014-02-09T11:06:16Z,2021-05-18T08:03:27Z,4.0,10409,954,15,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s6625302,w~bdHel~vA`@SxAe@|@a@dBe@fAe@hC{@bAWx@Kp@Ez@OZ...,3,372,2024-04-21,only_me,11232504363,everyone,1,3:01,3:31,2:51,strava://segments/6625302/leaderboard,overall,All-Time,189413086.0,Medina Hanic,avatar/athlete/large.png,29 efforts in the last 90 days,29.0,29 efforts,29 efforts,strava://segments/6625302/local_legend?categor...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,20789221,3,Marat√≥n de Valencia 13¬∫ km,Run,1007.5,0.3,5.5,11.6,5.3,"[39.481807, -0.351915]","[39.490245, -0.354608]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Val√®ncia,Comunidad Valenciana,Spain,False,False,False,2019-06-05T10:40:34Z,2021-05-20T08:05:07Z,9.4,104347,68081,11,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s20789221,gh~oFnvcAUKy@k@UKc@Ii@DkBVoA\{AV_BTWLu@RmANc@L...,3,391,2024-12-01,everyone,13025308514,everyone,1,2:47,3:18,2:47,strava://segments/20789221/leaderboard,overall,All-Time,127911140.0,Rafa Herrero Gil,avatar/athlete/large.png,153 efforts in the last 90 days,153.0,153 efforts,10 efforts,strava://segments/20789221/local_legend?catego...,
642,26572793,3,Od ronda do Namys≈Çowskiej,Run,609.0,0.6,8.8,80.8,74.8,"[52.263606, 21.022472]","[52.266444, 21.03009]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Warszawa,Mazowieckie,Poland,False,False,False,2020-11-28T15:38:20Z,2021-05-15T08:06:33Z,6.0,23389,10492,8,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s26572793,ov~}Hm}h_CSCa@oAGg@GU]y@e@{@y@wBmCgJ}@}DG}@Q{@...,3,211,2022-03-27,everyone,6890108798,everyone,1,1:42,2:05,1:42,strava://segments/26572793/leaderboard,overall,All-Time,174323121.0,Man On Skins,https://dgalywyr863hv.cloudfront.net/pictures/...,45 efforts in the last 90 days,45.0,45 efforts,32 efforts,strava://segments/26572793/local_legend?catego...,
643,12382202,3,Calle Peru selva/josep pla,Run,398.2,-0.5,6.5,16.6,11.7,"[41.413611, 2.202658]","[41.416092, 2.206025]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Barcelona,Catalunya,Spain,False,False,False,2016-06-16T18:13:08Z,2021-05-20T08:03:51Z,4.9,64040,47030,8,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s12382202,arw{FqemLi@aAQUIUqA}BQWwA{Ai@}@a@e@iAuBWYq@gAME,3,152,2023-02-19,everyone,8586119117,everyone,1,1:05,1:18,1:04,strava://segments/12382202/leaderboard,overall,All-Time,122677592.0,Fran Aguirre Valverde,https://dgalywyr863hv.cloudfront.net/pictures/...,15 efforts in the last 90 days,15.0,15 efforts,2 efforts,strava://segments/12382202/local_legend?catego...,
644,8548350,3,an der kleinen Weser,Run,1291.0,-0.4,3.3,12.5,5.0,"[53.073067, 8.801243]","[53.063243, 8.810368]",https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,0,Bremen,Bremen,Germany,False,False,False,2014-11-23T12:26:29Z,2021-05-17T08:05:27Z,4.4,9266,1455,13,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s8548350,sy|bIw~ut@N@TMf@a@\Sz@q@bBgARUAa@^AJ@h@KP?FA~@...,3,501,2019-05-27,everyone,6012655362,everyone,1,4:29,5:34,4:29,strava://segments/8548350/leaderboard,overall,All-Time,188445404.0,Lasse Portillo,https://lh3.googleusercontent.com/a/ACg8ocLwHW...,25 efforts in the last 90 days,25.0,25 efforts,19 efforts,strava://segments/8548350/local_legend?categor...,


In [54]:
segment_details_df['local_legend_profile'] = segment_details_df['local_legend_profile'].replace('avatar/athlete/large.png', 'https://d3nn82uaxijpm6.cloudfront.net/sweaters/assets/large.png')

In [55]:
segment_details_df['created_date'] = pd.to_datetime(segment_details_df['created_at']).dt.date
segment_details_df['updated_date'] = pd.to_datetime(segment_details_df['updated_at']).dt.date

In [56]:
segment_details_df_cols_clean = [
    'id',
    'elevation_profile',
    'created_date',
    'updated_date',
    'total_elevation_gain',
    'effort_count',
    'athlete_count',
    'star_count',
    'elevation_profiles_light_url',
    'elevation_profiles_dark_url',
    'map_id',
    'athlete_segment_stats_pr_elapsed_time',
    'athlete_segment_stats_pr_date',
    'athlete_segment_stats_pr_visibility',
    'athlete_segment_stats_pr_activity_id',
    'athlete_segment_stats_pr_activity_visibility',
    'athlete_segment_stats_effort_count',
    'xoms_kom',
    'xoms_qom',
    'xoms_overall',
    'xoms_destination_type',
    'xoms_destination_name',
    'local_legend_athlete_id',
    'local_legend_title',
    'local_legend_profile',
    'local_legend_effort_description',
    'local_legend_effort_count',
    'local_legend_effort_counts_overall'
]
segment_details_df = segment_details_df[segment_details_df_cols_clean]

In [57]:
segment_details_df.head()

Unnamed: 0,id,elevation_profile,created_date,updated_date,total_elevation_gain,effort_count,athlete_count,star_count,elevation_profiles_light_url,elevation_profiles_dark_url,map_id,athlete_segment_stats_pr_elapsed_time,athlete_segment_stats_pr_date,athlete_segment_stats_pr_visibility,athlete_segment_stats_pr_activity_id,athlete_segment_stats_pr_activity_visibility,athlete_segment_stats_effort_count,xoms_kom,xoms_qom,xoms_overall,xoms_destination_type,xoms_destination_name,local_legend_athlete_id,local_legend_title,local_legend_profile,local_legend_effort_description,local_legend_effort_count,local_legend_effort_counts_overall
0,19517444,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,2018-12-20,2021-05-19,10.2,14917,4655,1,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s19517444,1222,2024-11-23,everyone,12964095995,everyone,1,11:32,14:48,11:32,overall,All-Time,152263214.0,Igor Kania,https://dgalywyr863hv.cloudfront.net/pictures/...,24 efforts in the last 90 days,24.0,24 efforts
1,34463750,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,2023-05-23,2023-05-23,0.0,21810,14546,0,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s34463750,246,2025-11-11,everyone,16426332819,everyone,1,1:53,2:56,1:53,overall,All-Time,64841306.0,Bartek Ryczek,https://dgalywyr863hv.cloudfront.net/pictures/...,6 efforts in the last 90 days,6.0,6 efforts
2,17455116,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,2018-04-22,2021-05-15,2.1,26055,3021,3,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s17455116,498,2021-10-25,everyone,6165241355,everyone,38,5:03,6:10,5:03,overall,All-Time,105315738.0,Marcin Przepi√≥rkowski,https://lh3.googleusercontent.com/a/ACg8ocJUZd...,26 efforts in the last 90 days,26.0,26 efforts
3,8544270,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,2014-11-22,2021-05-19,2.0,15012,1442,30,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s8544270,979,2023-01-11,everyone,8373136865,everyone,4,9:24,13:04,9:24,overall,All-Time,85087334.0,Pawe≈Ç Rakowiecki,https://graph.facebook.com/10225563450261374/p...,42 efforts in the last 90 days,42.0,42 efforts
4,6625302,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,2014-02-09,2021-05-18,4.0,10409,954,15,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,https://d3o5xota0a1fcr.cloudfront.net/v6/chart...,s6625302,372,2024-04-21,only_me,11232504363,everyone,1,3:01,3:31,2:51,overall,All-Time,189413086.0,Medina Hanic,https://d3nn82uaxijpm6.cloudfront.net/sweaters...,29 efforts in the last 90 days,29.0,29 efforts


## Segment Maps Dataframe

In [58]:
segment_maps_df = segment_detailed_df.copy()

In [59]:
segment_maps_df_cols_clean = [
    'map_id',
    'map_polyline'
]
segment_maps_df = segment_maps_df[segment_maps_df_cols_clean]

In [60]:
segment_maps_df.columns = segment_maps_df.columns.str.replace("^map_", "", regex=True)

## Maps Dataframe

In [61]:
maps_df.columns = maps_df.columns.str.replace("^map_", "", regex=True)

In [62]:
maps_cols_clean = [
  'id',
  'polyline'
]
maps_df = maps_df[maps_cols_clean]

In [63]:
maps_df = pd.concat([maps_df, segment_maps_df])

In [64]:
maps_df['latlng'] = maps_df.apply(
  lambda row: polyline.decode(row['polyline']) if isinstance(row['polyline'], str) else [], axis=1
)
maps_df = maps_df.explode('latlng', ignore_index=False)
maps_df['point_id'] = maps_df.groupby(level=0).cumcount()
maps_df[['lat','lng']] = extract_latlng(maps_df['latlng'])
maps_df = maps_df.drop(columns=['polyline', 'latlng'])

In [65]:
maps_df.head()

Unnamed: 0,id,point_id,lat,lng
0,a15865360447,0,,
1,a15879687027,0,51.10732,17.12439
1,a15879687027,1,51.10729,17.12443
1,a15879687027,2,51.10717,17.12455
1,a15879687027,3,51.10692,17.12486


In [66]:
maps_df_dtype_map = {
    "id": String,
    "point_id": Integer,
    "lat": Float,
    "lng": Float
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{MAPS_S_TABLE} will be overwritten.")

maps_df.to_sql(
    name=MAPS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=maps_df_dtype_map,
    method="multi",
    chunksize=1000
)



-440

## Decoding coordinates with geopy

### Create list of unique locations

In [67]:
lat_lng_points_df = pd.concat([activities_df[['start_lat', 'start_lng']], segments_df[['start_lat', 'start_lng']]]).drop_duplicates()

In [68]:
geo = Nominatim(user_agent=USER_AGENT)
reverse = RateLimiter(geo.reverse, min_delay_seconds=1.1, max_retries=3, error_wait_seconds=5)

def cache_key(lat, lng, precision):
  """
  Create a stable cache key for a coordinate cell.

  Parameters
  ----------
  lat : float
      Latitude in decimal degrees.
  lng : float
      Longitude in decimal degrees.
  precision : int
      Number of decimal places used to format the coordinates.
      This effectively defines the grid cell size (e.g., 2 dp ‚âà city level).

  Returns
  -------
  str
      Key formatted as ``"lat,lng"`` with fixed precision, e.g. ``"51.11,17.02"``.
      Using a string avoids floating-point representation issues and works as a JSON key.
  """

  return f"{lat:.{precision}f},{lng:.{precision}f}"

def pick_locality(addr):
  """
  Select a locality (city/town/village) from a Nominatim ``address`` mapping.

  The function returns the first non-empty value in the following order of preference:
  ``city`` ‚Üí ``town`` ‚Üí ``village`` ‚Üí ``hamlet`` ‚Üí ``municipality`` ‚Üí ``locality`` ‚Üí ``county`` (fallback).

  Parameters
  ----------
  addr : Mapping[str, str]
      The ``address`` object from a Nominatim response (``loc.raw['address']``).

  Returns
  -------
  str or None
      Locality name or ``None`` if none of the keys are available.

  Notes
  -----
  Including ``county`` as the last-resort fallback may return a county name
  in places where a true locality is missing in OSM data (useful in some countries),
  but it can be more general than a town/city.
  """
  
  return (addr.get("city") or addr.get("town") or addr.get("village") or addr.get("hamlet") or addr.get("municipality") or addr.get("locality") or addr.get("county"))

def pick_region(addr):
  """
  Select a region/state from a Nominatim ``address`` mapping.

  The function returns the first non-empty value in the following order of preference:
  ``state`` ‚Üí ``region`` ‚Üí ``state_district`` ‚Üí ``province`` ‚Üí ``county`` (fallback).

  Parameters
  ----------
  addr : Mapping[str, str]
      The ``address`` object from a Nominatim response (``loc.raw['address']``).

  Returns
  -------
  str or None
      Region/administrative area name or ``None`` if not present.

  Notes
  -----
  In Poland and many countries ``state`` corresponds to the top-level region
  (e.g., voivodeship/province). ``county`` is typically a lower level and is used
  here only as a fallback for countries where counties act as primary regions.
  """
  
  return (addr.get("state") or addr.get("region") or addr.get("state_district") or addr.get("province") or addr.get("county"))

def address_fields(reverse_fn, lat, lng):
  """
  Perform reverse geocoding and extract minimal address fields.

  Parameters
  ----------
  reverse_fn : Callable
      A function compatible with ``geopy.Nominatim.reverse`` (optionally wrapped
      with ``RateLimiter``) that accepts ``(lat, lng)`` and returns a Location-like
      object with a ``.raw`` dict payload.
  lat : float
      Latitude in decimal degrees.
  lng : float
      Longitude in decimal degrees.

  Returns
  -------
  dict
      Dictionary with three keys:
      - ``locality`` : str or None ‚Äî city/town/village (best available),
      - ``region``   : str or None ‚Äî region/state/province,
      - ``country``  : str or None ‚Äî country name.

  Notes
  -----
  The function calls ``reverse_fn`` with ``language="en"`` and ``addressdetails=True``.
  Change the language parameter if localized names are desired.
  """

  loc = reverse_fn((lat, lng), language="en", addressdetails=True)
  if not loc:
      return {"locality": None, "region": None, "country": None}
  address = (loc.raw or {}).get("address", {})
  return {
      "locality": pick_locality(address),
      "region": pick_region(address),
      "country": address.get("country")
  }

def decode_coordinates(coordinates_df: pd.DataFrame, lat_col: str, lng_col: str, cache_path: str, precision: int, geo_reverse_fn) -> pd.DataFrame:
  """
  Append ``locality``, ``region``, and ``country`` to a DataFrame using reverse geocoding with caching.

  For each unique coordinate pair (rounded to ``precision`` decimal places), the function performs
  a reverse geocode via ``geo_reverse_fn`` and stores results in a JSON cache. Subsequent runs read
  from the cache to minimize API calls.

  Parameters
  ----------
  coordinates_df : pd.DataFrame
      Input DataFrame containing coordinate columns.
  lat_col : str
      Name of the latitude column (e.g., ``"start_lat"``).
  lng_col : str
      Name of the longitude column (e.g., ``"start_lng"``).
  cache_path : str
      File path to the JSON cache. The file will be created/updated as needed.
  precision : int
      Number of decimal places for rounding coordinates and building the cache key
      (e.g., 2 ‚âà city-level granularity).
  geo_reverse_fn : Callable
      Reverse geocoding function (typically a ``RateLimiter(Nominatim.reverse, ...)``).

  Returns
  -------
  pd.DataFrame
      A copy of the input DataFrame with three additional columns:
      ``locality``, ``region``, and ``country``.

  Raises
  ------
  KeyError
      If the required ``lat_col`` or ``lng_col`` is missing from ``coordinates_df``.
  json.JSONDecodeError
      If the cache file exists but contains invalid JSON.
  Exception
      Any exception propagated from the reverse geocoding function or file I/O.

  Notes
  -----
  - The function writes the cache to ``cache_path`` whenever a new key is added.
  - A small sleep (``time.sleep(0.5)``) is used per new lookup; adjust to comply with
    your provider's rate limits (public Nominatim typically requires ‚â§1 request/sec).
  - Change the language in ``address_fields`` if you need localized names.
  """

  temp_df = coordinates_df.copy()

  if lat_col not in temp_df.columns or lng_col not in temp_df.columns:
    raise KeyError(f"Missing required columns: {lat_col}, {lng_col}")
  
  temp_df['lat_round'] = temp_df[lat_col].round(precision)
  temp_df['lng_round'] = temp_df[lng_col].round(precision)
  temp_df = temp_df.dropna(subset=[lat_col, lng_col])
  lat_lng_df = temp_df[['lat_round', 'lng_round']].drop_duplicates().reset_index(drop=True)

  cache = {}
  if os.path.exists(cache_path):
    with open(cache_path, "r", encoding="utf-8") as f:
      cache = json.load(f)

  records = []
  for i, row in tqdm(lat_lng_df.iterrows(), total=lat_lng_df.shape[0]):

    key = cache_key(row['lat_round'], row['lng_round'], precision)

    if key in cache:
      resp = cache[key]

    else:
      resp = address_fields(geo_reverse_fn, row['lat_round'], row['lng_round'])
      cache[key] = resp
      time.sleep(0.5)

      with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, ensure_ascii=False, indent=2)

    records.append({"lat_round": row['lat_round'], "lng_round": row['lng_round'], **resp})

  loc_df = pd.DataFrame(records)
  
  result = temp_df.merge(loc_df, on=["lat_round","lng_round"], how="left")
  result = result.drop(columns=["lat_round","lng_round"])

  return result

In [69]:
lat_lng_points_df = decode_coordinates(lat_lng_points_df, 'start_lat', 'start_lng', CACHE_PATH, PRECISION, reverse)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1620/1620 [00:01<00:00, 1234.37it/s]


In [70]:
lat_lng_points_df.head()

Unnamed: 0,start_lat,start_lng,locality,region,country
0,52.755966,15.226957,Gorzow Wielkopolski,Lubusz Voivodeship,Poland
1,52.755944,15.226775,Gorzow Wielkopolski,Lubusz Voivodeship,Poland
2,51.107712,17.124042,Wroc≈Çaw,Lower Silesian Voivodeship,Poland
3,51.107732,17.123874,Wroc≈Çaw,Lower Silesian Voivodeship,Poland
4,38.787526,-123.411065,Mendocino County,California,United States


In [71]:
locations_df = lat_lng_points_df[['locality', 'region', 'country']].copy()
locations_df = locations_df.drop_duplicates()
locations_df = locations_df.sort_values(by=['country', 'region', 'locality']).reset_index(drop=True)
locations_df['location_id'] = 1000 + np.arange(len(locations_df))
locations_df = locations_df[['location_id', 'country', 'region', 'locality']]

In [72]:
locations_df.head()

Unnamed: 0,location_id,country,region,locality
0,1000,Austria,Lower Austria,Sankt Michael am Bruckbach
1,1001,Austria,Lower Austria,Waidhofen an der Ybbs
2,1002,Austria,Upper Austria,Garsten
3,1003,Austria,Upper Austria,Gr√ºnburg
4,1004,Austria,Upper Austria,Linz


In [73]:
lat_lng_points_df = pd.merge(lat_lng_points_df, locations_df, how='left', on=['country', 'region', 'locality'])

In [74]:
lat_lng_points_df.head()

Unnamed: 0,start_lat,start_lng,locality,region,country,location_id
0,52.755966,15.226957,Gorzow Wielkopolski,Lubusz Voivodeship,Poland,1203
1,52.755944,15.226775,Gorzow Wielkopolski,Lubusz Voivodeship,Poland,1203
2,51.107712,17.124042,Wroc≈Çaw,Lower Silesian Voivodeship,Poland,1183
3,51.107732,17.123874,Wroc≈Çaw,Lower Silesian Voivodeship,Poland,1183
4,38.787526,-123.411065,Mendocino County,California,United States,1246


In [75]:
activities_df = pd.merge(activities_df, lat_lng_points_df[['start_lat', 'start_lng', 'location_id']], how='left', on=['start_lat', 'start_lng'])

In [76]:
activities_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,elev_low,elev_high,type,sport_type,workout_type,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,total_photo_count,suffer_score,description,calories,device_name,start_lat,start_lng,map_id,gear_id,location_id
0,16691408683,Morning Yoga,2025-12-09 07:02:45+00:00,2025-12-09 08:02:45+01:00,Etc/GMT-1.0,0.0,4537,1:15:37,4537,1:15:37,0.0,0.0,0.0,Yoga,Yoga,31.0,0,1,0,1,0,True,False,False,everyone,0.0,,,0.0,,,,,,,True,77.4,123.0,0,0,6.0,,233.0,Garmin Forerunner 970,,,a16691408683,x00000000,
1,16684277150,6.5km Easy RunüôÇ‚Äç‚ÜîÔ∏è,2025-12-08 12:45:47+00:00,2025-12-08 13:45:47+01:00,Etc/GMT-1.0,6565.0,2317,0:38:37,2355,0:39:15,62.0,21.6,60.4,Run,Run,,4,8,0,1,0,False,False,False,everyone,2.833,5:53,5.883045,3.8,4:23,4.385965,168.6,351.5,536.0,347.0,True,142.8,155.0,3,0,25.0,Easy z g√≥rkamiüóª\n\n6.5km easy run at a convers...,510.0,Garmin Forerunner 970,52.755966,15.226957,a16684277150,g24134620,1203.0
2,16678081919,Drop Set Hill Repsüè°,2025-12-07 16:57:40+00:00,2025-12-07 17:57:40+01:00,Etc/GMT-1.0,9222.0,3286,0:54:46,3292,0:54:52,194.0,23.4,53.0,Run,Run,3.0,1,9,0,1,0,False,False,False,everyone,2.806,5:56,5.939653,4.5,3:42,3.703704,164.4,352.8,765.0,369.0,True,148.6,173.0,0,0,51.0,Gorzowskie g√≥ry > Wroc≈Çawskie pag√≥rkiüôÇ‚Äç‚ÜîÔ∏è\n\n2...,745.0,Garmin Forerunner 970,52.755944,15.226775,a16678081919,g24134620,1203.0
3,16663625820,24km Long Runüá™üá∏,2025-12-06 07:19:02+00:00,2025-12-06 08:19:02+01:00,Etc/GMT-1.0,24159.5,8178,2:16:18,9074,2:31:14,61.0,112.2,125.0,Run,Run,2.0,15,8,0,1,0,False,False,False,everyone,2.954,5:39,5.642067,4.54,3:40,3.671072,171.6,344.4,507.0,327.0,True,143.5,155.0,7,0,92.0,Mini Walencja w domu:\n\n(Trochƒô wiƒôcej stania...,1755.0,Garmin Forerunner 970,51.107712,17.124042,a16663625820,g23642256,1183.0
4,16648928719,12km Easy Runüòã,2025-12-04 14:20:16+00:00,2025-12-04 15:20:16+01:00,Etc/GMT-1.0,12328.4,4326,1:12:06,4366,1:12:46,42.0,106.6,120.6,Run,Run,,9,8,0,1,0,False,False,False,everyone,2.85,5:51,5.847953,3.48,4:47,4.789272,170.2,322.6,411.0,321.0,True,142.2,152.0,6,0,43.0,Lu≈∫ne klepanieü§ùüèª\n\n12km easy run at a convers...,960.0,Garmin Forerunner 970,51.107732,17.123874,a16648928719,g24134620,1183.0


In [77]:
activities_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "start_date_utc_dt": DateTime(timezone=False),
    "start_date_local_dt": DateTime(timezone=False),
    "local_timezone":  String,
    "distance": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "total_elevation_gain": Float,
    "elev_low": Float,
    "elev_high": Float,
    "type": String,
    "sport_type": String,
    "workout_type": Integer,
    "achievement_count": Integer,
    "kudos_count": Integer,
    "comment_count": Integer,
    "athlete_count": Integer,
    "photo_count": Integer,
    "trainer": Boolean,
    "commute": Boolean,
    "manual": Boolean,
    "visibility": String,
    "average_speed": Float,
    "avg_pace_str": String,
    "avg_pace_float": Float,
    "max_speed": Float,
    "max_pace_str": String,
    "max_pace_float": Float,
    "average_cadence": Float,
    "average_watts": Float,
    "max_watts": Float,
    "weighted_average_watts": Float,
    "has_heartrate": Boolean,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_count": Integer,
    "total_photo_count": Integer,
    "suffer_score": Float,
    "description": Text,
    "calories": Float,
    "device_name": String,
    'start_lat' : Float,
    'start_lng' : Float,
    "map_id": String,
    "gear_id": String,
    "location_id": Integer
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE} will be overwritten.")

activities_df.to_sql(
    name=ACTIVITIES_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=activities_df_dtype_map,
    method="multi",
    chunksize=1000
)



-2

In [78]:
activities_df.dtypes

id                                      int64
name                                   object
start_date_utc_dt         datetime64[ns, UTC]
start_date_local_dt                    object
local_timezone                         object
distance                              float64
moving_time                             int64
moving_time_td                         object
elapsed_time                            int64
elapsed_time_td                        object
total_elevation_gain                  float64
elev_low                              float64
elev_high                             float64
type                                   object
sport_type                             object
workout_type                          float64
achievement_count                       int64
kudos_count                             int64
comment_count                           int64
athlete_count                           int64
photo_count                             int64
trainer                           

In [79]:
segments_df = pd.merge(segments_df, lat_lng_points_df[['start_lat', 'start_lng',  'location_id']], how='left', on=['start_lat', 'start_lng'])

In [80]:
segments_df[segments_df['id'] == 13979314]

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,climb_category,private,hazardous,start_lat,start_lng,end_lat,end_lng,location_id
1123,13979314,Parkrun Wroc≈Çaw,Run,4988.0,0.1,11.2,132.6,126.2,0,False,False,51.100648,17.120626,51.10091,17.120358,1183


In [81]:
segments_df = pd.merge(segments_df, segment_details_df, how='left', on='id')

In [82]:
segments_df.head()

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,climb_category,private,hazardous,start_lat,start_lng,end_lat,end_lng,location_id,elevation_profile,created_date,updated_date,total_elevation_gain,effort_count,athlete_count,star_count,elevation_profiles_light_url,elevation_profiles_dark_url,map_id,athlete_segment_stats_pr_elapsed_time,athlete_segment_stats_pr_date,athlete_segment_stats_pr_visibility,athlete_segment_stats_pr_activity_id,athlete_segment_stats_pr_activity_visibility,athlete_segment_stats_effort_count,xoms_kom,xoms_qom,xoms_overall,xoms_destination_type,xoms_destination_name,local_legend_athlete_id,local_legend_title,local_legend_profile,local_legend_effort_description,local_legend_effort_count,local_legend_effort_counts_overall
0,11065825,Odcinek miƒôdzy kana≈Çami,Ride,1386.8,0.2,2.3,118.8,114.6,0,False,False,51.104562,17.123725,51.113834,17.110484,1183,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,11740949,Most Swojczycki- zjazd pod mosty Jagielo≈Ñskie,Ride,2310.6,0.0,5.4,124.0,115.1,0,False,False,51.113875,17.107271,51.126718,17.083225,1183,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,11584293,Kana≈Çowa,Ride,549.4,0.0,1.1,115.0,113.4,0,False,False,51.127217,17.075984,51.127622,17.068148,1183,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,12380127,Stara Odra do centrum,Ride,1490.4,0.3,5.8,119.9,113.7,0,False,False,51.127377,17.067455,51.114944,17.072457,1183,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,7667882,Z g√≥rki na pazurki (Odra-wa≈Çy-od Grunwaldzkiej...,Ride,443.8,-0.5,2.6,122.3,117.0,0,False,False,51.11465,17.072658,51.110929,17.071034,1183,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [83]:
segments_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "activity_type": String,
    "distance": Float,
    "average_grade": Float,
    "maximum_grade": Float,
    "elevation_high": Float,
    "elevation_low": Float,
    "climb_category": Float,
    "private": Boolean,
    "hazardous": Boolean,
    "start_lat": Float,
    "start_lng": Float,
    "end_lat": Float,
    "end_lng": Float,
    "location_id": Integer,
    "elevation_profile": Text,
    "created_date": Date,
    "updated_date": Date,
    "total_elevation_gain": Float,
    "effort_count": Integer,
    "athlete_count": Integer,
    "star_count": Integer,
    "elevation_profiles_light_url": Text,
    "elevation_profiles_dark_url": Text,
    "map_id": Text,
    "athlete_segment_stats_pr_elapsed_time": Text,
    "athlete_segment_stats_pr_date": Text,
    "athlete_segment_stats_pr_visibility": Text,
    "athlete_segment_stats_pr_activity_id": BigInteger,
    "athlete_segment_stats_pr_activity_visibility": Text,
    "athlete_segment_stats_effort_count": Integer,
    "xoms_kom": Text,
    "xoms_qom": Text,
    "xoms_overall": Text,
    "xoms_destination_type": Text,
    "xoms_destination_name": Text,
    "local_legend_athlete_id": BigInteger,
    "local_legend_title": Text,
    "local_legend_profile": Text,
    "local_legend_effort_description": Text,
    "local_legend_effort_count": Integer,
    "local_legend_effort_counts_overall": Text
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE} will be overwritten.")

segments_df.to_sql(
    name=SEGMENTS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=segments_df_dtype_map,
    method="multi",
    chunksize=1000
)



-2

In [84]:
locations_df = locations_df.rename(columns={'location_id': 'id'})

In [85]:
locations_df.head()

Unnamed: 0,id,country,region,locality
0,1000,Austria,Lower Austria,Sankt Michael am Bruckbach
1,1001,Austria,Lower Austria,Waidhofen an der Ybbs
2,1002,Austria,Upper Austria,Garsten
3,1003,Austria,Upper Austria,Gr√ºnburg
4,1004,Austria,Upper Austria,Linz


In [86]:
locations_df_dtype_map = {
    "id": Integer,
    "locality": String,
    "region	": String,
    "country": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE} will be overwritten.")

locations_df.to_sql(
    name=LOCATIONS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=locations_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Primary keys definition

In [87]:
keys_instructions= [
    f"""CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};""",
    # ********** PRIMARY KEYS **********
    # --- activities ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE}
          ADD CONSTRAINT {ACTIVITIES_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- gear ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{GEAR_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{GEAR_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{GEAR_S_TABLE}
          ADD CONSTRAINT {GEAR_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- segments ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE}
          ADD CONSTRAINT {SEGMENTS_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- laps ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{LAPS_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{LAPS_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{LAPS_S_TABLE}
          ADD CONSTRAINT {LAPS_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- best efforts ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE}
          ADD CONSTRAINT {BEST_EFFORTS_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- locations ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE}
          ADD CONSTRAINT {LOCATIONS_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- zones ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{ZONES_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{ZONES_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{ZONES_S_TABLE}
          ADD CONSTRAINT {ZONES_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- relative_effort ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{RELATIVE_EFFORT_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{RELATIVE_EFFORT_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{RELATIVE_EFFORT_S_TABLE}
          ADD CONSTRAINT {RELATIVE_EFFORT_S_TABLE}_pkey PRIMARY KEY (activity_id);
      END IF;
    END $$;
    """
]

### Create constrains

In [88]:
with engine.begin() as conn:
    for sql in keys_instructions:
        conn.execute(text(sql))