In [1]:
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
from sqlalchemy import Integer, Float, String, Boolean, DateTime, Interval, Text, BigInteger
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import JSONB
from datetime import timezone, timedelta
from tqdm import tqdm

pd.set_option('display.max_columns', None)

In [2]:
# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter
# from requests.adapters import HTTPAdapter
# from urllib3.util.retry import Retry

# geolocator = Nominatim(user_agent="strava_analtytics_app", scheme="https")
# reverse = RateLimiter(
#   geolocator.reverse, 
#   min_delay_seconds=1.5, 
#   max_retries=3, 
#   error_wait_seconds=3.0, 
#   swallow_exceptions=True)

# session = geolocator.adapter.session
# retry = Retry(
#     total=5,
#     connect=5, read=5,
#     backoff_factor=1.5,
#     status_forcelist=[429, 500, 502, 503, 504],
#     raise_on_status=False,
# )
# adapter = HTTPAdapter(max_retries=retry)
# session.mount("https://", adapter)
# session.mount("http://", adapter)

### Load evn variables

In [3]:
load_dotenv()

True

### SQL setup

In [4]:
engine = create_engine(os.getenv('DB_URI'))

## Request data

### All activities Dataframe

In [5]:
activitie_list_query = "SELECT * FROM bronze.activities"
activities_list_df = pd.read_sql(activitie_list_query, engine)

### All activities with details Dataframe

In [6]:
activities_details_query = "SELECT * FROM bronze.activities_details"
activities_details_df = pd.read_sql(activities_details_query, engine)

### All kudos Dataframe

In [7]:
kudos_query = "SELECT * FROM bronze.kudos"
kudos_df = pd.read_sql(kudos_query, engine)

### Separate tables setup

In [8]:
dataframe_columns = {
  'activities' : [
    'id',
    'name',
    'distance',
    'moving_time',
    'elapsed_time',
    'total_elevation_gain',
    'type',
    'sport_type',
    'workout_type',
    'start_date',
    'start_date_local',
    'timezone',
    'achievement_count',
    'kudos_count',
    'comment_count',
    'athlete_count',
    'photo_count',
    'trainer',
    'commute',
    'manual',
    'visibility',
    'start_latlng',
    'end_latlng',
    'average_speed',
    'max_speed',
    'average_cadence',
    'average_watts',
    'max_watts',
    'weighted_average_watts',
    'has_heartrate',
    'average_heartrate',
    'max_heartrate',
    'elev_high',
    'elev_low',
    'pr_count',
    'total_photo_count',
    'suffer_score',
    'description',
    'calories',
    'device_name',
    'map_id',
    'gear_id'],
  'maps' : [
    'map_id',
    'map_polyline',
    'map_summary_polyline'],
  'gear' : [
    'gear_id',
    'gear_primary',
    'gear_name',
    'gear_nickname',
    'gear_retired',
    'gear_distance',
    'gear_converted_distance'],
  'segment_efforts' : [
    'id',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'start_index',
    'end_index',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'pr_rank',
    'achievements',
    'visibility',
    'kom_rank',
    'hidden',
    'activity_id',
    'segment_id'],
  'segments' : [
    'segment_id',
    'segment_name',
    'segment_activity_type',
    'segment_distance',
    'segment_average_grade',
    'segment_maximum_grade',
    'segment_elevation_high',
    'segment_elevation_low',
    'segment_start_latlng',
    'segment_end_latlng',
    'segment_elevation_profile',
    'segment_elevation_profiles',
    'segment_climb_category',
    'segment_city',
    'segment_state',
    'segment_country',
    'segment_private',
    'segment_hazardous',
    'segment_starred'],
  'laps' : [
    'id',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'average_speed',
    'max_speed',
    'lap_index',
    'split',
    'start_index',
    'end_index',
    'total_elevation_gain',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'pace_zone',
    'activity_id'],
  'best_efforts' : [
    'id',
    'activity_id',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'pr_rank',
    'achievements',
    'start_index',
    'end_index']
}

### Spliting data into tables

In [9]:
def select_cols(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
  """
  Selects only the specified columns from a DataFrame if they exist.

  Parameters
  ----------
  df : pd.DataFrame
      The input DataFrame.
  columns : list of str
      List of column names to select.

  Returns
  -------
  pd.DataFrame
      A new DataFrame containing only the specified columns that exist 
      in the input DataFrame. If none of the columns exist, 
      an empty DataFrame is returned.
  """
  return df[[c for c in cols if c in df.columns]].copy()

def explode_normalize_json(df: pd.DataFrame, col: str) -> pd.DataFrame:
  """
  Explodes a list-like column in a DataFrame and normalizes nested JSON records into a flat table.

  Parameters
  ----------
  df : pd.DataFrame
      Input DataFrame containing a column with list-like or dictionary-like structures.
  col : str
      The name of the column to explode and normalize.

  Returns
  -------
  pd.DataFrame
      A new DataFrame with the exploded and normalized JSON records.
      If the column does not exist or contains only empty values, 
      an empty DataFrame is returned.
  """

  if col not in df.columns:
    return pd.DataFrame()
  
  exploded = df.explode(col).reset_index(drop=True)
  exploded_values = exploded[col].dropna()

  if exploded_values.empty:
    return pd.DataFrame()
  
  return pd.json_normalize(exploded_values, sep='_')

In [10]:
# Activities
activities_cols = dataframe_columns['activities']
activities_df = activities_details_df[[c for c in activities_cols if c in activities_details_df.columns]].copy()

# Maps
maps_cols = dataframe_columns['maps']
maps_df = activities_details_df[[c for c in maps_cols if c in activities_details_df.columns]].copy()


# Gear
gear_cols = dataframe_columns['gear']
gear_df = activities_details_df[[c for c in gear_cols if c in activities_details_df.columns]].copy()
gear_df = gear_df.drop_duplicates()

# Segment efforts
seg_eff_cols = dataframe_columns['segment_efforts']
segments_eff_exploded_df = activities_details_df.copy().explode('segment_efforts').reset_index(drop=True)
segments_eff_df = pd.json_normalize(segments_eff_exploded_df['segment_efforts'], sep='_')
segments_eff_df = segments_eff_df[[c for c in seg_eff_cols if c in segments_eff_df.columns]].dropna(how="all")

# Segments
seg_cols = dataframe_columns['segments']
segments_exploded_df = activities_details_df.copy().explode('segment_efforts').reset_index(drop=True)
segments_df = pd.json_normalize(segments_exploded_df['segment_efforts'], sep='_')
segments_df = segments_df[[c for c in seg_cols if c in segments_df.columns]].dropna(how="all")

# Laps
lap_cols = dataframe_columns['laps']
laps_exploded_df = activities_details_df.copy().explode('laps').reset_index(drop=True)
laps_df = pd.json_normalize(laps_exploded_df['laps'], sep='_')
laps_df = laps_df[[c for c in lap_cols if c in laps_df.columns]]

# Best efforts
best_eff_cols = dataframe_columns['best_efforts']
best_eff_exploded_df = activities_details_df.copy().explode('best_efforts').reset_index(drop=True)
best_eff_df = pd.json_normalize(best_eff_exploded_df['best_efforts'], sep='_')
best_eff_df = best_eff_df[[c for c in lap_cols if c in best_eff_df.columns]].dropna(how="all")

# All dataframes in dictoinary
dataframes = {
    "activities": activities_df,
    "maps": maps_df,
    "gear": gear_df,
    "segment_efforts": segments_eff_df,
    "segments": segments_df,
    "laps": laps_df,
    "best_efforts": best_eff_df,
    "kudos" : kudos_df
}

In [11]:
workout_types = [
    {"id": 0.0, "type": "Running - None"},
    {"id": 1.0, "type": "Running - Race"},
    {"id": 2.0, "type": "Running - Long Run"},
    {"id": 3.0, "type": "Running - Workout"},
    {"id": 10.0, "type": "Riding - None"},
    {"id": 11.0, "type": "Riding - Race"},
    {"id": 12.0, "type": "Riding - Race"},
    {"id": 20.0, "type": "Other"}
]

## Activities Dataframe

In [12]:
def speed_to_pace_str(speed: float) -> str | None:
  """
  Converts speed in meters per second to running pace in the format "M:SS per km".

  Parameters
  ----------
  speed : float
      Speed value in meters per second. Must be greater than zero.

  Returns
  -------
  str or None
      A string representing the pace in minutes and seconds per kilometer 
      (e.g., "5:32"). Returns None if the speed is less than or equal to zero.
  """

  if speed <= 0:
    return None
  
  seconds = 1000/speed
  minutes = int(seconds // 60)
  sec = int(round(seconds % 60))

  if sec == 60:
    minutes += 1
    sec = 0

  return f"{minutes}:{sec:02d}"

def speed_to_pace_float(speed: float) -> float | None:

  """
  Converts speed in meters per second to running pace in minutes per kilometer (float).

  Parameters
  ----------
  speed : float
      Speed value in meters per second. Must be greater than zero.

  Returns
  -------
  float or None
      Running pace in minutes per kilometer, represented as a float 
      (e.g., 5.53 means 5.53 minutes per km). 
      Returns None if the speed is less than or equal to zero.
  """

  if speed <= 0:
    return None
  
  seconds = 1000/speed

  return seconds / 60

In [13]:
activities_df

Unnamed: 0,id,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,start_date,start_date_local,timezone,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,visibility,start_latlng,end_latlng,average_speed,max_speed,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,elev_high,elev_low,pr_count,total_photo_count,suffer_score,description,calories,device_name,map_id,gear_id
0,15729456618,Lunch Ride,79588.5,11082,14430,202.0,Ride,Ride,,2025-09-07T09:45:26Z,2025-09-07T11:45:26Z,(GMT+01:00) Europe/Warsaw,44,13,0,1,0,False,False,False,followers_only,"[51.108316, 17.123345]","[51.107901, 17.123794]",7.182,11.18,,183.2,,,True,129.0,148.0,158.4,115.4,20,0,53.0,Nogi nie wsp√≥≈Çpracowa≈Çy po wczorajszym longuü™¶,1388.0,Garmin Edge 840,a15729456618,b12572672
1,15716821076,24km Race Practice Long Runüî©,24120.3,8004,8085,56.0,Run,Run,2.0,2025-09-06T10:41:12Z,2025-09-06T12:41:12Z,(GMT+01:00) Europe/Warsaw,17,13,2,2,0,False,False,False,everyone,"[51.107164, 17.123723]","[51.106689, 17.123415]",3.014,4.28,84.8,369.3,581.0,375.0,True,154.9,173.0,123.2,111.4,10,0,165.0,24km Race Practice Long Run with Runna ‚úÖ\n\nDo...,1857.0,Garmin Forerunner 970,a15716821076,g23642256
2,15708639235,Evening Ride,16823.7,3683,6122,47.0,Ride,Ride,,2025-09-05T16:31:17Z,2025-09-05T18:31:17Z,(GMT+01:00) Europe/Warsaw,5,10,0,4,0,False,False,False,followers_only,"[51.107755, 17.123295]","[51.107903, 17.12546]",4.568,12.62,,95.9,,,True,101.0,145.0,126.2,116.2,0,1,9.0,Coffee ride bez kawyüóø,320.0,Garmin Edge 840,a15708639235,b12572672
3,15705468575,Afternoon Weight Training,0.0,3713,3713,0.0,Workout,WeightTraining,,2025-09-05T12:00:34Z,2025-09-05T14:00:34Z,(GMT+02:00) Africa/Blantyre,0,8,0,1,0,True,False,False,followers_only,[],[],0.000,0.00,,,,,True,99.6,142.0,0.0,0.0,0,0,8.0,Reska8Ô∏è‚É£5Ô∏è‚É£,306.0,Garmin Forerunner 970,a15705468575,
4,15705659558,Afternoon Ride,13045.3,1871,6563,44.0,Ride,Ride,10.0,2025-09-05T11:32:00Z,2025-09-05T13:32:00Z,(GMT+01:00) Europe/Warsaw,9,12,0,1,0,False,True,False,followers_only,"[51.1085, 17.123504]","[51.107656, 17.125015]",6.972,10.50,,181.2,,,True,134.1,152.0,129.0,115.4,4,0,13.0,Reska dojazdü´°,318.0,Garmin Edge 840,a15705659558,b12572672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,14731708283,‚ÄûLong‚Äù RunüôÇ‚Äç‚ÜïÔ∏è,11021.1,4111,4367,71.0,Run,Run,2.0,2025-06-08T08:31:47Z,2025-06-08T10:31:47Z,(GMT+01:00) Europe/Warsaw,0,14,0,1,0,False,False,False,everyone,"[52.757555, 15.249096]","[52.730161, 15.241658]",2.681,3.58,83.5,343.8,463.0,335.0,True,145.8,158.0,77.8,23.8,0,0,54.0,11km Long Run with Runna ‚úÖ\n\nLu≈∫no po dzielni...,857.0,Garmin Forerunner 970,a14731708283,g20426652
96,14722686686,XXXI Bieg ≈ªakowskiüî•,5047.6,2057,2062,41.0,Run,Run,1.0,2025-06-07T11:01:15Z,2025-06-07T13:01:15Z,(GMT+01:00) Europe/Warsaw,0,19,0,4,0,False,False,False,everyone,"[52.750739, 15.233868]","[52.751896, 15.235475]",2.448,3.48,78.8,309.2,486.0,309.0,True,146.9,164.0,69.8,32.6,0,1,29.0,Karo poprowadzona na nowy PRüèÜ,416.0,Garmin Forerunner 970,a14722686686,g23642256
97,14707040076,Fast 8-4-2süöÄ,8732.2,2892,2892,3.0,Run,Run,3.0,2025-06-05T17:15:13Z,2025-06-05T19:15:13Z,(GMT+01:00) Europe/Warsaw,7,13,2,3,0,False,False,False,everyone,"[51.110665, 17.076283]","[51.110786, 17.07661]",3.019,4.98,78.0,348.0,583.0,381.0,True,157.8,179.0,119.2,115.0,3,0,73.0,Fast 8-4-2s with Runna ‚úÖ\n\nNogi w ko≈Ñcu dobrz...,666.0,Garmin Forerunner 970,a14707040076,g23642256
98,14694691688,Afternoon Weight Training,0.0,3763,3763,0.0,WeightTraining,WeightTraining,,2025-06-04T13:58:52Z,2025-06-04T15:58:52Z,(GMT+02:00) Africa/Blantyre,0,10,0,1,0,True,False,False,followers_only,[],[],0.000,0.00,,,,,True,90.8,129.0,0.0,0.0,0,0,7.0,"Reska6Ô∏è‚É£1Ô∏è‚É£\nBench press: 82,5kgüèÜ",237.0,Garmin Forerunner 970,a14694691688,


In [14]:
mov_vals = [(timedelta(seconds=int(s)) if pd.notnull(s) else None) for s in activities_df["moving_time"]]
ela_vals = [(timedelta(seconds=int(s)) if pd.notnull(s) else None) for s in activities_df["elapsed_time"]]

activities_df.loc[:, "moving_time_td"]  = pd.Series(mov_vals, dtype="object")
activities_df.loc[:, "elapsed_time_td"] = pd.Series(ela_vals, dtype="object")

activities_df["start_date_dt"] = pd.to_datetime(activities_df["start_date"], utc=True)
activities_df["timezone_name"] = activities_df["timezone"].str.extract(r'\)\s*(.*)')
activities_df["start_date_local_dt"] = activities_df.apply(
    lambda row: row["start_date_dt"].tz_convert(row["timezone_name"]),
    axis=1
)

activities_df[["start_lat", "start_lng"]]  = pd.DataFrame(activities_df["start_latlng"].tolist(), index=activities_df.index)
activities_df[["end_lat", "end_lng"]] = pd.DataFrame(activities_df["end_latlng"].tolist(), index=activities_df.index)

activities_df['average_cadence'] = activities_df.apply(
  lambda row: row['average_cadence'] * 2 if row['type'] == 'Run' else row['average_cadence'], axis=1
)

activities_df['avg_pace_str'] = activities_df.apply(
  lambda row: speed_to_pace_str(row['average_speed']) if row['type'] == 'Run' else np.nan, axis=1
)

activities_df['avg_pace_float'] = activities_df.apply(
  lambda row: speed_to_pace_float(row['average_speed']) if row['type'] == 'Run' else np.nan, axis=1
)

activities_df['max_pace_str'] = activities_df.apply(
  lambda row: speed_to_pace_str(row['max_speed']) if row['type'] == 'Run' else np.nan, axis=1
)

activities_df['max_pace_float'] = activities_df.apply(
  lambda row: speed_to_pace_float(row['max_speed']) if row['type'] == 'Run' else np.nan, axis=1
)

In [15]:
activities_cols_clean = [
    'id',
    'name',
    'start_date_dt',
    'start_date_local_dt',
    'distance',
    'moving_time',
    'moving_time_td',
    'elapsed_time',
    'elapsed_time_td',
    'total_elevation_gain',
    'elev_low',
    'elev_high',
    'type',
    'sport_type',
    'workout_type',
    'achievement_count',
    'kudos_count',
    'comment_count',
    'athlete_count',
    'photo_count',
    'trainer',
    'commute',
    'manual',
    'visibility',
    'average_speed',
    'avg_pace_str',
    'avg_pace_float',
    'max_speed',
    'max_pace_str',
    'max_pace_float',
    'average_cadence',
    'average_watts',
    'max_watts',
    'weighted_average_watts',
    'has_heartrate',
    'average_heartrate',
    'max_heartrate',
    'pr_count',
    'total_photo_count',
    'suffer_score',
    'description',
    'calories',
    'device_name',
    'map_id',
    'gear_id'
]
activities_df = activities_df[activities_cols_clean]
activities_df

Unnamed: 0,id,name,start_date_dt,start_date_local_dt,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,elev_low,elev_high,type,sport_type,workout_type,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,total_photo_count,suffer_score,description,calories,device_name,map_id,gear_id
0,15729456618,Lunch Ride,2025-09-07 09:45:26+00:00,2025-09-07 11:45:26+02:00,79588.5,11082,3:04:42,14430,4:00:30,202.0,115.4,158.4,Ride,Ride,,44,13,0,1,0,False,False,False,followers_only,7.182,,,11.18,,,,183.2,,,True,129.0,148.0,20,0,53.0,Nogi nie wsp√≥≈Çpracowa≈Çy po wczorajszym longuü™¶,1388.0,Garmin Edge 840,a15729456618,b12572672
1,15716821076,24km Race Practice Long Runüî©,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,24120.3,8004,2:13:24,8085,2:14:45,56.0,111.4,123.2,Run,Run,2.0,17,13,2,2,0,False,False,False,everyone,3.014,5:32,5.529750,4.28,3:54,3.894081,169.6,369.3,581.0,375.0,True,154.9,173.0,10,0,165.0,24km Race Practice Long Run with Runna ‚úÖ\n\nDo...,1857.0,Garmin Forerunner 970,a15716821076,g23642256
2,15708639235,Evening Ride,2025-09-05 16:31:17+00:00,2025-09-05 18:31:17+02:00,16823.7,3683,1:01:23,6122,1:42:02,47.0,116.2,126.2,Ride,Ride,,5,10,0,4,0,False,False,False,followers_only,4.568,,,12.62,,,,95.9,,,True,101.0,145.0,0,1,9.0,Coffee ride bez kawyüóø,320.0,Garmin Edge 840,a15708639235,b12572672
3,15705468575,Afternoon Weight Training,2025-09-05 12:00:34+00:00,2025-09-05 14:00:34+02:00,0.0,3713,1:01:53,3713,1:01:53,0.0,0.0,0.0,Workout,WeightTraining,,0,8,0,1,0,True,False,False,followers_only,0.000,,,0.00,,,,,,,True,99.6,142.0,0,0,8.0,Reska8Ô∏è‚É£5Ô∏è‚É£,306.0,Garmin Forerunner 970,a15705468575,
4,15705659558,Afternoon Ride,2025-09-05 11:32:00+00:00,2025-09-05 13:32:00+02:00,13045.3,1871,0:31:11,6563,1:49:23,44.0,115.4,129.0,Ride,Ride,10.0,9,12,0,1,0,False,True,False,followers_only,6.972,,,10.50,,,,181.2,,,True,134.1,152.0,4,0,13.0,Reska dojazdü´°,318.0,Garmin Edge 840,a15705659558,b12572672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,14731708283,‚ÄûLong‚Äù RunüôÇ‚Äç‚ÜïÔ∏è,2025-06-08 08:31:47+00:00,2025-06-08 10:31:47+02:00,11021.1,4111,1:08:31,4367,1:12:47,71.0,23.8,77.8,Run,Run,2.0,0,14,0,1,0,False,False,False,everyone,2.681,6:13,6.216586,3.58,4:39,4.655493,167.0,343.8,463.0,335.0,True,145.8,158.0,0,0,54.0,11km Long Run with Runna ‚úÖ\n\nLu≈∫no po dzielni...,857.0,Garmin Forerunner 970,a14731708283,g20426652
96,14722686686,XXXI Bieg ≈ªakowskiüî•,2025-06-07 11:01:15+00:00,2025-06-07 13:01:15+02:00,5047.6,2057,0:34:17,2062,0:34:22,41.0,32.6,69.8,Run,Run,1.0,0,19,0,4,0,False,False,False,everyone,2.448,6:48,6.808279,3.48,4:47,4.789272,157.6,309.2,486.0,309.0,True,146.9,164.0,0,1,29.0,Karo poprowadzona na nowy PRüèÜ,416.0,Garmin Forerunner 970,a14722686686,g23642256
97,14707040076,Fast 8-4-2süöÄ,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,8732.2,2892,0:48:12,2892,0:48:12,3.0,115.0,119.2,Run,Run,3.0,7,13,2,3,0,False,False,False,everyone,3.019,5:31,5.520592,4.98,3:21,3.346720,156.0,348.0,583.0,381.0,True,157.8,179.0,3,0,73.0,Fast 8-4-2s with Runna ‚úÖ\n\nNogi w ko≈Ñcu dobrz...,666.0,Garmin Forerunner 970,a14707040076,g23642256
98,14694691688,Afternoon Weight Training,2025-06-04 13:58:52+00:00,2025-06-04 15:58:52+02:00,0.0,3763,1:02:43,3763,1:02:43,0.0,0.0,0.0,WeightTraining,WeightTraining,,0,10,0,1,0,True,False,False,followers_only,0.000,,,0.00,,,,,,,True,90.8,129.0,0,0,7.0,"Reska6Ô∏è‚É£1Ô∏è‚É£\nBench press: 82,5kgüèÜ",237.0,Garmin Forerunner 970,a14694691688,


In [16]:
activities_df.dtypes

id                                      int64
name                                   object
start_date_dt             datetime64[ns, UTC]
start_date_local_dt                    object
distance                              float64
moving_time                             int64
moving_time_td                         object
elapsed_time                            int64
elapsed_time_td                        object
total_elevation_gain                  float64
elev_low                              float64
elev_high                             float64
type                                   object
sport_type                             object
workout_type                          float64
achievement_count                       int64
kudos_count                             int64
comment_count                           int64
athlete_count                           int64
photo_count                             int64
trainer                                  bool
commute                           

In [17]:
activities_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "start_date_dt": DateTime(timezone=True),
    "start_date_local_dt":  DateTime(timezone=True),
    "distance": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "total_elevation_gain": Float,
    "elev_low": Float,
    "elev_high": Float,
    "type": String,
    "sport_type": String,
    "workout_type": Float,
    "achievement_count": Integer,
    "kudos_count": Integer,
    "comment_count": Integer,
    "athlete_count": Integer,
    "photo_count": Integer,
    "trainer": Boolean,
    "commute": Boolean,
    "manual": Boolean,
    "visibility": String,
    "average_speed": Float,
    "avg_pace_str": String,
    "avg_pace_float": Float,
    "max_speed": Float,
    "max_pace_str": String,
    "max_pace_float": Float,
    "average_cadence": Float,
    "average_watts": Float,
    "max_watts": Float,
    "weighted_average_watts": Float,
    "has_heartrate": Boolean,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_count": Integer,
    "total_photo_count": Integer,
    "suffer_score": Float,
    "description": Text,
    "calories": Float,
    "device_name": String,
    "map_id": String,
    "gear_id": String,
}

with engine.begin() as conn:
    conn.exec_driver_sql("CREATE SCHEMA IF NOT EXISTS silver;")

activities_df.to_sql(
    name="activities",
    schema="silver",
    con=engine,
    if_exists="replace",
    index=False,
    dtype=activities_df_dtype_map,
    method="multi",
    chunksize=1000
)

-1

In [18]:
activities_df

Unnamed: 0,id,name,start_date_dt,start_date_local_dt,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,elev_low,elev_high,type,sport_type,workout_type,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,total_photo_count,suffer_score,description,calories,device_name,map_id,gear_id
0,15729456618,Lunch Ride,2025-09-07 09:45:26+00:00,2025-09-07 11:45:26+02:00,79588.5,11082,3:04:42,14430,4:00:30,202.0,115.4,158.4,Ride,Ride,,44,13,0,1,0,False,False,False,followers_only,7.182,,,11.18,,,,183.2,,,True,129.0,148.0,20,0,53.0,Nogi nie wsp√≥≈Çpracowa≈Çy po wczorajszym longuü™¶,1388.0,Garmin Edge 840,a15729456618,b12572672
1,15716821076,24km Race Practice Long Runüî©,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,24120.3,8004,2:13:24,8085,2:14:45,56.0,111.4,123.2,Run,Run,2.0,17,13,2,2,0,False,False,False,everyone,3.014,5:32,5.529750,4.28,3:54,3.894081,169.6,369.3,581.0,375.0,True,154.9,173.0,10,0,165.0,24km Race Practice Long Run with Runna ‚úÖ\n\nDo...,1857.0,Garmin Forerunner 970,a15716821076,g23642256
2,15708639235,Evening Ride,2025-09-05 16:31:17+00:00,2025-09-05 18:31:17+02:00,16823.7,3683,1:01:23,6122,1:42:02,47.0,116.2,126.2,Ride,Ride,,5,10,0,4,0,False,False,False,followers_only,4.568,,,12.62,,,,95.9,,,True,101.0,145.0,0,1,9.0,Coffee ride bez kawyüóø,320.0,Garmin Edge 840,a15708639235,b12572672
3,15705468575,Afternoon Weight Training,2025-09-05 12:00:34+00:00,2025-09-05 14:00:34+02:00,0.0,3713,1:01:53,3713,1:01:53,0.0,0.0,0.0,Workout,WeightTraining,,0,8,0,1,0,True,False,False,followers_only,0.000,,,0.00,,,,,,,True,99.6,142.0,0,0,8.0,Reska8Ô∏è‚É£5Ô∏è‚É£,306.0,Garmin Forerunner 970,a15705468575,
4,15705659558,Afternoon Ride,2025-09-05 11:32:00+00:00,2025-09-05 13:32:00+02:00,13045.3,1871,0:31:11,6563,1:49:23,44.0,115.4,129.0,Ride,Ride,10.0,9,12,0,1,0,False,True,False,followers_only,6.972,,,10.50,,,,181.2,,,True,134.1,152.0,4,0,13.0,Reska dojazdü´°,318.0,Garmin Edge 840,a15705659558,b12572672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,14731708283,‚ÄûLong‚Äù RunüôÇ‚Äç‚ÜïÔ∏è,2025-06-08 08:31:47+00:00,2025-06-08 10:31:47+02:00,11021.1,4111,1:08:31,4367,1:12:47,71.0,23.8,77.8,Run,Run,2.0,0,14,0,1,0,False,False,False,everyone,2.681,6:13,6.216586,3.58,4:39,4.655493,167.0,343.8,463.0,335.0,True,145.8,158.0,0,0,54.0,11km Long Run with Runna ‚úÖ\n\nLu≈∫no po dzielni...,857.0,Garmin Forerunner 970,a14731708283,g20426652
96,14722686686,XXXI Bieg ≈ªakowskiüî•,2025-06-07 11:01:15+00:00,2025-06-07 13:01:15+02:00,5047.6,2057,0:34:17,2062,0:34:22,41.0,32.6,69.8,Run,Run,1.0,0,19,0,4,0,False,False,False,everyone,2.448,6:48,6.808279,3.48,4:47,4.789272,157.6,309.2,486.0,309.0,True,146.9,164.0,0,1,29.0,Karo poprowadzona na nowy PRüèÜ,416.0,Garmin Forerunner 970,a14722686686,g23642256
97,14707040076,Fast 8-4-2süöÄ,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,8732.2,2892,0:48:12,2892,0:48:12,3.0,115.0,119.2,Run,Run,3.0,7,13,2,3,0,False,False,False,everyone,3.019,5:31,5.520592,4.98,3:21,3.346720,156.0,348.0,583.0,381.0,True,157.8,179.0,3,0,73.0,Fast 8-4-2s with Runna ‚úÖ\n\nNogi w ko≈Ñcu dobrz...,666.0,Garmin Forerunner 970,a14707040076,g23642256
98,14694691688,Afternoon Weight Training,2025-06-04 13:58:52+00:00,2025-06-04 15:58:52+02:00,0.0,3763,1:02:43,3763,1:02:43,0.0,0.0,0.0,WeightTraining,WeightTraining,,0,10,0,1,0,True,False,False,followers_only,0.000,,,0.00,,,,,,,True,90.8,129.0,0,0,7.0,"Reska6Ô∏è‚É£1Ô∏è‚É£\nBench press: 82,5kgüèÜ",237.0,Garmin Forerunner 970,a14694691688,


## Maps Dataframe

In [19]:
maps_df.columns = maps_df.columns.str.replace("^map_", "", regex=True)


In [20]:
maps_cols_clean = [
  'id',
  'polyline',
  'summary_polyline'
]
maps_df = maps_df[maps_cols_clean]
maps_df

Unnamed: 0,id,polyline,summary_polyline
0,a15729456618,}a}vH{kogBILs@x@WP_@PGJKp@O|BWxAM`@c@|@o@~@KZ?...,kv}vHw~mgBqMlTwFbHa@pA`LzVaAzNy@`Pf@bZFtYrAfPC...
1,a15716821076,wz|vHgnogBPx@Rd@JFHC^c@DCTLV\DBNCr@u@NS`AcAz@k...,{`|vHsrogBz@fBdAbCDX?t@Ob@aBbCmCfD{JnKuChDuBxC...
2,a15708639235,m~|vHqkogBBJ?HIb@@Rb@`B^x@BXCRaBzBWVy@lAcCvESR...,kt}vHkangBkAhBKh@_@x@_CnDa@v@}@|AiA`B}@|A{B~Cw...
3,a15705468575,,
4,a15705659558,cc}vH{logBMP?Lw@bAERGLSPQHEJHlBCr@UrAM^w@`Ba@l...,yv}vH_~mgBmFfJeAvAm@fAgC~DqAxAcAtAu@z@g@v@ELDX...
...,...,...,...
95,a14731708283,ue_aIyia|AG{@g@wD@QJQEs@@e@Cc@Y}AMmAIQYWGSUqE@...,ue_aIyia|AG{@g@wD@QJQEs@@e@Cc@Y}AMmAIQYWGSUqE@...
96,a14722686686,a{}`Isj~{As@M[?YGI?_@I}DSeB[qBw@QAU@YCI@sAMKEi...,a{}`Isj~{As@M[?YGI?_@I}DSeB[qBw@QAU@YCI@sAMKEi...
97,a14707040076,sp}vHwefgBNNLHN@TEPIPUPi@Bc@@a@a@{CQe@SSYKU?SH...,sp}vHwefgBl@Zr@[X}@@kA_@kCe@y@}@Em@z@ElA\xCb@|...
98,a14694691688,,


In [21]:
maps_df_dtype_map = {
    "id": String,
    "polyline": Text,
    "summary_polyline": Text
}

with engine.begin() as conn:
    conn.exec_driver_sql("CREATE SCHEMA IF NOT EXISTS silver;")

maps_df.to_sql(
    name="maps",
    schema="silver",
    con=engine,
    if_exists="replace",
    index=False,
    dtype=maps_df_dtype_map,
    method="multi",
    chunksize=1000
)

-1

## Gear Dataframe

In [22]:
gear_df.columns = gear_df.columns.str.replace("^gear_", "", regex=True)
gear_df = gear_df.rename(columns={'distance' : 'distance_m', 'converted_distance' : 'distance_km'})
gear_df

Unnamed: 0,id,primary,name,nickname,retired,distance_m,distance_km
0,b12572672,False,Cube Nuroad Pro,Cube Nuroad Pro,False,3734608.0,3734.6
1,g23642256,False,Adidas EVO SL,,False,146967.0,147.0
3,,,,,,,
5,g24134620,False,ASICS Novablast 5,,False,268313.0,268.3
39,g19800575,False,Nike Invincible Run 3 White,White,True,430102.0,430.1
89,g20426652,False,Nike Invincible Run 3 Blueprint,Blueprint,True,565167.0,565.2


In [23]:
gear_df_dtype_map = {
    "id": String,
    "primary": Boolean,
    "name": String,
    "nickname": String,
    "retired": Boolean,
    "distance_m": Float,
    "distance_km": Float,
}

with engine.begin() as conn:
    conn.exec_driver_sql("CREATE SCHEMA IF NOT EXISTS silver;")

gear_df.to_sql(
    name="gear",
    schema="silver",
    con=engine,
    if_exists="replace",
    index=False,
    dtype=gear_df_dtype_map,
    method="multi",
    chunksize=1000
)

-1

## Segments efforts Dataframe

In [24]:
segments_types_df = segments_df[['segment_id', 'segment_activity_type']].copy()
segments_types_df.drop_duplicates(inplace=True)
segments_eff_df = pd.merge(segments_eff_df, segments_types_df, on='segment_id', how='left')

In [25]:
segments_eff_df_exploded = segments_eff_df.explode("achievements", ignore_index=True)
segments_eff_df_norm = pd.json_normalize(segments_eff_df_exploded["achievements"])
segments_eff_df = pd.concat([segments_eff_df_exploded.drop(columns="achievements"), segments_eff_df_norm], axis=1)


In [26]:
mov_vals = [(timedelta(seconds=int(s)) if pd.notnull(s) else None) for s in segments_eff_df["moving_time"]]
ela_vals = [(timedelta(seconds=int(s)) if pd.notnull(s) else None) for s in segments_eff_df["elapsed_time"]]

segments_eff_df.loc[:, "moving_time_td"]  = pd.Series(mov_vals, dtype="object")
segments_eff_df.loc[:, "elapsed_time_td"] = pd.Series(ela_vals, dtype="object")

segments_eff_df["start_date_dt"] = pd.to_datetime(segments_eff_df["start_date"], utc=True)
segments_eff_df["start_date_local_dt"] = pd.to_datetime(segments_eff_df["start_date_local"], utc=False)
segments_eff_df['utc_offset'] = (segments_eff_df["start_date_local_dt"] - segments_eff_df["start_date_dt"]).dt.total_seconds() / 60
segments_eff_df["tz"] = segments_eff_df["utc_offset"].apply(
    lambda m: timezone(timedelta(minutes=int(m))) if pd.notna(m) else None
)
segments_eff_df["start_date_local_dt"] = segments_eff_df.apply(
    lambda row: row["start_date_dt"].tz_convert(row["tz"]),
    axis=1
)

segments_eff_df['average_cadence'] = segments_eff_df.apply(
  lambda row: row['average_cadence'] * 2 if row['segment_activity_type'] == 'Run' else row['average_cadence'], axis=1
)


In [27]:
segments_eff_df

Unnamed: 0,id,name,elapsed_time,moving_time,start_date,start_date_local,distance,start_index,end_index,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,pr_rank,visibility,kom_rank,hidden,activity_id,segment_id,segment_activity_type,rank,type,type_id,effort_count,moving_time_td,elapsed_time_td,start_date_dt,start_date_local_dt,utc_offset,tz
0,3.399769e+18,Marco Polo pod g√≥re,126.0,126.0,2025-09-07T09:46:21Z,2025-09-07T11:46:21Z,871.9,55.0,181.0,,False,,131.8,148.0,3.0,only_me,,False,1.572946e+10,17440005.0,Ride,3.0,pr,3.0,,0:02:06,0:02:06,2025-09-07 09:46:21+00:00,2025-09-07 11:46:21+02:00,120.0,UTC+02:00
1,3.399769e+18,Mickiewicza (most-Kopernika),310.0,257.0,2025-09-07T09:48:56Z,2025-09-07T11:48:56Z,1744.0,210.0,470.0,,False,,126.5,142.0,1.0,followers_only,,False,1.572946e+10,8184058.0,Ride,1.0,pr,3.0,,0:04:17,0:05:10,2025-09-07 09:48:56+00:00,2025-09-07 11:48:56+02:00,120.0,UTC+02:00
2,3.399769e+18,Kostka Mickiewicza do centrum,41.0,41.0,2025-09-07T09:49:30Z,2025-09-07T11:49:30Z,301.2,244.0,285.0,,False,,125.9,130.0,2.0,followers_only,,False,1.572946e+10,15029115.0,Ride,2.0,pr,3.0,,0:00:41,0:00:41,2025-09-07 09:49:30+00:00,2025-09-07 11:49:30+02:00,120.0,UTC+02:00
3,3.399769e+18,Sienkiewicza sprint,349.0,296.0,2025-09-07T09:50:17Z,2025-09-07T11:50:17Z,2088.4,291.0,590.0,,False,,128.9,142.0,2.0,followers_only,,False,1.572946e+10,9329284.0,Ride,2.0,pr,3.0,,0:04:56,0:05:49,2025-09-07 09:50:17+00:00,2025-09-07 11:50:17+02:00,120.0,UTC+02:00
4,3.399769e+18,Park Swojczycki - 8 maja,97.0,97.0,2025-09-07T09:50:17Z,2025-09-07T11:50:17Z,671.0,291.0,388.0,,False,,132.6,137.0,3.0,followers_only,,False,1.572946e+10,36975541.0,Ride,3.0,pr,3.0,,0:01:37,0:01:37,2025-09-07 09:50:17+00:00,2025-09-07 11:50:17+02:00,120.0,UTC+02:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852,3.365772e+18,300 m AWF Stadion Witelona,120.0,120.0,2025-06-05T17:53:52Z,2025-06-05T19:53:52Z,321.2,2319.0,2439.0,146.6,True,328.5,170.9,178.0,,everyone,,False,1.470704e+10,31340120.0,Run,,,,,0:02:00,0:02:00,2025-06-05 17:53:52+00:00,2025-06-05 19:53:52+02:00,120.0,UTC+02:00
853,3.365772e+18,1000 m AWF Stadion Witelona,444.0,443.0,2025-06-05T17:56:01Z,2025-06-05T19:56:01Z,972.2,2448.0,2892.0,158.8,True,284.5,148.5,158.0,,everyone,,False,1.470704e+10,31340176.0,Run,,,,,0:07:23,0:07:24,2025-06-05 17:56:01+00:00,2025-06-05 19:56:01+02:00,120.0,UTC+02:00
854,3.365772e+18,300 m AWF Stadion Witelona,149.0,148.0,2025-06-05T17:56:08Z,2025-06-05T19:56:08Z,321.2,2455.0,2604.0,152.2,True,254.3,145.6,154.0,,everyone,,False,1.470704e+10,31340120.0,Run,,,,,0:02:28,0:02:29,2025-06-05 17:56:08+00:00,2025-06-05 19:56:08+02:00,120.0,UTC+02:00
855,3.365772e+18,300 m AWF Stadion Witelona,134.0,134.0,2025-06-05T17:58:46Z,2025-06-05T19:58:46Z,321.2,2613.0,2747.0,163.8,True,305.8,150.5,153.0,,everyone,,False,1.470704e+10,31340120.0,Run,,,,,0:02:14,0:02:14,2025-06-05 17:58:46+00:00,2025-06-05 19:58:46+02:00,120.0,UTC+02:00


In [28]:
segments_eff_cols_clean = [
  'id',
  'name',
  'start_date_dt',
  'start_date_local_dt',
  'distance',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'average_cadence',
  'device_watts',
  'average_watts',
  'average_heartrate',
  'max_heartrate',
  'pr_rank',
  'visibility',
  'kom_rank',
  'hidden',
  'rank',
  'type',
  'activity_id',
  'segment_id'
]
segments_eff_df = segments_eff_df[segments_eff_cols_clean]
segments_eff_df

Unnamed: 0,id,name,start_date_dt,start_date_local_dt,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,pr_rank,visibility,kom_rank,hidden,rank,type,activity_id,segment_id
0,3.399769e+18,Marco Polo pod g√≥re,2025-09-07 09:46:21+00:00,2025-09-07 11:46:21+02:00,871.9,126.0,0:02:06,126.0,0:02:06,,False,,131.8,148.0,3.0,only_me,,False,3.0,pr,1.572946e+10,17440005.0
1,3.399769e+18,Mickiewicza (most-Kopernika),2025-09-07 09:48:56+00:00,2025-09-07 11:48:56+02:00,1744.0,257.0,0:04:17,310.0,0:05:10,,False,,126.5,142.0,1.0,followers_only,,False,1.0,pr,1.572946e+10,8184058.0
2,3.399769e+18,Kostka Mickiewicza do centrum,2025-09-07 09:49:30+00:00,2025-09-07 11:49:30+02:00,301.2,41.0,0:00:41,41.0,0:00:41,,False,,125.9,130.0,2.0,followers_only,,False,2.0,pr,1.572946e+10,15029115.0
3,3.399769e+18,Sienkiewicza sprint,2025-09-07 09:50:17+00:00,2025-09-07 11:50:17+02:00,2088.4,296.0,0:04:56,349.0,0:05:49,,False,,128.9,142.0,2.0,followers_only,,False,2.0,pr,1.572946e+10,9329284.0
4,3.399769e+18,Park Swojczycki - 8 maja,2025-09-07 09:50:17+00:00,2025-09-07 11:50:17+02:00,671.0,97.0,0:01:37,97.0,0:01:37,,False,,132.6,137.0,3.0,followers_only,,False,3.0,pr,1.572946e+10,36975541.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852,3.365772e+18,300 m AWF Stadion Witelona,2025-06-05 17:53:52+00:00,2025-06-05 19:53:52+02:00,321.2,120.0,0:02:00,120.0,0:02:00,146.6,True,328.5,170.9,178.0,,everyone,,False,,,1.470704e+10,31340120.0
853,3.365772e+18,1000 m AWF Stadion Witelona,2025-06-05 17:56:01+00:00,2025-06-05 19:56:01+02:00,972.2,443.0,0:07:23,444.0,0:07:24,158.8,True,284.5,148.5,158.0,,everyone,,False,,,1.470704e+10,31340176.0
854,3.365772e+18,300 m AWF Stadion Witelona,2025-06-05 17:56:08+00:00,2025-06-05 19:56:08+02:00,321.2,148.0,0:02:28,149.0,0:02:29,152.2,True,254.3,145.6,154.0,,everyone,,False,,,1.470704e+10,31340120.0
855,3.365772e+18,300 m AWF Stadion Witelona,2025-06-05 17:58:46+00:00,2025-06-05 19:58:46+02:00,321.2,134.0,0:02:14,134.0,0:02:14,163.8,True,305.8,150.5,153.0,,everyone,,False,,,1.470704e+10,31340120.0


In [None]:
segments_eff_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "start_date_dt": DateTime(timezone=True),
    "start_date_local_dt": DateTime(timezone=True),
    "distance": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,_
    "elapsed_time_td": Interval,
    "average_cadence": Float,
    "device_watts": Boolean,
    "average_watts": Float,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_rank": Integer,
    "visibility": String,
    "kom_rank": Integer,
    "hidden": Boolean,
    "rank": Integer,
    "type": String,
    "activity_id": BigInteger,
    "segment_id": BigInteger,
}

with engine.begin() as conn:
    conn.exec_driver_sql("CREATE SCHEMA IF NOT EXISTS silver;")

segments_eff_df.to_sql(
    name="segments_efforts",
    schema="silver",
    con=engine,
    if_exists="replace",
    index=False,
    dtype=segments_eff_df_dtype_map,
    method="multi",
    chunksize=1000
)

-1

## Segments Dataframe

In [30]:
segments_df.columns = segments_df.columns.str.replace("^segment_", "", regex=True)

segments_df[["start_lat", "start_lng"]]  = pd.DataFrame(segments_df["start_latlng"].tolist(), index=segments_df.index)
segments_df[["end_lat", "end_lng"]] = pd.DataFrame(segments_df["end_latlng"].tolist(), index=segments_df.index)
segments_df = segments_df.drop(columns=["start_latlng", "end_latlng"])
segments_df = segments_df.drop_duplicates()

In [31]:
# segments_df['city_geo'] = segments_df['geolocation'].apply(lambda loc: loc.raw['address']['city'] if loc else None)
# segments_df['state_geo'] = segments_df['geolocation'].apply(lambda loc: loc.raw['address']['state'] if loc else None)
# segments_df['country_geo'] = segments_df['geolocation'].apply(lambda loc: loc.raw['address']['country'] if loc else None)

In [32]:
segments_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "activity_type": String,
    "distance": Float,
    "average_grade": Float,
    "maximum_grade": Float,
    "elevation_high": Float,
    "elevation_low": Float,
    "elevation_profile": Float,
    "elevation_profiles": Float,
    "climb_category": Float,
    "city": String,
    "state": String,
    "country": String,
    "private": Boolean,
    "hazardous": Boolean,
    "starred": Boolean,
    "start_lat": Float,
    "start_lng": Float,
    "end_lat": Float,
    "end_lng": Float,
}

with engine.begin() as conn:
    conn.exec_driver_sql("CREATE SCHEMA IF NOT EXISTS silver;")

segments_df.to_sql(
    name="segments",
    schema="silver",
    con=engine,
    if_exists="replace",
    index=False,
    dtype=segments_df_dtype_map,
    method="multi",
    chunksize=1000
)

-1

## Laps Dataframe

In [33]:
laps_types_df = activities_df[['id', 'type']].copy()
laps_types_df.drop_duplicates(inplace=True)
laps_df = pd.merge(laps_df, laps_types_df, left_on='activity_id', right_on='id', how='left')

In [34]:
mov_vals = [(timedelta(seconds=int(s)) if pd.notnull(s) else None) for s in laps_df["moving_time"]]
ela_vals = [(timedelta(seconds=int(s)) if pd.notnull(s) else None) for s in laps_df["elapsed_time"]]

laps_df.loc[:, "moving_time_td"]  = pd.Series(mov_vals, dtype="object")
laps_df.loc[:, "elapsed_time_td"] = pd.Series(ela_vals, dtype="object")

laps_df["start_date_dt"] = pd.to_datetime(laps_df["start_date"], utc=True)
laps_df["start_date_local_dt"] = pd.to_datetime(laps_df["start_date_local"], utc=False)
laps_df['utc_offset'] = (laps_df["start_date_local_dt"] - laps_df["start_date_dt"]).dt.total_seconds() / 60
laps_df["tz"] = laps_df["utc_offset"].apply(
    lambda m: timezone(timedelta(minutes=int(m))) if pd.notna(m) else None
)
laps_df["start_date_local_dt"] = laps_df.apply(
    lambda row: row["start_date_dt"].tz_convert(row["tz"]),
    axis=1
)

laps_df['average_cadence'] = laps_df.apply(
  lambda row: row['average_cadence'] * 2 if row['type'] == 'Run' else row['average_cadence'], axis=1
)

laps_df['avg_pace_str'] = laps_df.apply(
  lambda row: speed_to_pace_str(row['average_speed']) if row['type'] == 'Run' else np.nan, axis=1
)

laps_df['avg_pace_float'] = laps_df.apply(
  lambda row: speed_to_pace_float(row['average_speed']) if row['type'] == 'Run' else np.nan, axis=1
)

laps_df['max_pace_str'] = laps_df.apply(
  lambda row: speed_to_pace_str(row['max_speed']) if row['type'] == 'Run' else np.nan, axis=1
)

laps_df['max_pace_float'] = laps_df.apply(
  lambda row: speed_to_pace_float(row['max_speed']) if row['type'] == 'Run' else np.nan, axis=1
)

In [35]:
laps_df

Unnamed: 0,id_x,name,elapsed_time,moving_time,start_date,start_date_local,distance,average_speed,max_speed,lap_index,split,start_index,end_index,total_elevation_gain,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,pace_zone,activity_id,id_y,type,moving_time_td,elapsed_time_td,start_date_dt,start_date_local_dt,utc_offset,tz,avg_pace_str,avg_pace_float,max_pace_str,max_pace_float
0,5.601627e+10,Lap 1,808.0,737.0,2025-09-07T09:45:26Z,2025-09-07T11:45:26Z,5000.00,6.78,9.28,1.0,1.0,0.0,639.0,11.8,,False,167.2,131.5,148.0,,1.572946e+10,1.572946e+10,Ride,0:12:17,0:13:28,2025-09-07 09:45:26+00:00,2025-09-07 11:45:26+02:00,120.0,UTC+02:00,,,,
1,5.601627e+10,Lap 2,1376.0,854.0,2025-09-07T09:58:56Z,2025-09-07T11:58:56Z,5000.00,5.85,8.58,2.0,2.0,640.0,1514.0,24.2,,False,160.0,126.1,143.0,,1.572946e+10,1.572946e+10,Ride,0:14:14,0:22:56,2025-09-07 09:58:56+00:00,2025-09-07 11:58:56+02:00,120.0,UTC+02:00,,,,
2,5.601627e+10,Lap 3,975.0,745.0,2025-09-07T10:21:53Z,2025-09-07T12:21:53Z,5000.00,6.71,9.62,3.0,3.0,1515.0,2266.0,6.0,,False,169.5,127.9,146.0,,1.572946e+10,1.572946e+10,Ride,0:12:25,0:16:15,2025-09-07 10:21:53+00:00,2025-09-07 12:21:53+02:00,120.0,UTC+02:00,,,,
3,5.601627e+10,Lap 4,695.0,646.0,2025-09-07T10:38:08Z,2025-09-07T12:38:08Z,5000.00,7.74,9.78,4.0,4.0,2267.0,2915.0,15.0,,False,208.6,130.2,145.0,,1.572946e+10,1.572946e+10,Ride,0:10:46,0:11:35,2025-09-07 10:38:08+00:00,2025-09-07 12:38:08+02:00,120.0,UTC+02:00,,,,
4,5.601627e+10,Lap 5,684.0,661.0,2025-09-07T10:49:43Z,2025-09-07T12:49:43Z,5000.00,7.56,9.60,5.0,5.0,2916.0,3579.0,25.4,,False,214.8,132.5,140.0,,1.572946e+10,1.572946e+10,Ride,0:11:01,0:11:24,2025-09-07 10:49:43+00:00,2025-09-07 12:49:43+02:00,120.0,UTC+02:00,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851,5.229341e+10,Lap 30,40.0,40.0,2025-06-05T17:55:56Z,2025-06-05T19:55:56Z,60.68,1.52,1.74,30.0,30.0,2443.0,2482.0,0.0,114.4,True,140.1,150.8,162.0,1.0,1.470704e+10,1.470704e+10,Run,0:00:40,0:00:40,2025-06-05 17:55:56+00:00,2025-06-05 19:55:56+02:00,120.0,UTC+02:00,10:58,10.964912,9:35,9.578544
852,5.229341e+10,Lap 31,401.0,401.0,2025-06-05T17:56:36Z,2025-06-05T19:56:36Z,1000.00,2.49,2.92,31.0,31.0,2483.0,2884.0,0.0,162.6,True,296.9,148.4,153.0,2.0,1.470704e+10,1.470704e+10,Run,0:06:41,0:06:41,2025-06-05 17:56:36+00:00,2025-06-05 19:56:36+02:00,120.0,UTC+02:00,6:42,6.693440,5:42,5.707763
853,5.229341e+10,Lap 32,8.0,8.0,2025-06-05T18:03:18Z,2025-06-05T20:03:18Z,20.16,2.52,2.52,32.0,32.0,2885.0,2892.0,0.0,161.4,True,297.7,146.9,147.0,2.0,1.470704e+10,1.470704e+10,Run,0:00:08,0:00:08,2025-06-05 18:03:18+00:00,2025-06-05 20:03:18+02:00,120.0,UTC+02:00,6:37,6.613757,6:37,6.613757
854,5.224926e+10,Lap 1,3763.0,3763.0,2025-06-04T13:58:52Z,2025-06-04T15:58:52Z,0.00,0.00,0.00,1.0,1.0,0.0,2502.0,0.0,,False,,90.8,129.0,,1.469469e+10,1.469469e+10,WeightTraining,1:02:43,1:02:43,2025-06-04 13:58:52+00:00,2025-06-04 15:58:52+02:00,120.0,UTC+02:00,,,,


In [36]:
laps_cols_df_clean = [
    'id_x',
    'name',
    'lap_index',
    'split',
    'start_date_dt',
    'start_date_local_dt',
    'distance',
    'moving_time',
    'moving_time_td',
    'elapsed_time',
    'elapsed_time_td',
    'total_elevation_gain',
    'type',
    'average_speed',
    'avg_pace_str',
    'avg_pace_float',
    'pace_zone',
    'max_speed',
    'max_pace_str',
    'max_pace_float',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'activity_id'
]

laps_df = laps_df[laps_cols_df_clean]
laps_df = laps_df.rename(columns={'id_x': 'id'})
laps_df


Unnamed: 0,id,name,lap_index,split,start_date_dt,start_date_local_dt,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,type,average_speed,avg_pace_str,avg_pace_float,pace_zone,max_speed,max_pace_str,max_pace_float,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,activity_id
0,5.601627e+10,Lap 1,1.0,1.0,2025-09-07 09:45:26+00:00,2025-09-07 11:45:26+02:00,5000.00,737.0,0:12:17,808.0,0:13:28,11.8,Ride,6.78,,,,9.28,,,,False,167.2,131.5,148.0,1.572946e+10
1,5.601627e+10,Lap 2,2.0,2.0,2025-09-07 09:58:56+00:00,2025-09-07 11:58:56+02:00,5000.00,854.0,0:14:14,1376.0,0:22:56,24.2,Ride,5.85,,,,8.58,,,,False,160.0,126.1,143.0,1.572946e+10
2,5.601627e+10,Lap 3,3.0,3.0,2025-09-07 10:21:53+00:00,2025-09-07 12:21:53+02:00,5000.00,745.0,0:12:25,975.0,0:16:15,6.0,Ride,6.71,,,,9.62,,,,False,169.5,127.9,146.0,1.572946e+10
3,5.601627e+10,Lap 4,4.0,4.0,2025-09-07 10:38:08+00:00,2025-09-07 12:38:08+02:00,5000.00,646.0,0:10:46,695.0,0:11:35,15.0,Ride,7.74,,,,9.78,,,,False,208.6,130.2,145.0,1.572946e+10
4,5.601627e+10,Lap 5,5.0,5.0,2025-09-07 10:49:43+00:00,2025-09-07 12:49:43+02:00,5000.00,661.0,0:11:01,684.0,0:11:24,25.4,Ride,7.56,,,,9.60,,,,False,214.8,132.5,140.0,1.572946e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851,5.229341e+10,Lap 30,30.0,30.0,2025-06-05 17:55:56+00:00,2025-06-05 19:55:56+02:00,60.68,40.0,0:00:40,40.0,0:00:40,0.0,Run,1.52,10:58,10.964912,1.0,1.74,9:35,9.578544,114.4,True,140.1,150.8,162.0,1.470704e+10
852,5.229341e+10,Lap 31,31.0,31.0,2025-06-05 17:56:36+00:00,2025-06-05 19:56:36+02:00,1000.00,401.0,0:06:41,401.0,0:06:41,0.0,Run,2.49,6:42,6.693440,2.0,2.92,5:42,5.707763,162.6,True,296.9,148.4,153.0,1.470704e+10
853,5.229341e+10,Lap 32,32.0,32.0,2025-06-05 18:03:18+00:00,2025-06-05 20:03:18+02:00,20.16,8.0,0:00:08,8.0,0:00:08,0.0,Run,2.52,6:37,6.613757,2.0,2.52,6:37,6.613757,161.4,True,297.7,146.9,147.0,1.470704e+10
854,5.224926e+10,Lap 1,1.0,1.0,2025-06-04 13:58:52+00:00,2025-06-04 15:58:52+02:00,0.00,3763.0,1:02:43,3763.0,1:02:43,0.0,WeightTraining,0.00,,,,0.00,,,,False,,90.8,129.0,1.469469e+10


In [37]:
laps_df_dtype_map = {
"id_x": BigInteger,
"name": String,
"lap_index": Integer,
"split": Integer,
"start_date_dt": DateTime(timezone=True),
"start_date_local_dt": DateTime(timezone=True),
"distance": Float,
"moving_time": Integer,
"moving_time_td": Interval,
"elapsed_time": Integer,
"elapsed_time_td": Interval,
"total_elevation_gain": Float,
"type": String,
"average_speed": Float,
"avg_pace_str": String,
"avg_pace_float": Float,
"pace_zone": Float,
"max_speed": Float,
"max_pace_str": String,
"max_pace_float": Float,
"average_cadence": Float,
"device_watts": Boolean,
"average_watts": Float,
"average_heartrate": Float,
"max_heartrate": Float,
"activity_id": BigInteger,
}

with engine.begin() as conn:
    conn.exec_driver_sql("CREATE SCHEMA IF NOT EXISTS silver;")

laps_df.to_sql(
    name="laps",
    schema="silver",
    con=engine,
    if_exists="replace",
    index=False,
    dtype=laps_df_dtype_map,
    method="multi",
    chunksize=1000
)


-1

## Best efforts Dataframe

In [38]:
mov_vals = [(timedelta(seconds=int(s)) if pd.notnull(s) else None) for s in best_eff_df["moving_time"]]
ela_vals = [(timedelta(seconds=int(s)) if pd.notnull(s) else None) for s in best_eff_df["elapsed_time"]]

best_eff_df.loc[:, "moving_time_td"]  = pd.Series(mov_vals, dtype="object")
best_eff_df.loc[:, "elapsed_time_td"] = pd.Series(ela_vals, dtype="object")

best_eff_df["start_date_dt"] = pd.to_datetime(best_eff_df["start_date"], utc=True)
best_eff_df["start_date_local_dt"] = pd.to_datetime(best_eff_df["start_date_local"], utc=False)
best_eff_df['utc_offset'] = (best_eff_df["start_date_local_dt"] - best_eff_df["start_date_dt"]).dt.total_seconds() / 60
best_eff_df["tz"] = best_eff_df["utc_offset"].apply(
    lambda m: timezone(timedelta(minutes=int(m))) if pd.notna(m) else None
)
best_eff_df["start_date_local_dt"] = best_eff_df.apply(
    lambda row: row["start_date_dt"].tz_convert(row["tz"]),
    axis=1
)
best_eff_df

Unnamed: 0,id,name,elapsed_time,moving_time,start_date,start_date_local,distance,start_index,end_index,activity_id,moving_time_td,elapsed_time_td,start_date_dt,start_date_local_dt,utc_offset,tz
1,6.592842e+10,400m,112.0,112.0,2025-09-06T10:41:12Z,2025-09-06T12:41:12Z,400.0,6329.0,6441.0,1.571682e+10,0:03:49,0:03:49,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,120.0,UTC+02:00
2,6.592842e+10,1/2 mile,229.0,229.0,2025-09-06T10:41:12Z,2025-09-06T12:41:12Z,805.0,6215.0,6444.0,1.571682e+10,0:04:46,0:04:46,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,120.0,UTC+02:00
3,6.592842e+10,1K,286.0,286.0,2025-09-06T10:41:12Z,2025-09-06T12:41:12Z,1000.0,6157.0,6443.0,1.571682e+10,0:07:45,0:07:45,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,120.0,UTC+02:00
4,6.592842e+10,1 mile,465.0,465.0,2025-09-06T10:41:12Z,2025-09-06T12:41:12Z,1609.0,5977.0,6442.0,1.571682e+10,0:15:36,0:15:36,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,120.0,UTC+02:00
5,6.592842e+10,2 mile,936.0,936.0,2025-09-06T10:41:12Z,2025-09-06T12:41:12Z,3219.0,5496.0,6432.0,1.571682e+10,0:24:22,0:24:22,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,120.0,UTC+02:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,6.188356e+10,1/2 mile,206.0,206.0,2025-06-05T17:15:13Z,2025-06-05T19:15:13Z,805.0,714.0,920.0,1.470704e+10,,,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,120.0,UTC+02:00
316,6.188356e+10,1K,293.0,293.0,2025-06-05T17:15:13Z,2025-06-05T19:15:13Z,1000.0,2016.0,2309.0,1.470704e+10,,,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,120.0,UTC+02:00
317,6.188356e+10,1 mile,457.0,457.0,2025-06-05T17:15:13Z,2025-06-05T19:15:13Z,1609.0,423.0,880.0,1.470704e+10,,,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,120.0,UTC+02:00
318,6.188356e+10,2 mile,964.0,964.0,2025-06-05T17:15:13Z,2025-06-05T19:15:13Z,3219.0,421.0,1385.0,1.470704e+10,,,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,120.0,UTC+02:00


In [39]:
best_eff_df_cols_clean = [
    'id',
    'name',
    'start_date_dt',
    'start_date_local_dt',
    'distance',
    'moving_time',
    'moving_time_td',
    'elapsed_time',
    'elapsed_time_td',
    'activity_id'
]
best_eff_df = best_eff_df[best_eff_df_cols_clean]
best_eff_df

Unnamed: 0,id,name,start_date_dt,start_date_local_dt,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,activity_id
1,6.592842e+10,400m,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,400.0,112.0,0:03:49,112.0,0:03:49,1.571682e+10
2,6.592842e+10,1/2 mile,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,805.0,229.0,0:04:46,229.0,0:04:46,1.571682e+10
3,6.592842e+10,1K,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,1000.0,286.0,0:07:45,286.0,0:07:45,1.571682e+10
4,6.592842e+10,1 mile,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,1609.0,465.0,0:15:36,465.0,0:15:36,1.571682e+10
5,6.592842e+10,2 mile,2025-09-06 10:41:12+00:00,2025-09-06 12:41:12+02:00,3219.0,936.0,0:24:22,936.0,0:24:22,1.571682e+10
...,...,...,...,...,...,...,...,...,...,...
315,6.188356e+10,1/2 mile,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,805.0,206.0,,206.0,,1.470704e+10
316,6.188356e+10,1K,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,1000.0,293.0,,293.0,,1.470704e+10
317,6.188356e+10,1 mile,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,1609.0,457.0,,457.0,,1.470704e+10
318,6.188356e+10,2 mile,2025-06-05 17:15:13+00:00,2025-06-05 19:15:13+02:00,3219.0,964.0,,964.0,,1.470704e+10


In [40]:
best_eff_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "start_date_dt": DateTime(timezone=True),
    "start_date_local_dt": DateTime(timezone=True),
    "distance": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "activity_id": BigInteger,
}

with engine.begin() as conn:
    conn.exec_driver_sql("CREATE SCHEMA IF NOT EXISTS silver;")

best_eff_df.to_sql(
    name="best_efforts",
    schema="silver",
    con=engine,
    if_exists="replace",
    index=False,
    dtype=best_eff_df_dtype_map,
    method="multi",
    chunksize=1000
)

-1