### Import and config

In [1]:
# Imports
import os
import logging
from datetime import timezone, timedelta

from dotenv import load_dotenv

import pandas as pd
import numpy as np

from sqlalchemy import create_engine, text, Integer, Float, String, Boolean, DateTime, Interval, Text, BigInteger

# Geopy - Nominatim
import json, time
from tqdm import tqdm
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Configuration
load_dotenv()

# DB
DB_URI = os.getenv('DB_URI')

# Bronze tables
TARGET_B_SCHEMA = os.getenv('TARGET_B_SCHEMA')
ACTIVITIES_B_TABLE = os.getenv('ACTIVITIES_B_TABLE')
DETAILS_B_TABLE = os.getenv('DETAILS_B_TABLE')
KUDOS_B_TABLE = os.getenv('KUDOS_B_TABLE')

# Silver tables
TARGET_S_SCHEMA = os.getenv('TARGET_S_SCHEMA')
ACTIVITIES_S_TABLE = os.getenv('ACTIVITIES_S_TABLE')
BEST_EFFORTS_S_TABLE = os.getenv('BEST_EFFORTS_S_TABLE')
GEAR_S_TABLE = os.getenv('GEAR_S_TABLE')
LAPS_S_TABLE = os.getenv('LAPS_S_TABLE')
MAPS_S_TABLE = os.getenv('MAPS_S_TABLE')
SEGMENTS_S_TABLE = os.getenv('SEGMENTS_S_TABLE')
SEGMENTS_EFFORTS_S_TABLE = os.getenv('SEGMENTS_EFFORTS_S_TABLE')
LOCATIONS_S_TABLE = os.getenv('LOCATIONS_S_TABLE')
KUDOS_S_TABLE = os.getenv('KUDOS_S_TABLE')

# Other
LOG_LEVEL = os.getenv('LOG_LEVEL')
NO_GEAR_ID = 'x00000000'

# Geopy - Nominatim
PRECISION = int(os.getenv('PRECISION'))
CACHE_PATH = os.getenv('CACHE_PATH')
USER_AGENT = os.getenv('USER_AGENT')

logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
    format="%(asctime)s | %(levelname)s | %(message)s"
)

pd.set_option('display.max_columns', None)

### DB names validation

In [2]:
REQUIRED_DB_ENV = ['DB_URI', 'TARGET_B_SCHEMA', 'ACTIVITIES_B_TABLE', 'DETAILS_B_TABLE', 'TARGET_S_SCHEMA', 'ACTIVITIES_S_TABLE', 'BEST_EFFORTS_S_TABLE', 'GEAR_S_TABLE', 'LAPS_S_TABLE', 'MAPS_S_TABLE', 'SEGMENTS_S_TABLE', 'SEGMENTS_EFFORTS_S_TABLE']
missing_db_env = [env for env in REQUIRED_DB_ENV if not os.getenv(env)]
if missing_db_env:
  raise RuntimeError(f"Missing env variables: {', '.join(missing_db_env)}.")

### Request data from `bronze.activities_details` and `bronze.kudos`

In [3]:
engine = create_engine(
  DB_URI, 
  pool_pre_ping=True, 
  pool_size=5, 
  max_overflow=10
)
logging.info("Connection established")

2025-09-22 12:02:03,922 | INFO | Connection established


In [4]:
with engine.begin() as conn:
  activities_details_df = pd.read_sql(text(f"SELECT * FROM {TARGET_B_SCHEMA}.{DETAILS_B_TABLE}"), conn)
  kudos_df = pd.read_sql(text(f"SELECT * FROM {TARGET_B_SCHEMA}.{KUDOS_B_TABLE}"), conn)
logging.info(f"Data from {TARGET_B_SCHEMA}.{DETAILS_B_TABLE} and {TARGET_B_SCHEMA}.{KUDOS_B_TABLE} downloaded.")

2025-09-22 12:02:04,252 | INFO | Data from bronze.activities_details and bronze.kudos downloaded.


In [5]:
activities_details_df.head()

Unnamed: 0,resource_state,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id,start_date,start_date_local,timezone,utc_offset,location_city,location_state,location_country,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,private,visibility,flagged,gear_id,start_latlng,end_latlng,average_speed,max_speed,average_cadence,average_watts,max_watts,weighted_average_watts,device_watts,kilojoules,has_heartrate,average_heartrate,max_heartrate,heartrate_opt_out,display_hide_heartrate_option,elev_high,elev_low,upload_id,upload_id_str,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,suffer_score,description,calories,perceived_exertion,prefer_perceived_exertion,segment_efforts,splits_metric,splits_standard,laps,best_efforts,stats_visibility,hide_from_home,device_name,embed_token,available_zones,athlete_id,athlete_resource_state,map_id,map_polyline,map_resource_state,map_summary_polyline,gear_primary,gear_name,gear_nickname,gear_resource_state,gear_retired,gear_distance,gear_converted_distance,photos_primary,photos_count,similar_activities_effort_count,similar_activities_average_speed,similar_activities_min_average_speed,similar_activities_mid_average_speed,similar_activities_max_average_speed,similar_activities_pr_rank,similar_activities_frequency_milestone,similar_activities_trend_speeds,similar_activities_trend_current_activity_index,similar_activities_trend_min_speed,similar_activities_trend_mid_speed,similar_activities_trend_max_speed,similar_activities_trend_direction,similar_activities_resource_state,average_temp,private_note,photos_primary_unique_id,photos_primary_urls_600,photos_primary_urls_100,photos_primary_source,photos_primary_media_type,photos_use_primary_photo
0,3,Lunch Weight Training,0.0,6066,6066,0.0,WeightTraining,WeightTraining,,14086094444,2025-04-05T09:39:40Z,2025-04-05T11:39:40Z,(GMT+02:00) Africa/Blantyre,7200.0,,,,0,9,0,1,0,True,False,False,False,followers_only,False,,[],[],0.0,0.0,,,,,,,True,106.9,149.0,False,True,0.0,0.0,15035940000.0,15035936606,garmin_ping_426122504181,False,0,0,False,17.0,,552.0,,False,[],,,"[{'id': 49980673377, 'name': 'Lap 1', 'split':...",,"[{'type': 'heart_rate', 'visibility': 'everyon...",False,Garmin Forerunner 945,82075ce6443833456de2feeb79ad874381da038b,[heartrate],81055898,1,a14086094444,,3,,,,,,,,,,0,,,,,,,,,,,,,,,27.0,,,,,,,
1,3,Evening Walk,5035.5,3637,3829,17.0,Walk,Walk,,14080030310,2025-04-04T16:37:40Z,2025-04-04T18:37:40Z,(GMT+01:00) Europe/Warsaw,7200.0,,,,0,8,0,2,0,False,False,False,False,followers_only,False,g19800575,"[51.107619, 17.124085]","[51.108098, 17.124984]",1.385,3.4,52.2,,,,,,True,94.3,107.0,False,True,125.6,114.6,15029560000.0,15029564885,garmin_ping_425899048795,False,0,0,False,7.0,Dokrƒôcanie krok√≥wüôÇ‚Äç‚ÜîÔ∏è,344.0,,False,"[{'id': 3343296484852327920, 'name': 'BƒÖczek c...","[{'split': 1, 'distance': 1000.4, 'pace_zone':...","[{'split': 1, 'distance': 1615.1, 'pace_zone':...","[{'id': 49956432295, 'name': 'Lap 1', 'split':...",,"[{'type': 'heart_rate', 'visibility': 'everyon...",False,Garmin Forerunner 945,33a9d220712243f8e49e182755025746b35e4118,[heartrate],81055898,1,a14080030310,q}|vHopogB?BH@VJFGD]\|@H\?HBDDAJJJd@DJFANJ@HJL...,3,mh|vH{togBP`@F\G|@GTSVGAKd@IRKJ?BOZQVMDIHCJUHG...,False,Nike Invincible Run 3 White,White,2.0,True,430102.0,430.1,,0,,,,,,,,,,,,,,,25.0,,,,,,,
2,3,Morning Run,10030.0,3953,3953,21.0,Run,Run,0.0,12956260994,2024-11-22T05:54:04Z,2024-11-22T06:54:04Z,(GMT+01:00) Europe/Warsaw,3600.0,,,,0,8,0,1,0,False,False,False,False,everyone,False,g19800575,"[51.107261, 17.123889]","[51.107572, 17.124193]",2.537,3.42,82.7,295.0,601.0,295.0,True,1166.1,True,150.5,157.0,False,True,127.6,111.2,13816150000.0,13816147384,garmin_ping_387799325704,False,0,0,False,78.0,10km Easy Run with Runna ‚úÖ\n\n10km easy run at...,787.0,,False,"[{'id': 3294936864762577520, 'name': 'Pƒôtla od...","[{'split': 1, 'distance': 1001.7, 'pace_zone':...","[{'split': 1, 'distance': 1609.9, 'pace_zone':...","[{'id': 45542468202, 'name': 'Lap 1', 'split':...","[{'id': 55038759090, 'name': '400m', 'athlete'...","[{'type': 'heart_rate', 'visibility': 'everyon...",False,Garmin Forerunner 945,8ba65750ff79254f8eba7f6d29358d57bc623c1a,"[heartrate, pace, power]",81055898,1,a12956260994,k{|vHgoogB^`AZd@BLHJF?DAXa@FCJJLPRPf@]^a@HQb@]...,3,uh|vHmuogB\`ABTAZGZKTe@p@a@v@c@l@KJ{@vAURcAvA}...,False,Nike Invincible Run 3 White,White,2.0,True,430102.0,430.1,,0,1.0,2.537313,2.537313,2.537313,2.537313,,,[2.537313432835821],0.0,2.537313,2.537313,2.537313,0.0,2.0,9.0,,,,,,,
3,3,Evening Weight Training,0.0,3846,3846,0.0,WeightTraining,WeightTraining,,12945894446,2024-11-20T17:02:56Z,2024-11-20T18:02:56Z,(GMT+01:00) Africa/Algiers,3600.0,,,,0,5,0,1,0,True,False,False,False,followers_only,False,,[],[],0.0,0.0,,,,,,,True,105.8,142.0,False,True,0.0,0.0,13805260000.0,13805258478,garmin_ping_387403002282,False,0,0,False,10.0,Reska5Ô∏è‚É£,367.0,,False,[],,,"[{'id': 45504588397, 'name': 'Lap 1', 'split':...",,"[{'type': 'heart_rate', 'visibility': 'everyon...",False,Garmin Forerunner 945,f75f171b141157b24ebe3c0237814fc4baaa88e8,[heartrate],81055898,1,a12945894446,,3,,,,,,,,,,0,,,,,,,,,,,,,,,25.0,,,,,,,
4,3,Afternoon Run,8283.8,3154,3154,22.0,Run,Run,0.0,12944852535,2024-11-20T14:44:03Z,2024-11-20T15:44:03Z,(GMT+01:00) Europe/Warsaw,3600.0,,,,3,4,0,1,0,False,False,False,False,everyone,False,g19800575,"[51.10735, 17.12442]","[51.107565, 17.124136]",2.626,4.62,81.8,318.6,448.0,318.0,True,1005.0,True,149.6,163.0,False,True,128.0,114.8,13804140000.0,13804143758,garmin_ping_387374107952,False,0,0,False,57.0,8km Easy Run with Runna ‚úÖ\n\n8km easy run at a...,637.0,,False,"[{'id': 3294344204331294624, 'name': 'Pƒôtla od...","[{'split': 1, 'distance': 1000.5, 'pace_zone':...","[{'split': 1, 'distance': 1611.3, 'pace_zone':...","[{'id': 45500475514, 'name': 'Lap 1', 'split':...","[{'id': 54997510055, 'name': '400m', 'athlete'...","[{'type': 'heart_rate', 'visibility': 'everyon...",False,Garmin Forerunner 945,7f488a045f51ea466f08f8cfec3c4e5916058cea,"[heartrate, pace, power]",81055898,1,a12944852535,}{|vHsrogB^zAF^h@lAPNJ@NIV_@BDH\TRLDJEJ[HEd@i@...,3,oh|vHuuogBRd@H\?^AVKR_A|Ag@~@[`@m@j@gCpDa@v@[\...,False,Nike Invincible Run 3 White,White,2.0,True,430102.0,430.1,,0,6.0,2.52247,2.071395,2.385717,2.812496,3.0,,"[2.248123840707158, 2.3815517814976417, 2.4764...",4.0,2.071395,2.385717,2.812496,1.0,2.0,11.0,,,,,,,


In [6]:
kudos_df.head()

Unnamed: 0,resource_state,firstname,lastname,activity_id,kudos_id,id
0,2,Filip,C.,15831049874,0,15831049874-0
1,2,Ola,≈Å.,15831049874,1,15831049874-1
2,2,Mal,C.,15831049874,2,15831049874-2
3,2,Agnieszka,G.,15831049874,3,15831049874-3
4,2,Wies≈Çawa,C.,15831049874,4,15831049874-4


### Separate tables setup

In [7]:
dataframe_columns = {
  'activities' : [
    'id',
    'name',
    'distance',
    'moving_time',
    'elapsed_time',
    'total_elevation_gain',
    'type',
    'sport_type',
    'workout_type',
    'start_date',
    'start_date_local',
    'timezone',
    'achievement_count',
    'kudos_count',
    'comment_count',
    'athlete_count',
    'photo_count',
    'trainer',
    'commute',
    'manual',
    'visibility',
    'start_latlng',
    'end_latlng',
    'average_speed',
    'max_speed',
    'average_cadence',
    'average_watts',
    'max_watts',
    'weighted_average_watts',
    'has_heartrate',
    'average_heartrate',
    'max_heartrate',
    'elev_high',
    'elev_low',
    'pr_count',
    'total_photo_count',
    'suffer_score',
    'description',
    'calories',
    'device_name',
    'map_id',
    'gear_id'],
  'maps' : [
    'map_id',
    'map_polyline',
    'map_summary_polyline'],
  'gear' : [
    'gear_id',
    'gear_name',
    'gear_distance',
    'gear_converted_distance',
    'start_date',
    'start_date_local'],
  'segment_efforts' : [
    'id',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'start_index',
    'end_index',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'pr_rank',
    'achievements',
    'visibility',
    'kom_rank',
    'hidden',
    'activity_id',
    'segment_id'],
  'segments' : [
    'segment_id',
    'segment_name',
    'segment_activity_type',
    'segment_distance',
    'segment_average_grade',
    'segment_maximum_grade',
    'segment_elevation_high',
    'segment_elevation_low',
    'segment_start_latlng',
    'segment_end_latlng',
    'segment_elevation_profile',
    'segment_elevation_profiles',
    'segment_climb_category',
    'segment_private',
    'segment_hazardous',
    'segment_starred'],
  'laps' : [
    'id',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'average_speed',
    'max_speed',
    'lap_index',
    'split',
    'start_index',
    'end_index',
    'total_elevation_gain',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'pace_zone',
    'activity_id'],
  'best_efforts' : [
    'id',
    'activity_id',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'pr_rank',
    'achievements',
    'start_index',
    'end_index']
}

### Spliting data into tables

In [8]:
def select_cols(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
  """
  Select only the specified columns from a DataFrame if they exist.

  Parameters
  ----------
  df : pd.DataFrame
      The input DataFrame.
  cols : list of str
      List of column names to select.

  Returns
  -------
  pd.DataFrame
      A new DataFrame containing only the specified columns that exist 
      in the input DataFrame. If none of the columns exist, 
      an empty DataFrame is returned.
  """
  
  existing = [c for c in cols if c in df.columns]
  
  return df[existing].copy() if existing else pd.DataFrame()

def explode_normalize_json(df: pd.DataFrame, col: str, id_col: str | None = None, id_name: str | None = None) -> pd.DataFrame:
  """
  Explode a list-like column into multiple rows and normalize nested JSON/dict objects 
  into a flat tabular structure.

  This function is useful for columns containing arrays of JSON objects 
  (e.g. laps, segment efforts). Each element of the list becomes a separate row, 
  and nested fields are flattened into individual columns. Optionally, 
  a parent identifier column can be retained/renamed to act as a foreign key.

  Parameters
  ----------
  df : pd.DataFrame
      Input DataFrame containing the column to explode.
  col : str
      Name of the column with list- or dict-like values to explode and normalize.
  id_col : str, optional
      Name of the column in the input DataFrame to keep as a parent identifier.
      If provided, it will be included in the output.
  id_name : str, optional
      If provided together with `id_col`, renames the identifier column 
      in the result (e.g. from "id" to "activity_id").

  Returns
  -------
  pd.DataFrame
      A new DataFrame where:
        * each list element from `col` is a separate row,
        * nested JSON/dict objects are flattened into columns with names joined by "_",
        * the parent identifier (`id_col`) is preserved and optionally renamed.
      If `col` is missing or contains only null/empty values, 
      an empty DataFrame is returned.
  """

  if col not in df.columns:
    return pd.DataFrame()
  
  base_cols = [col]

  if id_col and id_col in df.columns:
    base_cols.insert(0, id_col)

  base = df[base_cols].copy()
  exploded = base.explode(col, ignore_index=True)
  values = exploded[col].dropna()

  if values.empty:
    return pd.DataFrame()
  
  norm = pd.json_normalize(values, sep='_')
  out = exploded.loc[values.index].drop(columns=[col]).reset_index(drop=True)
  res = pd.concat([out.reset_index(drop=True), norm.reset_index(drop=True)], axis=1)
  
  if id_col and id_name and id_col in df.columns:
    res = res.rename(columns={id_col: id_name})
  return res

In [9]:
# Activities
activities_cols = dataframe_columns['activities']
activities_df = select_cols(activities_details_df, activities_cols)
logging.info("DataFrame 'activities_df' created.")

# Maps
maps_cols = dataframe_columns['maps']
maps_df = select_cols(activities_details_df, maps_cols)
logging.info("DataFrame 'maps_df' created.")

# Gear
gear_cols = dataframe_columns['gear']
gear_df = select_cols(activities_details_df, gear_cols)
logging.info("DataFrame 'gear_df' created.")

# Segment efforts
seg_eff_cols = dataframe_columns['segment_efforts']
segments_eff_df = explode_normalize_json(activities_details_df, 'segment_efforts')
segments_eff_df = select_cols(segments_eff_df, seg_eff_cols)
logging.info("DataFrame 'segments_eff_df' created.")

# Segments
seg_cols = dataframe_columns['segments']
segments_df = explode_normalize_json(activities_details_df, 'segment_efforts')
segments_df = select_cols(segments_df, seg_cols)
logging.info("DataFrame 'segments_df' created.")

# Laps
lap_cols = dataframe_columns['laps']
laps_df = explode_normalize_json(activities_details_df, 'laps')
laps_df = select_cols(laps_df, lap_cols)
logging.info("DataFrame 'laps_df' created.")

# Best efforts
best_eff_cols = dataframe_columns['best_efforts']
best_eff_df = explode_normalize_json(activities_details_df, 'best_efforts')
best_eff_df = select_cols(best_eff_df, best_eff_cols)
logging.info("DataFrame 'best_eff_df' created.")

# All dataframes in dictionary
dataframes = {
    "activities": activities_df,
    "maps": maps_df,
    "gear": gear_df,
    "segment_efforts": segments_eff_df,
    "segments": segments_df,
    "laps": laps_df,
    "best_efforts": best_eff_df
}

2025-09-22 12:02:04,318 | INFO | DataFrame 'activities_df' created.
2025-09-22 12:02:04,319 | INFO | DataFrame 'maps_df' created.
2025-09-22 12:02:04,319 | INFO | DataFrame 'gear_df' created.
2025-09-22 12:02:04,380 | INFO | DataFrame 'segments_eff_df' created.
2025-09-22 12:02:04,436 | INFO | DataFrame 'segments_df' created.
2025-09-22 12:02:04,484 | INFO | DataFrame 'laps_df' created.
2025-09-22 12:02:04,499 | INFO | DataFrame 'best_eff_df' created.
2025-09-22 12:02:04,500 | INFO | DataFrame 'workout_types_df' created.


### Activities Dataframe

In [10]:
def speed_to_pace_str(speed: float) -> str | None:
  """
  Convert speed in meters per second to running pace as a string.

  Parameters
  ----------
  speed : float
      Speed value in meters per second. Must be greater than zero.

  Returns
  -------
  str or None
      Running pace in the format "M:SS" representing minutes per kilometer.
      For example, "5:32" means 5 minutes and 32 seconds per kilometer.
      Returns None if the speed is less than or equal to zero.
  """

  if speed <= 0:
    return None
  
  seconds = 1000/speed
  minutes = int(seconds // 60)
  sec = int(round(seconds % 60))

  if sec == 60:
    minutes += 1
    sec = 0

  return f"{minutes}:{sec:02d}"

def speed_to_pace_float(speed: float) -> float | None:
  """
  Convert speed in meters per second to running pace as a float.

  Parameters
  ----------
  speed : float
      Speed value in meters per second. Must be greater than zero.

  Returns
  -------
  float or None
      Running pace in minutes per kilometer, represented as a float.
      For example, 5.53 means approximately 5 minutes and 32 seconds per kilometer.
      Returns None if the speed is less than or equal to zero.
  """

  if speed <= 0:
    return None
  
  return 1000 / speed / 60

def extract_timedelta(time: pd.Series) -> pd.Series:
  """
  Convert a Series of numeric values (seconds) into timedeltas.

  Parameters
  ----------
  time : pd.Series
      Series containing durations expressed in seconds (int/float). 
      Null values are preserved as None.

  Returns
  -------
  pd.Series
      Series of Python ``datetime.timedelta`` objects. 
      Each element corresponds to the given number of seconds or None if missing.
  """

  return pd.Series([(timedelta(seconds=int(t)) if pd.notnull(t) else None) for t in time], dtype="object")

def extract_latlng(latlng: pd.Series) -> pd.DataFrame:
  """
  Split a Series of latitude/longitude pairs into a DataFrame with separate columns.

  Parameters
  ----------
  latlng : pd.Series
      Series where each element is expected to be a list or tuple of length 2 
      (latitude, longitude). If the element is not a valid pair, it is replaced 
      with [None, None].

  Returns
  -------
  pd.DataFrame
      DataFrame with two columns:
        * first column: latitude
        * second column: longitude
      The index is preserved from the input Series.
  """
  latlng = latlng.apply(
    lambda row: row if isinstance(row, (list, tuple)) and len(row) == 2 else [None, None]
  )
  return pd.DataFrame(latlng.tolist(), index=latlng.index)

def etc_gmt_from_offset(minutes: int) -> str:
    """
    Convert a UTC offset (in minutes) to an IANA fixed-offset zone name ``Etc/GMT¬±N``.

    Parameters
    ----------
    minutes : int
        Offset from UTC in minutes.
        Positive values mean UTC+ (east of Greenwich), negative mean UTC‚àí (west).

    Returns
    -------
    str
        IANA timezone name. For whole-hour offsets the format is ``Etc/GMT¬±H``,
        e.g. ``Etc/GMT-2`` for +120 minutes and ``Etc/GMT+5`` for ‚àí300 minutes.
        For non-hour offsets, minutes are included, e.g. ``Etc/GMT-2:30``.

    Notes
    -----
    - The ``Etc/GMT`` naming convention uses an **inverted sign** relative to ISO 8601:
      ``UTC+02:00 ‚Üí Etc/GMT-2`` and ``UTC-05:00 ‚Üí Etc/GMT+5``.
    - ``Etc/GMT`` zones are fixed-offset and **do not observe DST**.

    Examples
    --------
    >>> etc_gmt_from_offset(120)
    'Etc/GMT-2'
    >>> etc_gmt_from_offset(-300)
    'Etc/GMT+5'
    >>> etc_gmt_from_offset(150)
    'Etc/GMT-2:30'
    """

    sign = '-' if minutes > 0 else '+'
    h, m = divmod(abs(minutes), 60)
    return f"Etc/GMT{sign}{h}" if m == 0 else f"Etc/GMT{sign}{h}:{m:02d}"

def create_datetime_tz_cols(df: pd.DataFrame, date_col: str, date_col_local: str) -> pd.DataFrame:
  """
  Derive a UTC timestamp and a fixed-offset timezone name from UTC and local datetimes.

  The function parses a UTC datetime column and a corresponding local datetime column
  (both representing the same instant), computes the offset in minutes
  ``local - utc``, and maps that offset to a fixed-offset IANA zone name
  using ``Etc/GMT¬±H[:MM]``. It returns a DataFrame with the UTC timestamp and
  the derived timezone name.

  Parameters
  ----------
  df : pd.DataFrame
      Input DataFrame containing the UTC and local datetime columns.
  date_col : str
      Name of the column with UTC datetimes (string or datetime-like).
      Values are parsed to a tz-aware UTC dtype.
  date_col_local : str
      Name of the column with local datetimes (string or datetime-like).
      Values are used only to infer the UTC offset.

  Returns
  -------
  pd.DataFrame
      A DataFrame with two columns:
        * ``start_date_utc_dt`` ‚Äî tz-aware UTC timestamp (dtype ``datetime64[ns, UTC]``),
        * ``tz`` ‚Äî fixed-offset IANA zone name in the ``Etc/GMT`` family
          (e.g., ``"Etc/GMT-2"`` for UTC+02:00, ``"Etc/GMT+5"`` for UTC‚àí05:00).

  Notes
  -----
  - ``Etc/GMT`` zones are fixed offsets and **do not observe DST**. The sign is
    intentionally inverted by IANA naming convention: UTC+02:00 ‚Üí ``Etc/GMT-2``.
  - Both columns must refer to the same moment in time; otherwise the inferred
    offset (and thus ``tz``) will be incorrect.

  Raises
  ------
  KeyError
      If ``date_col`` or ``date_col_local`` is missing in ``df``.
  ValueError
      If datetime parsing fails.
  """
  
  if date_col not in df.columns or date_col_local not in df.columns:
        raise KeyError(f"Missing required columns: {date_col}, {date_col_local}")

  temp_df = pd.DataFrame()
  temp_df["start_date_utc_dt"] = pd.to_datetime(df[date_col])
  temp_df["start_date_local_dt"] = pd.to_datetime(df[date_col_local])
  temp_df["utc_offset"] = (temp_df["start_date_local_dt"] - temp_df["start_date_utc_dt"]).dt.total_seconds() / 60

  temp_df["tz"] = temp_df["utc_offset"].apply(etc_gmt_from_offset)
  
  return temp_df[["start_date_utc_dt", "start_date_local_dt", "tz"]]

activity_details

In [11]:
activities_df.loc[:, "moving_time_td"]  = extract_timedelta(activities_df["moving_time"])
activities_df.loc[:, "elapsed_time_td"] = extract_timedelta(activities_df["elapsed_time"])

activities_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(activities_df, "start_date", "start_date_local")

activities_df[["start_lat", "start_lng"]]  = extract_latlng(activities_df["start_latlng"])
activities_df[["end_lat", "end_lng"]] = extract_latlng(activities_df["end_latlng"])

is_run = activities_df['type'] == 'Run'

activities_df.loc[is_run, 'average_cadence'] = activities_df['average_cadence'].apply(lambda x: x * 2)

activities_df.loc[is_run, 'avg_pace_str'] = activities_df['average_speed'].apply(speed_to_pace_str)
activities_df.loc[is_run, 'avg_pace_float'] = activities_df['average_speed'].apply(speed_to_pace_float)

activities_df.loc[is_run, 'max_pace_str'] = activities_df['max_speed'].apply(speed_to_pace_str)
activities_df.loc[is_run, 'max_pace_float'] = activities_df['max_speed'].apply(speed_to_pace_float)

activities_df['gear_id'] = activities_df['gear_id'].fillna(NO_GEAR_ID)


In [12]:
activities_cols_clean = [
    'id',
    'name',
    'start_date_utc_dt',
    'start_date_local_dt',
    'local_timezone',
    'distance',
    'moving_time',
    'moving_time_td',
    'elapsed_time',
    'elapsed_time_td',
    'total_elevation_gain',
    'elev_low',
    'elev_high',
    'type',
    'sport_type',
    'workout_type',
    'achievement_count',
    'kudos_count',
    'comment_count',
    'athlete_count',
    'photo_count',
    'trainer',
    'commute',
    'manual',
    'visibility',
    'average_speed',
    'avg_pace_str',
    'avg_pace_float',
    'max_speed',
    'max_pace_str',
    'max_pace_float',
    'average_cadence',
    'average_watts',
    'max_watts',
    'weighted_average_watts',
    'has_heartrate',
    'average_heartrate',
    'max_heartrate',
    'pr_count',
    'total_photo_count',
    'suffer_score',
    'description',
    'calories',
    'device_name',
    'start_lat',
    'start_lng',
    'map_id',
    'gear_id'
]
activities_df = activities_df[activities_cols_clean]
activities_df = activities_df.sort_values(by='start_date_utc_dt', ascending=False)

In [13]:
activities_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,elev_low,elev_high,type,sport_type,workout_type,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,total_photo_count,suffer_score,description,calories,device_name,start_lat,start_lng,map_id,gear_id
1094,15855640218,K200süèéÔ∏è,2025-09-18 15:30:30+00:00,2025-09-18 17:30:30+00:00,Etc/GMT-2.0,9915.6,3272,0:54:32,3314,0:55:14,10.0,115.6,122.8,Run,Run,3.0,0,9,0,1,0,False,False,False,everyone,3.03,5:30,5.50055,4.82,3:27,3.457815,164.4,365.2,546.0,382.0,True,151.7,176.0,0,0,60.0,K200s with Runna ‚úÖ\n\nKilometr√≥wki z dwusetkam...,765.0,Garmin Forerunner 970,51.107177,17.123797,a15855640218,g24134620
1093,15843349072,9km Easy RunüëΩ,2025-09-17 14:08:34+00:00,2025-09-17 16:08:34+00:00,Etc/GMT-2.0,9051.8,3095,0:51:35,3142,0:52:22,16.0,114.8,123.0,Run,Run,,0,6,0,1,0,False,False,False,everyone,2.925,5:42,5.698006,4.12,4:03,4.045307,169.4,367.5,513.0,364.0,True,145.1,152.0,0,0,41.0,9km Easy Run with Runna ‚úÖ\n\nWyjƒÖtkowo ≈ºwawe e...,699.0,Garmin Forerunner 970,51.107162,17.123739,a15843349072,g24134620
1092,15831049874,Afternoon Weight Training,2025-09-16 13:01:07+00:00,2025-09-16 15:01:07+00:00,Etc/GMT-2.0,0.0,3825,1:03:45,3825,1:03:45,0.0,0.0,0.0,Workout,WeightTraining,,0,5,1,1,0,True,False,False,followers_only,0.0,,,0.0,,,,,,,True,94.0,222.0,0,0,9.0,Reska8Ô∏è‚É£8Ô∏è‚É£\nBench press PR: 85kgü•≥,254.0,Garmin Forerunner 970,,,a15831049874,x00000000
1091,15820198827,Tempo 2kmü•µ,2025-09-15 14:23:21+00:00,2025-09-15 16:23:21+00:00,Etc/GMT-2.0,9521.9,3241,0:54:01,3241,0:54:01,13.0,115.4,125.0,Run,Run,3.0,0,8,0,1,0,False,False,False,everyone,2.938,5:40,5.672793,4.94,3:22,3.373819,167.8,353.5,493.0,369.0,True,153.1,178.0,0,0,66.0,Tempo 2km Repeats with Runna ‚úÖ\n\nWysz≈Ço troch...,735.0,Garmin Forerunner 970,51.107301,17.124098,a15820198827,g24134620
973,15805849875,15km Long Run‚òîÔ∏è,2025-09-14 07:59:25+00:00,2025-09-14 09:59:25+00:00,Etc/GMT-2.0,15059.0,5461,1:31:01,5488,1:31:28,31.0,114.6,125.4,Run,Run,2.0,0,4,0,1,0,False,False,False,everyone,2.758,6:03,6.043026,3.44,4:51,4.844961,172.0,335.5,455.0,334.0,True,144.8,153.0,0,0,66.0,15km Long Run with Runna ‚úÖ\n\nOkrutny beton po...,1170.0,Garmin Forerunner 970,51.107336,17.124136,a15805849875,g24134620


Load to PostgreSQL will be made after extracting location from coordinates

## Maps Dataframe

In [14]:
maps_df.columns = maps_df.columns.str.replace("^map_", "", regex=True)

In [15]:
maps_cols_clean = [
  'id',
  'polyline',
  'summary_polyline'
]
maps_df = maps_df[maps_cols_clean]

In [16]:
maps_df.head()

Unnamed: 0,id,polyline,summary_polyline
0,a14086094444,,
1,a14080030310,q}|vHopogB?BH@VJFGD]\|@H\?HBDDAJJJd@DJFANJ@HJL...,mh|vH{togBP`@F\G|@GTSVGAKd@IRKJ?BOZQVMDIHCJUHG...
2,a12956260994,k{|vHgoogB^`AZd@BLHJF?DAXa@FCJJLPRPf@]^a@HQb@]...,uh|vHmuogB\`ABTAZGZKTe@p@a@v@c@l@KJ{@vAURcAvA}...
3,a12945894446,,
4,a12944852535,}{|vHsrogB^zAF^h@lAPNJ@NIV_@BDH\TRLDJEJ[HEd@i@...,oh|vHuuogBRd@H\?^AVKR_A|Ag@~@[`@m@j@gCpDa@v@[\...


In [17]:
maps_df_dtype_map = {
    "id": String,
    "polyline": Text,
    "summary_polyline": Text
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{MAPS_S_TABLE} will be overwritten.")

maps_df.to_sql(
    name=MAPS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=maps_df_dtype_map,
    method="multi",
    chunksize=1000
)



-2

## Gear Dataframe

In [18]:
gear_df.columns = gear_df.columns.str.replace("^gear_", "", regex=True)
gear_df['id'] = gear_df['id'].fillna(NO_GEAR_ID)
gear_df['name'] = gear_df['name'].fillna('No gear')
gear_df = gear_df.fillna(0)
gear_df = gear_df.rename(columns={'distance' : 'distance_m', 'converted_distance' : 'distance_km'})
gear_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(gear_df, "start_date", "start_date_local")
gear_df = (
  gear_df.sort_values(by='start_date_utc_dt', ascending=False)
  .drop_duplicates(subset=["id"], keep="first")
  .reset_index(drop=True)
)
gear_df = gear_df[['id', 'name', 'distance_m', 'distance_km']]

In [19]:
gear_df

Unnamed: 0,id,name,distance_m,distance_km
0,g24134620,ASICS Novablast 5,333012.0,333.0
1,x00000000,No gear,0.0,0.0
2,b12572672,Cube Nuroad Pro,3734608.0,3734.6
3,g23642256,Adidas EVO SL,146967.0,147.0
4,g19800575,Nike Invincible Run 3 White,430102.0,430.1
5,g20426652,Nike Invincible Run 3 Blueprint,565167.0,565.2
6,g11783267,Nike Zoom Fly 4,272798.0,272.8
7,b13100260,Cube Nuroad Pro Wirtualnie,520771.0,520.8
8,g17673165,Nike Invincible Run 3 Black,666029.0,666.0
9,g11165677,New Balance 1080 v12,1194617.0,1194.6


In [20]:
gear_df_dtype_map = {
    "id": String,
    "name": String,
    "distance_m": Float,
    "distance_km": Float,
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{GEAR_S_TABLE} will be overwritten.")

gear_df.to_sql(
    name=GEAR_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=gear_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

## Segments efforts Dataframe

In [21]:
segments_types_df = segments_df[['segment_id', 'segment_activity_type']].copy()
segments_types_df.drop_duplicates(inplace=True)

segments_eff_df = pd.merge(segments_eff_df, segments_types_df, on='segment_id', how='left')
segments_eff_df = pd.merge(segments_eff_df.drop(columns="achievements"), explode_normalize_json(segments_eff_df, 'achievements', 'id'), on='id', how='left')

segments_eff_df.loc[:, "moving_time_td"]  = extract_timedelta(segments_eff_df["moving_time"])
segments_eff_df.loc[:, "elapsed_time_td"] = extract_timedelta(segments_eff_df["elapsed_time"])

segments_eff_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(segments_eff_df, "start_date", "start_date_local")

is_run = segments_eff_df['segment_activity_type'] == 'Run'

segments_eff_df.loc[is_run, 'average_cadence'] = segments_eff_df['average_cadence'].apply(lambda x: x * 2)

In [22]:
segments_eff_cols_clean = [
  'id',
  'name',
  'start_date_utc_dt',
  'start_date_local_dt',
  'local_timezone',
  'distance',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'average_cadence',
  'device_watts',
  'average_watts',
  'average_heartrate',
  'max_heartrate',
  'pr_rank',
  'visibility',
  'kom_rank',
  'hidden',
  'rank',
  'type',
  'activity_id',
  'segment_id'
]
segments_eff_df = segments_eff_df[segments_eff_cols_clean]

In [23]:
segments_eff_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,pr_rank,visibility,kom_rank,hidden,rank,type,activity_id,segment_id
0,3343296484852327920,BƒÖczek counterclockwise,2025-04-04 16:45:56+00:00,2025-04-04 18:45:56+00:00,Etc/GMT-2.0,3722.4,2692,0:44:52,2694,0:44:54,52.3,False,,94.9,107.0,,followers_only,,False,,,14080030310,38033619
1,3294936864762577520,Pƒôtla od ≈õluzy,2024-11-22 05:57:15+00:00,2024-11-22 06:57:15+00:00,Etc/GMT-1.0,3886.4,1449,0:24:09,1449,0:24:09,165.6,True,291.4,148.4,157.0,,everyone,,False,,,12956260994,17455167
2,3294936864763706992,Most Chrobrego- most Bartoszowicki,2024-11-22 06:09:05+00:00,2024-11-22 07:09:05+00:00,Etc/GMT-1.0,1582.0,624,0:10:24,624,0:10:24,166.2,True,285.2,149.4,155.0,,everyone,,False,,,12956260994,10082640
3,3294936864763212400,Swojczycki - Sluza revers,2024-11-22 06:20:54+00:00,2024-11-22 07:20:54+00:00,Etc/GMT-1.0,1580.4,631,0:10:31,631,0:10:31,165.8,True,293.3,150.9,156.0,,everyone,,False,,,12956260994,10082666
4,3294936864765383280,Pƒôtla od ≈õluzy,2024-11-22 06:21:28+00:00,2024-11-22 07:21:28+00:00,Etc/GMT-1.0,3886.4,1444,0:24:04,1444,0:24:04,165.6,True,292.3,151.8,157.0,,everyone,,False,,,12956260994,17455167


In [24]:
segments_eff_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "start_date_utc_dt": DateTime(timezone=False),
    "start_date_local_dt": DateTime(timezone=False),
    "distance": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "average_cadence": Float,
    "device_watts": Boolean,
    "average_watts": Float,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_rank": Integer,
    "visibility": String,
    "kom_rank": Integer,
    "hidden": Boolean,
    "rank": Integer,
    "type": String,
    "activity_id": BigInteger,
    "segment_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{SEGMENTS_EFFORTS_S_TABLE} will be overwritten.")

segments_eff_df.to_sql(
    name=SEGMENTS_EFFORTS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=segments_eff_df_dtype_map,
    method="multi",
    chunksize=1000
)



-6

## Segments Dataframe

In [25]:
segments_df.columns = segments_df.columns.str.replace("^segment_", "", regex=True)

segments_df[["start_lat", "start_lng"]]  = extract_latlng(segments_df["start_latlng"])
segments_df[["end_lat", "end_lng"]] = extract_latlng(segments_df["end_latlng"])

segments_df = segments_df.drop(columns=["start_latlng", "end_latlng"])
segments_df = segments_df.drop_duplicates()

In [26]:
segments_df.head()

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,elevation_profile,elevation_profiles,climb_category,private,hazardous,starred,start_lat,start_lng,end_lat,end_lng
0,38033619,BƒÖczek counterclockwise,Walk,3722.4,0.0,8.0,123.8,118.2,,,0,False,False,False,51.104196,17.124846,51.104209,17.124837
1,17455167,Pƒôtla od ≈õluzy,Run,3886.4,0.0,10.5,120.0,112.9,,,0,False,False,False,51.104195,17.124249,51.104082,17.124274
2,10082640,Most Chrobrego- most Bartoszowicki,Run,1582.0,0.0,2.5,118.2,115.5,,,0,False,False,False,51.113182,17.108164,51.102416,17.122673
3,10082666,Swojczycki - Sluza revers,Run,1580.4,0.0,2.7,118.8,114.6,,,0,False,False,False,51.103481,17.12464,51.114135,17.109978
6,22997595,Po kostce do Grobli,Run,456.5,0.2,7.5,117.0,116.0,,,0,False,False,False,51.104347,17.125358,51.101168,17.129389


In [27]:
segments_df[segments_df['id'] == 20350088]

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,elevation_profile,elevation_profiles,climb_category,private,hazardous,starred,start_lat,start_lng,end_lat,end_lng
4278,20350088,Tempus Fugit,VirtualRide,17297.1,0.0,2.0,17.6,12.6,,,0,False,False,False,-11.639187,166.982381,-11.639168,166.982381


Load to PostgreSQL will be made after extracting location from coordinates

## Laps Dataframe

In [28]:
laps_types_df = activities_df[['id', 'type']].copy()
laps_types_df.drop_duplicates(inplace=True)
laps_df = pd.merge(laps_df, laps_types_df, left_on='activity_id', right_on='id', how='left')

In [29]:
laps_df.loc[:, "moving_time_td"]  = extract_timedelta(laps_df["moving_time"])
laps_df.loc[:, "elapsed_time_td"] = extract_timedelta(laps_df["elapsed_time"])

laps_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(laps_df, "start_date", "start_date_local")

is_run = laps_df['type'] == 'Run'

laps_df.loc[is_run, 'average_cadence'] = laps_df['average_cadence'].apply(lambda x: x * 2)

laps_df.loc[is_run, 'avg_pace_str'] = laps_df['average_speed'].apply(speed_to_pace_str)
laps_df.loc[is_run, 'avg_pace_float'] = laps_df['average_speed'].apply(speed_to_pace_float)

laps_df.loc[is_run, 'max_pace_str'] = laps_df['max_speed'].apply(speed_to_pace_str)
laps_df.loc[is_run, 'max_pace_float'] = laps_df['max_speed'].apply(speed_to_pace_float)


In [30]:
laps_cols_df_clean = [
    'id_x',
    'name',
    'lap_index',
    'split',
    'start_date_utc_dt',
    'start_date_local_dt',
    'local_timezone',
    'distance',
    'moving_time',
    'moving_time_td',
    'elapsed_time',
    'elapsed_time_td',
    'total_elevation_gain',
    'type',
    'average_speed',
    'avg_pace_str',
    'avg_pace_float',
    'pace_zone',
    'max_speed',
    'max_pace_str',
    'max_pace_float',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'activity_id'
]

laps_df = laps_df[laps_cols_df_clean]
laps_df = laps_df.rename(columns={'id_x': 'id'})

In [31]:
laps_df.head()

Unnamed: 0,id,name,lap_index,split,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,type,average_speed,avg_pace_str,avg_pace_float,pace_zone,max_speed,max_pace_str,max_pace_float,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,activity_id
0,49980673377,Lap 1,1,1,2025-04-05 09:39:40+00:00,2025-04-05 11:39:40+00:00,Etc/GMT-2.0,0.0,6066,1:41:06,6066,1:41:06,0.0,WeightTraining,0.0,,,,0.0,,,,False,,106.9,149.0,14086094444
1,49956432295,Lap 1,1,1,2025-04-04 16:37:40+00:00,2025-04-04 18:37:40+00:00,Etc/GMT-2.0,1000.0,811,0:13:31,811,0:13:31,8.4,Walk,1.23,,,,3.4,,,51.7,False,,94.8,107.0,14080030310
2,49956432299,Lap 2,2,2,2025-04-04 16:37:40+00:00,2025-04-04 18:37:40+00:00,Etc/GMT-2.0,1000.0,745,0:12:25,745,0:12:25,3.8,Walk,1.34,,,,1.814,,,52.1,False,,95.0,107.0,14080030310
3,49956432304,Lap 3,3,3,2025-04-04 16:37:40+00:00,2025-04-04 18:37:40+00:00,Etc/GMT-2.0,1000.0,704,0:11:44,704,0:11:44,3.4,Walk,1.42,,,,1.709,,,52.9,False,,95.4,106.0,14080030310
4,49956432306,Lap 4,4,4,2025-04-04 16:37:40+00:00,2025-04-04 18:37:40+00:00,Etc/GMT-2.0,1000.0,720,0:12:00,720,0:12:00,0.0,Walk,1.39,,,,1.791,,,52.1,False,,94.1,100.0,14080030310


In [32]:
laps_df_dtype_map = {
"id": BigInteger,
"name": String,
"lap_index": Integer,
"split": Integer,
"start_date_utc_dt": DateTime(timezone=False),
"start_date_local_dt": DateTime(timezone=False),
'local_timezone' : String,
"distance": Float,
"moving_time": Integer,
"moving_time_td": Interval,
"elapsed_time": Integer,
"elapsed_time_td": Interval,
"total_elevation_gain": Float,
"type": String,
"average_speed": Float,
"avg_pace_str": String,
"avg_pace_float": Float,
"pace_zone": Float,
"max_speed": Float,
"max_pace_str": String,
"max_pace_float": Float,
"average_cadence": Float,
"device_watts": Boolean,
"average_watts": Float,
"average_heartrate": Float,
"max_heartrate": Float,
"activity_id": BigInteger,
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{LAPS_S_TABLE} will be overwritten.")

laps_df.to_sql(
    name=LAPS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=laps_df_dtype_map,
    method="multi",
    chunksize=1000
)




-9

## Best efforts Dataframe

In [33]:

best_eff_df.loc[:, "moving_time_td"]  = extract_timedelta(best_eff_df["moving_time"])
best_eff_df.loc[:, "elapsed_time_td"] = extract_timedelta(best_eff_df["elapsed_time"])

best_eff_df[["start_date_utc_dt", "start_date_local_dt", "local_timezone"]] = create_datetime_tz_cols(best_eff_df, "start_date", "start_date_local")
best_eff_df = pd.merge(best_eff_df.drop(columns="achievements"), explode_normalize_json(best_eff_df, 'achievements', 'id'), on='id', how='left')

In [34]:
best_eff_df_cols_clean = [
    'id',
    'name',
    'start_date_utc_dt',
    'start_date_local_dt',
    'local_timezone',
    'distance',
    'moving_time',
    'moving_time_td',
    'elapsed_time',
    'elapsed_time_td',
    'rank',
    'type',
    'activity_id'
]
best_eff_df = best_eff_df[best_eff_df_cols_clean]

In [35]:
best_eff_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,rank,type,activity_id
0,55038759090,400m,2024-11-22 05:54:04+00:00,2024-11-22 06:54:04+00:00,Etc/GMT-1.0,400,147,0:02:27,147,0:02:27,,,12956260994
1,55038759091,1/2 mile,2024-11-22 05:54:04+00:00,2024-11-22 06:54:04+00:00,Etc/GMT-1.0,805,301,0:05:01,301,0:05:01,,,12956260994
2,55038759092,1K,2024-11-22 05:54:04+00:00,2024-11-22 06:54:04+00:00,Etc/GMT-1.0,1000,376,0:06:16,376,0:06:16,,,12956260994
3,55038759086,1 mile,2024-11-22 05:54:04+00:00,2024-11-22 06:54:04+00:00,Etc/GMT-1.0,1609,612,0:10:12,612,0:10:12,,,12956260994
4,55038759087,2 mile,2024-11-22 05:54:04+00:00,2024-11-22 06:54:04+00:00,Etc/GMT-1.0,3219,1238,0:20:38,1238,0:20:38,,,12956260994


In [36]:
best_eff_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "start_date_utc_dt": DateTime(timezone=False),
    "start_date_local_dt": DateTime(timezone=False),
    "local_timezone": String,
    "distance": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "rank": Integer,
    "type": String,
    "activity_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE} will be overwritten.")

best_eff_df.to_sql(
    name=BEST_EFFORTS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=best_eff_df_dtype_map,
    method="multi",
    chunksize=1000
)



-4

## Kudos Dataframe

In [37]:
kudos_df['full_name'] = kudos_df[['firstname', 'lastname']].astype('string').agg(' '.join, axis=1)
kudos_df_cols_clean = [
    'firstname',
    'lastname',
    'full_name',
    'activity_id'
]
kudos_df = kudos_df[kudos_df_cols_clean]
kudos_df = kudos_df.rename(columns={'firstname': 'first_name', 'lastname': 'last_name'})

In [38]:
kudos_df.head()

Unnamed: 0,first_name,last_name,full_name,activity_id
0,Filip,C.,Filip C.,15831049874
1,Ola,≈Å.,Ola ≈Å.,15831049874
2,Mal,C.,Mal C.,15831049874
3,Agnieszka,G.,Agnieszka G.,15831049874
4,Wies≈Çawa,C.,Wies≈Çawa C.,15831049874


In [39]:
kudos_dtype_map = {
    "firs_tname": String,
    "last_name": String,
    "full_name": String,
    "activity_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{KUDOS_S_TABLE} will be overwritten.")

kudos_df.to_sql(
    name=KUDOS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=kudos_dtype_map,
    method="multi",
    chunksize=1000
)



-8

## Decoding coordinates with geopy

### Create list of unique locations

In [40]:
lat_lng_points_df = pd.concat([activities_df[['start_lat', 'start_lng']], segments_df[['start_lat', 'start_lng']]]).drop_duplicates()

In [41]:
geo = Nominatim(user_agent=USER_AGENT)
reverse = RateLimiter(geo.reverse, min_delay_seconds=1.1, max_retries=3, error_wait_seconds=5)

def cache_key(lat, lng, precision):
  """
  Create a stable cache key for a coordinate cell.

  Parameters
  ----------
  lat : float
      Latitude in decimal degrees.
  lng : float
      Longitude in decimal degrees.
  precision : int
      Number of decimal places used to format the coordinates.
      This effectively defines the grid cell size (e.g., 2 dp ‚âà city level).

  Returns
  -------
  str
      Key formatted as ``"lat,lng"`` with fixed precision, e.g. ``"51.11,17.02"``.
      Using a string avoids floating-point representation issues and works as a JSON key.
  """

  return f"{lat:.{precision}f},{lng:.{precision}f}"

def pick_locality(addr):
  """
  Select a locality (city/town/village) from a Nominatim ``address`` mapping.

  The function returns the first non-empty value in the following order of preference:
  ``city`` ‚Üí ``town`` ‚Üí ``village`` ‚Üí ``hamlet`` ‚Üí ``municipality`` ‚Üí ``locality`` ‚Üí ``county`` (fallback).

  Parameters
  ----------
  addr : Mapping[str, str]
      The ``address`` object from a Nominatim response (``loc.raw['address']``).

  Returns
  -------
  str or None
      Locality name or ``None`` if none of the keys are available.

  Notes
  -----
  Including ``county`` as the last-resort fallback may return a county name
  in places where a true locality is missing in OSM data (useful in some countries),
  but it can be more general than a town/city.
  """
  
  return (addr.get("city") or addr.get("town") or addr.get("village") or addr.get("hamlet") or addr.get("municipality") or addr.get("locality") or addr.get("county"))

def pick_region(addr):
  """
  Select a region/state from a Nominatim ``address`` mapping.

  The function returns the first non-empty value in the following order of preference:
  ``state`` ‚Üí ``region`` ‚Üí ``state_district`` ‚Üí ``province`` ‚Üí ``county`` (fallback).

  Parameters
  ----------
  addr : Mapping[str, str]
      The ``address`` object from a Nominatim response (``loc.raw['address']``).

  Returns
  -------
  str or None
      Region/administrative area name or ``None`` if not present.

  Notes
  -----
  In Poland and many countries ``state`` corresponds to the top-level region
  (e.g., voivodeship/province). ``county`` is typically a lower level and is used
  here only as a fallback for countries where counties act as primary regions.
  """
  
  return (addr.get("state") or addr.get("region") or addr.get("state_district") or addr.get("province") or addr.get("county"))

def address_fields(reverse_fn, lat, lng):
  """
  Perform reverse geocoding and extract minimal address fields.

  Parameters
  ----------
  reverse_fn : Callable
      A function compatible with ``geopy.Nominatim.reverse`` (optionally wrapped
      with ``RateLimiter``) that accepts ``(lat, lng)`` and returns a Location-like
      object with a ``.raw`` dict payload.
  lat : float
      Latitude in decimal degrees.
  lng : float
      Longitude in decimal degrees.

  Returns
  -------
  dict
      Dictionary with three keys:
      - ``locality`` : str or None ‚Äî city/town/village (best available),
      - ``region``   : str or None ‚Äî region/state/province,
      - ``country``  : str or None ‚Äî country name.

  Notes
  -----
  The function calls ``reverse_fn`` with ``language="en"`` and ``addressdetails=True``.
  Change the language parameter if localized names are desired.
  """

  loc = reverse_fn((lat, lng), language="en", addressdetails=True)
  if not loc:
      return {"locality": None, "region": None, "country": None}
  address = (loc.raw or {}).get("address", {})
  return {
      "locality": pick_locality(address),
      "region": pick_region(address),
      "country": address.get("country")
  }

def decode_coordinates(coordinates_df: pd.DataFrame, lat_col: str, lng_col: str, cache_path: str, precision: int, geo_reverse_fn) -> pd.DataFrame:
  """
  Append ``locality``, ``region``, and ``country`` to a DataFrame using reverse geocoding with caching.

  For each unique coordinate pair (rounded to ``precision`` decimal places), the function performs
  a reverse geocode via ``geo_reverse_fn`` and stores results in a JSON cache. Subsequent runs read
  from the cache to minimize API calls.

  Parameters
  ----------
  coordinates_df : pd.DataFrame
      Input DataFrame containing coordinate columns.
  lat_col : str
      Name of the latitude column (e.g., ``"start_lat"``).
  lng_col : str
      Name of the longitude column (e.g., ``"start_lng"``).
  cache_path : str
      File path to the JSON cache. The file will be created/updated as needed.
  precision : int
      Number of decimal places for rounding coordinates and building the cache key
      (e.g., 2 ‚âà city-level granularity).
  geo_reverse_fn : Callable
      Reverse geocoding function (typically a ``RateLimiter(Nominatim.reverse, ...)``).

  Returns
  -------
  pd.DataFrame
      A copy of the input DataFrame with three additional columns:
      ``locality``, ``region``, and ``country``.

  Raises
  ------
  KeyError
      If the required ``lat_col`` or ``lng_col`` is missing from ``coordinates_df``.
  json.JSONDecodeError
      If the cache file exists but contains invalid JSON.
  Exception
      Any exception propagated from the reverse geocoding function or file I/O.

  Notes
  -----
  - The function writes the cache to ``cache_path`` whenever a new key is added.
  - A small sleep (``time.sleep(0.5)``) is used per new lookup; adjust to comply with
    your provider's rate limits (public Nominatim typically requires ‚â§1 request/sec).
  - Change the language in ``address_fields`` if you need localized names.
  """

  temp_df = coordinates_df.copy()

  if lat_col not in temp_df.columns or lng_col not in temp_df.columns:
    raise KeyError(f"Missing required columns: {lat_col}, {lng_col}")
  
  temp_df['lat_round'] = temp_df[lat_col].round(precision)
  temp_df['lng_round'] = temp_df[lng_col].round(precision)
  temp_df = temp_df.dropna(subset=[lat_col, lng_col])
  lat_lng_df = temp_df[['lat_round', 'lng_round']].drop_duplicates().reset_index(drop=True)

  cache = {}
  if os.path.exists(cache_path):
    with open(cache_path, "r", encoding="utf-8") as f:
      cache = json.load(f)

  records = []
  for i, row in tqdm(lat_lng_df.iterrows(), total=lat_lng_df.shape[0]):

    key = cache_key(row['lat_round'], row['lng_round'], precision)

    if key in cache:
      resp = cache[key]

    else:
      resp = address_fields(geo_reverse_fn, row['lat_round'], row['lng_round'])
      cache[key] = resp
      time.sleep(0.5)

      with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, ensure_ascii=False, indent=2)

    records.append({"lat_round": row['lat_round'], "lng_round": row['lng_round'], **resp})

  loc_df = pd.DataFrame(records)
  
  result = temp_df.merge(loc_df, on=["lat_round","lng_round"], how="left")
  result = result.drop(columns=["lat_round","lng_round"])

  return result

In [42]:
lat_lng_points_df = decode_coordinates(lat_lng_points_df, 'start_lat', 'start_lng', CACHE_PATH, PRECISION, reverse)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1516/1516 [00:00<00:00, 90334.64it/s]


In [43]:
lat_lng_points_df.head()

Unnamed: 0,start_lat,start_lng,locality,region,country
0,51.107177,17.123797,Wroc≈Çaw,Lower Silesian Voivodeship,Poland
1,51.107162,17.123739,Wroc≈Çaw,Lower Silesian Voivodeship,Poland
2,51.107301,17.124098,Wroc≈Çaw,Lower Silesian Voivodeship,Poland
3,51.107336,17.124136,Wroc≈Çaw,Lower Silesian Voivodeship,Poland
4,51.107367,17.124221,Wroc≈Çaw,Lower Silesian Voivodeship,Poland


In [44]:
locations_df = lat_lng_points_df[['locality', 'region', 'country']].copy()
locations_df = locations_df.drop_duplicates()
locations_df = locations_df.sort_values(by=['country', 'region', 'locality']).reset_index(drop=True)
locations_df['location_id'] = 1000 + np.arange(len(locations_df))
locations_df = locations_df[['location_id', 'country', 'region', 'locality']]

In [45]:
locations_df.head()

Unnamed: 0,location_id,country,region,locality
0,1000,Austria,Lower Austria,Waidhofen an der Ybbs
1,1001,Austria,Upper Austria,Garsten
2,1002,Austria,Upper Austria,Gr√ºnburg
3,1003,Austria,Upper Austria,Linz
4,1004,Austria,Upper Austria,St. Ulrich bei Steyr


In [46]:
lat_lng_points_df = pd.merge(lat_lng_points_df, locations_df, how='left', on=['country', 'region', 'locality'])

In [47]:
lat_lng_points_df.head()

Unnamed: 0,start_lat,start_lng,locality,region,country,location_id
0,51.107177,17.123797,Wroc≈Çaw,Lower Silesian Voivodeship,Poland,1179
1,51.107162,17.123739,Wroc≈Çaw,Lower Silesian Voivodeship,Poland,1179
2,51.107301,17.124098,Wroc≈Çaw,Lower Silesian Voivodeship,Poland,1179
3,51.107336,17.124136,Wroc≈Çaw,Lower Silesian Voivodeship,Poland,1179
4,51.107367,17.124221,Wroc≈Çaw,Lower Silesian Voivodeship,Poland,1179


In [48]:
activities_df = pd.merge(activities_df, lat_lng_points_df[['start_lat', 'start_lng', 'location_id']], how='left', on=['start_lat', 'start_lng'])

In [49]:
activities_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,elev_low,elev_high,type,sport_type,workout_type,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,total_photo_count,suffer_score,description,calories,device_name,start_lat,start_lng,map_id,gear_id,location_id
0,15855640218,K200süèéÔ∏è,2025-09-18 15:30:30+00:00,2025-09-18 17:30:30+00:00,Etc/GMT-2.0,9915.6,3272,0:54:32,3314,0:55:14,10.0,115.6,122.8,Run,Run,3.0,0,9,0,1,0,False,False,False,everyone,3.03,5:30,5.50055,4.82,3:27,3.457815,164.4,365.2,546.0,382.0,True,151.7,176.0,0,0,60.0,K200s with Runna ‚úÖ\n\nKilometr√≥wki z dwusetkam...,765.0,Garmin Forerunner 970,51.107177,17.123797,a15855640218,g24134620,1179.0
1,15843349072,9km Easy RunüëΩ,2025-09-17 14:08:34+00:00,2025-09-17 16:08:34+00:00,Etc/GMT-2.0,9051.8,3095,0:51:35,3142,0:52:22,16.0,114.8,123.0,Run,Run,,0,6,0,1,0,False,False,False,everyone,2.925,5:42,5.698006,4.12,4:03,4.045307,169.4,367.5,513.0,364.0,True,145.1,152.0,0,0,41.0,9km Easy Run with Runna ‚úÖ\n\nWyjƒÖtkowo ≈ºwawe e...,699.0,Garmin Forerunner 970,51.107162,17.123739,a15843349072,g24134620,1179.0
2,15831049874,Afternoon Weight Training,2025-09-16 13:01:07+00:00,2025-09-16 15:01:07+00:00,Etc/GMT-2.0,0.0,3825,1:03:45,3825,1:03:45,0.0,0.0,0.0,Workout,WeightTraining,,0,5,1,1,0,True,False,False,followers_only,0.0,,,0.0,,,,,,,True,94.0,222.0,0,0,9.0,Reska8Ô∏è‚É£8Ô∏è‚É£\nBench press PR: 85kgü•≥,254.0,Garmin Forerunner 970,,,a15831049874,x00000000,
3,15820198827,Tempo 2kmü•µ,2025-09-15 14:23:21+00:00,2025-09-15 16:23:21+00:00,Etc/GMT-2.0,9521.9,3241,0:54:01,3241,0:54:01,13.0,115.4,125.0,Run,Run,3.0,0,8,0,1,0,False,False,False,everyone,2.938,5:40,5.672793,4.94,3:22,3.373819,167.8,353.5,493.0,369.0,True,153.1,178.0,0,0,66.0,Tempo 2km Repeats with Runna ‚úÖ\n\nWysz≈Ço troch...,735.0,Garmin Forerunner 970,51.107301,17.124098,a15820198827,g24134620,1179.0
4,15805849875,15km Long Run‚òîÔ∏è,2025-09-14 07:59:25+00:00,2025-09-14 09:59:25+00:00,Etc/GMT-2.0,15059.0,5461,1:31:01,5488,1:31:28,31.0,114.6,125.4,Run,Run,2.0,0,4,0,1,0,False,False,False,everyone,2.758,6:03,6.043026,3.44,4:51,4.844961,172.0,335.5,455.0,334.0,True,144.8,153.0,0,0,66.0,15km Long Run with Runna ‚úÖ\n\nOkrutny beton po...,1170.0,Garmin Forerunner 970,51.107336,17.124136,a15805849875,g24134620,1179.0


In [50]:
activities_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "start_date_utc_dt": DateTime(timezone=False),
    "start_date_local_dt": DateTime(timezone=False),
    "local_timezone":  String,
    "distance": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "total_elevation_gain": Float,
    "elev_low": Float,
    "elev_high": Float,
    "type": String,
    "sport_type": String,
    "workout_type": Integer,
    "achievement_count": Integer,
    "kudos_count": Integer,
    "comment_count": Integer,
    "athlete_count": Integer,
    "photo_count": Integer,
    "trainer": Boolean,
    "commute": Boolean,
    "manual": Boolean,
    "visibility": String,
    "average_speed": Float,
    "avg_pace_str": String,
    "avg_pace_float": Float,
    "max_speed": Float,
    "max_pace_str": String,
    "max_pace_float": Float,
    "average_cadence": Float,
    "average_watts": Float,
    "max_watts": Float,
    "weighted_average_watts": Float,
    "has_heartrate": Boolean,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_count": Integer,
    "total_photo_count": Integer,
    "suffer_score": Float,
    "description": Text,
    "calories": Float,
    "device_name": String,
    'start_lat' : Float,
    'start_lng' : Float,
    "map_id": String,
    "gear_id": String,
    "location_id": Integer
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE} will be overwritten.")

activities_df.to_sql(
    name=ACTIVITIES_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=activities_df_dtype_map,
    method="multi",
    chunksize=1000
)



-2

In [59]:
activities_df.dtypes

id                                      int64
name                                   object
start_date_utc_dt         datetime64[ns, UTC]
start_date_local_dt       datetime64[ns, UTC]
local_timezone                         object
distance                              float64
moving_time                             int64
moving_time_td                         object
elapsed_time                            int64
elapsed_time_td                        object
total_elevation_gain                  float64
elev_low                              float64
elev_high                             float64
type                                   object
sport_type                             object
workout_type                          float64
achievement_count                       int64
kudos_count                             int64
comment_count                           int64
athlete_count                           int64
photo_count                             int64
trainer                           

In [51]:
segments_df = pd.merge(segments_df, lat_lng_points_df[['start_lat', 'start_lng',  'location_id']], how='left', on=['start_lat', 'start_lng'])

In [52]:
segments_df.head()

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,elevation_profile,elevation_profiles,climb_category,private,hazardous,starred,start_lat,start_lng,end_lat,end_lng,location_id
0,38033619,BƒÖczek counterclockwise,Walk,3722.4,0.0,8.0,123.8,118.2,,,0,False,False,False,51.104196,17.124846,51.104209,17.124837,1179
1,17455167,Pƒôtla od ≈õluzy,Run,3886.4,0.0,10.5,120.0,112.9,,,0,False,False,False,51.104195,17.124249,51.104082,17.124274,1179
2,10082640,Most Chrobrego- most Bartoszowicki,Run,1582.0,0.0,2.5,118.2,115.5,,,0,False,False,False,51.113182,17.108164,51.102416,17.122673,1179
3,10082666,Swojczycki - Sluza revers,Run,1580.4,0.0,2.7,118.8,114.6,,,0,False,False,False,51.103481,17.12464,51.114135,17.109978,1179
4,22997595,Po kostce do Grobli,Run,456.5,0.2,7.5,117.0,116.0,,,0,False,False,False,51.104347,17.125358,51.101168,17.129389,1179


In [53]:
segments_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "activity_type": String,
    "distance": Float,
    "average_grade": Float,
    "maximum_grade": Float,
    "elevation_high": Float,
    "elevation_low": Float,
    "elevation_profile": Float,
    "elevation_profiles": Float,
    "climb_category": Float,
    "private": Boolean,
    "hazardous": Boolean,
    "starred": Boolean,
    "start_lat": Float,
    "start_lng": Float,
    "end_lat": Float,
    "end_lng": Float,
    "location_id": Integer
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE} will be overwritten.")

segments_df.to_sql(
    name=SEGMENTS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=segments_df_dtype_map,
    method="multi",
    chunksize=1000
)



-2

In [54]:
locations_df = locations_df.rename(columns={'location_id': 'id'})

In [55]:
locations_df.head()

Unnamed: 0,id,country,region,locality
0,1000,Austria,Lower Austria,Waidhofen an der Ybbs
1,1001,Austria,Upper Austria,Garsten
2,1002,Austria,Upper Austria,Gr√ºnburg
3,1003,Austria,Upper Austria,Linz
4,1004,Austria,Upper Austria,St. Ulrich bei Steyr


In [56]:
locations_df_dtype_map = {
    "id": Integer,
    "locality": String,
    "region	": String,
    "country": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};"))

logging.warning(f"Whole table {TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE} will be overwritten.")

locations_df.to_sql(
    name=LOCATIONS_S_TABLE,
    schema=TARGET_S_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=locations_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Primary keys definition

In [57]:
keys_instructions= [
    f"""CREATE SCHEMA IF NOT EXISTS {TARGET_S_SCHEMA};""",
    # ********** PRIMARY KEYS **********
    # --- activities ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE}
          ADD CONSTRAINT {ACTIVITIES_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- maps ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{MAPS_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{MAPS_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{MAPS_S_TABLE}
          ADD CONSTRAINT {MAPS_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- gear ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{GEAR_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{GEAR_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{GEAR_S_TABLE}
          ADD CONSTRAINT {GEAR_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- segments ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE}
          ADD CONSTRAINT {SEGMENTS_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- laps ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{LAPS_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{LAPS_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{LAPS_S_TABLE}
          ADD CONSTRAINT {LAPS_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- best efforts ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE}
          ADD CONSTRAINT {BEST_EFFORTS_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- locations ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE}
          ADD CONSTRAINT {LOCATIONS_S_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """
]

### Create constrains

In [58]:
with engine.begin() as conn:
    for sql in keys_instructions:
        conn.execute(text(sql))