### Import and config

In [1]:
# Imports
import os
import logging
from datetime import timedelta

from dotenv import load_dotenv

import pandas as pd
import numpy as np

from sqlalchemy import create_engine, text, Integer, Float, String, Boolean, Date, Interval, Text, BigInteger, Time, DateTime


# Configuration
load_dotenv()

# DB
DB_URI = os.getenv('DB_URI')

# Silver tables
TARGET_S_SCHEMA = os.getenv('TARGET_S_SCHEMA')
ACTIVITIES_S_TABLE = os.getenv('ACTIVITIES_S_TABLE')
LOCATIONS_S_TABLE = os.getenv('LOCATIONS_S_TABLE')
GEAR_S_TABLE = os.getenv('GEAR_S_TABLE')
SEGMENTS_S_TABLE = os.getenv('SEGMENTS_S_TABLE')
BEST_EFFORTS_S_TABLE = os.getenv('BEST_EFFORTS_S_TABLE')
SEGMENTS_EFFORTS_S_TABLE = os.getenv('SEGMENTS_EFFORTS_S_TABLE')
LAPS_S_TABLE = os.getenv('LAPS_S_TABLE')
KUDOS_S_TABLE = os.getenv('KUDOS_S_TABLE')
ZONES_S_TABLE = os.getenv('ZONES_S_TABLE')
RELATIVE_EFFORT_S_TABLE = os.getenv('RELATIVE_EFFORT_S_TABLE')
MAPS_S_TABLE = os.getenv('MAPS_S_TABLE')

# Gold tables
TARGET_G_SCHEMA = os.getenv('TARGET_G_SCHEMA')
FACT_ACTIVITIES_TABLE = os.getenv('FACT_ACTIVITIES_TABLE')
FACT_SEGMENTS_EFFORTS_TABLE = os.getenv('FACT_SEGMENTS_EFFORTS_TABLE')
FACT_BEST_EFFORTS_TABLE = os.getenv('FACT_BEST_EFFORTS_TABLE')
FACT_KUDOS_TABLE = os.getenv('FACT_KUDOS_TABLE')
FACT_LAPS_TABLE = os.getenv('FACT_LAPS_TABLE')
FACT_ZONES_TABLE = os.getenv('FACT_ZONES_TABLE')
FACT_MAPS_TABLE = os.getenv('FACT_MAPS_TABLE')

DIM_CALENDAR_TABLE = os.getenv('DIM_CALENDAR_TABLE')
DIM_TIME_TABLE = os.getenv('DIM_TIME_TABLE')
DIM_SPORT_TYPE_TABLE = os.getenv('DIM_SPORT_TYPE_TABLE')
DIM_DEVICE_TABLE = os.getenv('DIM_DEVICE_TABLE')
DIM_LOCATION_TABLE = os.getenv('DIM_LOCATION_TABLE')
DIM_GEAR_TABLE = os.getenv('DIM_GEAR_TABLE')
DIM_SEGMENT_TABLE = os.getenv('DIM_SEGMENT_TABLE')
DIM_EFFORT_TYPE_TABLE = os.getenv('DIM_EFFORT_TYPE_TABLE')
DIM_WORKOUT_TYPE_TABLE = os.getenv('DIM_WORKOUT_TYPE_TABLE')

# Other
LOG_LEVEL = os.getenv('LOG_LEVEL')

logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
    format="%(asctime)s | %(levelname)s | %(message)s"
)

pd.set_option('display.max_columns', None)

### DB names validation

In [2]:
REQUIRED_ENV = [
  'DB_URI',
  'TARGET_S_SCHEMA','TARGET_G_SCHEMA',
  'ACTIVITIES_S_TABLE','LOCATIONS_S_TABLE','GEAR_S_TABLE','SEGMENTS_S_TABLE','BEST_EFFORTS_S_TABLE','SEGMENTS_EFFORTS_S_TABLE','LAPS_S_TABLE','KUDOS_S_TABLE', 'ZONES_S_TABLE', 'RELATIVE_EFFORT_S_TABLE', 'MAPS_S_TABLE',
  'DIM_CALENDAR_TABLE','DIM_TIME_TABLE','DIM_DEVICE_TABLE','DIM_SPORT_TYPE_TABLE','DIM_LOCATION_TABLE','DIM_GEAR_TABLE','DIM_SEGMENT_TABLE','DIM_EFFORT_TYPE_TABLE','DIM_WORKOUT_TYPE_TABLE',
  'FACT_ACTIVITIES_TABLE','FACT_SEGMENTS_EFFORTS_TABLE','FACT_BEST_EFFORTS_TABLE','FACT_LAPS_TABLE','FACT_KUDOS_TABLE', 'FACT_ZONES_TABLE', 'FACT_MAPS_TABLE'
]
missing = [k for k in REQUIRED_ENV if not os.getenv(k)]
if missing:
    raise RuntimeError(f"Missing env variables: {', '.join(missing)}")


### Request data from `silver` layer

In [3]:
engine = create_engine(
  DB_URI, 
  pool_pre_ping=True, 
  pool_size=5, 
  max_overflow=10
)
logging.info("Connection established")

2025-10-10 12:06:39,614 | INFO | Connection established


In [4]:
with engine.begin() as conn:
    activities_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE} downloaded. shape={activities_df.shape}")

    locations_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE} downloaded. shape={locations_df.shape}")

    gear_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{GEAR_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{GEAR_S_TABLE} downloaded. shape={gear_df.shape}")

    segments_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE} downloaded. shape={segments_df.shape}")

    best_eff_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE} downloaded. shape={best_eff_df.shape}")

    seg_eff_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{SEGMENTS_EFFORTS_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{SEGMENTS_EFFORTS_S_TABLE} downloaded. shape={seg_eff_df.shape}")

    laps_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{LAPS_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{LAPS_S_TABLE} downloaded. shape={laps_df.shape}")

    kudos_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{KUDOS_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{KUDOS_S_TABLE} downloaded. shape={kudos_df.shape}")

    zones_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{ZONES_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{ZONES_S_TABLE} downloaded. shape={zones_df.shape}")

    relative_effort_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{RELATIVE_EFFORT_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{RELATIVE_EFFORT_S_TABLE} downloaded. shape={relative_effort_df.shape}")

    maps_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{MAPS_S_TABLE}"), conn)
    logging.info(f"Data from {TARGET_S_SCHEMA}.{RELATIVE_EFFORT_S_TABLE} downloaded. shape={relative_effort_df.shape}")


2025-10-10 12:06:39,677 | INFO | Data from silver.activities downloaded. shape=(1116, 49)
2025-10-10 12:06:39,680 | INFO | Data from silver.locations downloaded. shape=(242, 4)
2025-10-10 12:06:39,681 | INFO | Data from silver.gear downloaded. shape=(12, 4)
2025-10-10 12:06:39,727 | INFO | Data from silver.segments downloaded. shape=(1521, 19)
2025-10-10 12:06:39,748 | INFO | Data from silver.best_efforts downloaded. shape=(3327, 13)
2025-10-10 12:06:39,804 | INFO | Data from silver.segments_efforts downloaded. shape=(5933, 23)
2025-10-10 12:06:39,887 | INFO | Data from silver.laps downloaded. shape=(8405, 27)
2025-10-10 12:06:39,898 | INFO | Data from silver.kudos downloaded. shape=(7737, 4)
2025-10-10 12:06:39,921 | INFO | Data from silver.zones downloaded. shape=(7319, 6)
2025-10-10 12:06:39,923 | INFO | Data from silver.relative_effort downloaded. shape=(955, 2)
2025-10-10 12:06:40,639 | INFO | Data from silver.relative_effort downloaded. shape=(955, 2)


In [5]:
activities_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,elev_low,elev_high,type,sport_type,workout_type,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,total_photo_count,suffer_score,description,calories,device_name,start_lat,start_lng,map_id,gear_id,location_id
0,16052008939,Afternoon Weight Training,2025-10-06 15:00:37,2025-10-06 17:00:37,Etc/GMT-2.0,0.0,3616,0 days 01:00:16,3616,0 days 01:00:16,0.0,0.0,0.0,Workout,WeightTraining,,0,5,0,1,0,True,False,False,followers_only,0.0,,,0.0,,,,,,,True,95.9,156.0,0,0,8.0,Reska9Ô∏è‚É£4Ô∏è‚É£,299.0,Garmin Forerunner 970,,,a16052008939,x00000000,
1,16038944725,13km Long Run‚õ∞Ô∏è,2025-10-05 10:24:17,2025-10-05 12:24:17,Etc/GMT-2.0,13061.6,4597,0 days 01:16:37,4749,0 days 01:19:09,139.0,10.8,71.0,Run,Run,2.0,2,9,0,1,0,False,False,False,everyone,2.841,5:52,5.866479,3.74,4:27,4.456328,169.0,367.6,546.0,361.0,True,143.5,151.0,2,0,49.0,13km Long Run with Runna ‚úÖ\n\nGorz√≥w klasyczni...,1006.0,Garmin Forerunner 970,52.75577,15.22601,a16038944725,g24134620,1199.0
2,16033236291,Evening Swim,2025-10-04 18:50:53,2025-10-04 20:50:53,Etc/GMT-2.0,2000.0,2343,0 days 00:39:03,2873,0 days 00:47:53,0.0,0.0,0.0,Swim,Swim,,0,5,0,1,0,True,False,False,followers_only,0.854,,,1.8,,,,,,,True,127.8,147.0,0,0,14.0,Gorzowski klasykü•∞,539.0,Garmin Forerunner 970,,,a16033236291,x00000000,
3,16020263348,Tempo 6kmüî•,2025-10-03 14:44:47,2025-10-03 16:44:47,Etc/GMT-2.0,9416.9,2943,0 days 00:49:03,2943,0 days 00:49:03,17.0,108.4,119.6,Run,Run,3.0,7,10,1,1,0,False,False,False,everyone,3.2,5:12,5.208333,4.52,3:41,3.687316,169.2,383.8,534.0,393.0,True,152.5,169.0,7,0,57.0,Tempo 6km with Runna ‚úÖ\n\nOstatnie mocniejsze ...,710.0,Garmin Forerunner 970,51.108521,17.120467,a16020263348,g23642256,1179.0
4,16009596787,9km Easy Runüåû,2025-10-02 14:40:09,2025-10-02 16:40:09,Etc/GMT-2.0,9120.3,3078,0 days 00:51:18,3078,0 days 00:51:18,13.0,109.2,119.4,Run,Run,,0,10,1,1,0,False,False,False,everyone,2.963,5:37,5.62493,3.6,4:38,4.62963,170.0,358.2,447.0,357.0,True,145.3,153.0,0,0,40.0,9km Easy Run with Runna ‚úÖ\n\nStandardowe klepa...,706.0,Garmin Forerunner 970,51.107356,17.124201,a16009596787,g24134620,1179.0


In [6]:
locations_df.head()

Unnamed: 0,id,country,region,locality
0,1000,Austria,Lower Austria,Waidhofen an der Ybbs
1,1001,Austria,Upper Austria,Garsten
2,1002,Austria,Upper Austria,Gr√ºnburg
3,1003,Austria,Upper Austria,Linz
4,1004,Austria,Upper Austria,St. Ulrich bei Steyr


In [7]:
gear_df.head()

Unnamed: 0,id,name,distance_m,distance_km
0,x00000000,No gear,0.0,0.0
1,g24134620,ASICS Novablast 5,415689.0,415.7
2,g23642256,Adidas EVO SL,175449.0,175.4
3,b12572672,Cube Nuroad Pro,3788632.0,3788.6
4,g19800575,Nike Invincible Run 3 White,430102.0,430.1


In [8]:
segments_df.head()

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,elevation_profile,elevation_profiles,climb_category,private,hazardous,starred,start_lat,start_lng,end_lat,end_lng,location_id
0,38033619,BƒÖczek counterclockwise,Walk,3722.4,0.0,8.0,123.8,118.2,,,0.0,False,False,False,51.104196,17.124846,51.104209,17.124837,1179
1,17455167,Pƒôtla od ≈õluzy,Run,3886.4,0.0,10.5,120.0,112.9,,,0.0,False,False,False,51.104195,17.124249,51.104082,17.124274,1179
2,10082640,Most Chrobrego- most Bartoszowicki,Run,1582.0,0.0,2.5,118.2,115.5,,,0.0,False,False,False,51.113182,17.108164,51.102416,17.122673,1179
3,10082666,Swojczycki - Sluza revers,Run,1580.4,0.0,2.7,118.8,114.6,,,0.0,False,False,False,51.103481,17.12464,51.114135,17.109978,1179
4,22997595,Po kostce do Grobli,Run,456.5,0.2,7.5,117.0,116.0,,,0.0,False,False,False,51.104347,17.125358,51.101168,17.129389,1179


In [9]:
best_eff_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,rank,type,activity_id
0,55038759090,400m,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,400.0,147,0 days 00:02:27,147,0 days 00:02:27,,,12956260994
1,55038759091,1/2 mile,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,805.0,301,0 days 00:05:01,301,0 days 00:05:01,,,12956260994
2,55038759092,1K,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,1000.0,376,0 days 00:06:16,376,0 days 00:06:16,,,12956260994
3,55038759086,1 mile,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,1609.0,612,0 days 00:10:12,612,0 days 00:10:12,,,12956260994
4,55038759087,2 mile,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,3219.0,1238,0 days 00:20:38,1238,0 days 00:20:38,,,12956260994


In [10]:
seg_eff_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,pr_rank,visibility,kom_rank,hidden,rank,type,activity_id,segment_id
0,3343296484852327920,BƒÖczek counterclockwise,2025-04-04 18:45:56,2025-04-04 20:45:56,Etc/GMT-2.0,3722.4,2692,0 days 00:44:52,2694,0 days 00:44:54,52.3,False,,94.9,107.0,,followers_only,,False,,,14080030310,38033619
1,3294936864762577520,Pƒôtla od ≈õluzy,2024-11-22 06:57:15,2024-11-22 07:57:15,Etc/GMT-1.0,3886.4,1449,0 days 00:24:09,1449,0 days 00:24:09,165.6,True,291.4,148.4,157.0,,everyone,,False,,,12956260994,17455167
2,3294936864763706992,Most Chrobrego- most Bartoszowicki,2024-11-22 07:09:05,2024-11-22 08:09:05,Etc/GMT-1.0,1582.0,624,0 days 00:10:24,624,0 days 00:10:24,166.2,True,285.2,149.4,155.0,,everyone,,False,,,12956260994,10082640
3,3294936864763212400,Swojczycki - Sluza revers,2024-11-22 07:20:54,2024-11-22 08:20:54,Etc/GMT-1.0,1580.4,631,0 days 00:10:31,631,0 days 00:10:31,165.8,True,293.3,150.9,156.0,,everyone,,False,,,12956260994,10082666
4,3294936864765383280,Pƒôtla od ≈õluzy,2024-11-22 07:21:28,2024-11-22 08:21:28,Etc/GMT-1.0,3886.4,1444,0 days 00:24:04,1444,0 days 00:24:04,165.6,True,292.3,151.8,157.0,,everyone,,False,,,12956260994,17455167


In [11]:
laps_df.head()

Unnamed: 0,id,name,lap_index,split,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,type,average_speed,avg_pace_str,avg_pace_float,pace_zone,max_speed,max_pace_str,max_pace_float,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,activity_id
0,49980673377,Lap 1,1,1,2025-04-05 11:39:40,2025-04-05 13:39:40,Etc/GMT-2.0,0.0,6066,0 days 01:41:06,6066,0 days 01:41:06,0.0,WeightTraining,0.0,,,,0.0,,,,False,,106.9,149.0,14086094444
1,49956432295,Lap 1,1,1,2025-04-04 18:37:40,2025-04-04 20:37:40,Etc/GMT-2.0,1000.0,811,0 days 00:13:31,811,0 days 00:13:31,8.4,Walk,1.23,,,,3.4,,,51.7,False,,94.8,107.0,14080030310
2,49956432299,Lap 2,2,2,2025-04-04 18:37:40,2025-04-04 20:37:40,Etc/GMT-2.0,1000.0,745,0 days 00:12:25,745,0 days 00:12:25,3.8,Walk,1.34,,,,1.814,,,52.1,False,,95.0,107.0,14080030310
3,49956432304,Lap 3,3,3,2025-04-04 18:37:40,2025-04-04 20:37:40,Etc/GMT-2.0,1000.0,704,0 days 00:11:44,704,0 days 00:11:44,3.4,Walk,1.42,,,,1.709,,,52.9,False,,95.4,106.0,14080030310
4,49956432306,Lap 4,4,4,2025-04-04 18:37:40,2025-04-04 20:37:40,Etc/GMT-2.0,1000.0,720,0 days 00:12:00,720,0 days 00:12:00,0.0,Walk,1.39,,,,1.791,,,52.1,False,,94.1,100.0,14080030310


In [12]:
kudos_df.head()

Unnamed: 0,first_name,last_name,full_name,activity_id
0,Kacper,G.,Kacper G.,15716821076
1,Jan,K.,Jan K.,15716821076
2,Jacek,S.,Jacek S.,15716821076
3,Ola,≈Å.,Ola ≈Å.,15716821076
4,Kacper,K.,Kacper K.,15716821076


In [13]:
zones_df.head()

Unnamed: 0,id,activity_id,type,zone_number,zone_name,time
0,15923268347-heartrate-1,15923268347,heartrate,1,Z1 - Recovery,73.0
1,15923268347-heartrate-2,15923268347,heartrate,2,Z2 - Endurance,1706.0
2,15923268347-heartrate-3,15923268347,heartrate,3,Z3 - Tempo,1290.0
3,15923268347-heartrate-4,15923268347,heartrate,4,Z4 - Threshold,0.0
4,15923268347-heartrate-5,15923268347,heartrate,5,Z5 - Anaerobic,0.0


In [14]:
relative_effort_df.head()

Unnamed: 0,activity_id,relative_effort
0,15923268347,39.0
1,8254517069,17.0
2,8252893698,13.0
3,8247520568,23.0
4,8239641985,16.0


In [15]:
maps_df.head()

Unnamed: 0,id,point_id,lat,lng
0,a14086094444,0,,
1,a14080030310,0,51.10761,17.12408
2,a14080030310,1,51.10761,17.12406
3,a14080030310,2,51.10756,17.12405
4,a14080030310,3,51.10744,17.12399


### Setup `gold.dim_calendar`

In [16]:
dim_calendar_df = pd.DataFrame(
  {'date': pd.date_range(
    activities_df['start_date_local_dt'].dt.date.min(), 
    pd.Timestamp('today').normalize(), 
    freq="D")}
)
dim_calendar_df = dim_calendar_df.sort_values(by='date', ascending=False).reset_index(drop=True)

In [17]:
#Year
dim_calendar_df['year'] = dim_calendar_df['date'].dt.year
dim_calendar_df['year_start_date'] = dim_calendar_df['date'].dt.to_period('Y').dt.start_time

# Month
dim_calendar_df['month'] = dim_calendar_df['date'].dt.month
dim_calendar_df['month_year'] = dim_calendar_df['date'].dt.to_period('M').astype('str')
dim_calendar_df['month_start_date'] = dim_calendar_df['date'].dt.to_period('M').dt.start_time
dim_calendar_df['month_name_year'] = dim_calendar_df['date'].dt.strftime('%b-%Y')
dim_calendar_df['month_name'] = dim_calendar_df['date'].dt.strftime('%B') 

# Week
dim_calendar_df['week'] = dim_calendar_df['date'].dt.isocalendar().week
dim_calendar_df['week_start_date'] = dim_calendar_df['date'].dt.to_period('W').dt.start_time

# Day
dim_calendar_df['day'] = dim_calendar_df['date'].dt.day
dim_calendar_df['day_of_year'] = dim_calendar_df['date'].dt.day_of_year
dim_calendar_df['day_of_week'] = dim_calendar_df['date'].dt.weekday + 1
dim_calendar_df['day_of_week_name'] = dim_calendar_df['date'].dt.day_name()
dim_calendar_df['is_weekend'] = dim_calendar_df['date'].dt.weekday + 1 >= 6

In [18]:
dim_calendar_df.head()

Unnamed: 0,date,year,year_start_date,month,month_year,month_start_date,month_name_year,month_name,week,week_start_date,day,day_of_year,day_of_week,day_of_week_name,is_weekend
0,2025-10-10,2025,2025-01-01,10,2025-10,2025-10-01,Oct-2025,October,41,2025-10-06,10,283,5,Friday,False
1,2025-10-09,2025,2025-01-01,10,2025-10,2025-10-01,Oct-2025,October,41,2025-10-06,9,282,4,Thursday,False
2,2025-10-08,2025,2025-01-01,10,2025-10,2025-10-01,Oct-2025,October,41,2025-10-06,8,281,3,Wednesday,False
3,2025-10-07,2025,2025-01-01,10,2025-10,2025-10-01,Oct-2025,October,41,2025-10-06,7,280,2,Tuesday,False
4,2025-10-06,2025,2025-01-01,10,2025-10,2025-10-01,Oct-2025,October,41,2025-10-06,6,279,1,Monday,False


In [19]:
dim_calendar_df_dtype_map = {
    "date": Date,
    "year": Integer,
    "year_start_date": Date,
    "month": Integer,
    "month_year": String,
    "month_start_date": Date,
    "month_name_year": String,
    "month_name": String,
    "week": Integer,
    "week_start_date": Date,
    "day": Integer,
    "day_of_year": Integer,
    "day_of_week": Integer,
    "day_of_week_name": String,
    "is_weekend": Boolean
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_CALENDAR_TABLE} will be overwritten.")

dim_calendar_df.to_sql(
    name=DIM_CALENDAR_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_calendar_df_dtype_map,
    method="multi",
    chunksize=1000
)



-4

### Setup `gold.dim_time`

In [20]:
times = pd.date_range("1900-01-01 00:00:00", "1900-01-01 23:59:59", freq="s")
dim_time_df = pd.DataFrame({"datetime": times})

In [21]:
dim_time_df['time'] = dim_time_df['datetime'].dt.time
dim_time_df['hour'] = dim_time_df['datetime'].dt.hour
dim_time_df['minute'] = dim_time_df['datetime'].dt.minute
dim_time_df['second'] = dim_time_df['datetime'].dt.second
dim_time_df['hour_minute'] = dim_time_df['datetime'].dt.strftime("%H:%M")
dim_time_df['hour_label'] = dim_time_df['datetime'].dt.strftime("%H:00")
dim_time_df['day_part'] = pd.cut(dim_time_df['hour'], 
                                 bins=[-1,5,11,13,17,23],
                                 labels=['Night','Morning','Lunch','Afternoon','Evening'],
                                 include_lowest=True)
dim_time_df['day_part_number'] = pd.cut(dim_time_df['hour'], 
                                 bins=[-1,5,11,13,17,23],
                                 labels=[5, 1, 2, 3, 4],
                                 include_lowest=True)
dim_time_df = dim_time_df.drop(columns='datetime')

In [22]:
dim_time_df.head()

Unnamed: 0,time,hour,minute,second,hour_minute,hour_label,day_part,day_part_number
0,00:00:00,0,0,0,00:00,00:00,Night,5
1,00:00:01,0,0,1,00:00,00:00,Night,5
2,00:00:02,0,0,2,00:00,00:00,Night,5
3,00:00:03,0,0,3,00:00,00:00,Night,5
4,00:00:04,0,0,4,00:00,00:00,Night,5


In [23]:
dim_time_df_dtype_map = {
    "time": Time,
    "hour": Integer,
    "minute": Integer,
    "second": Integer,
    "hour_minute": String,
    "hour_label": String,
    "day_part": String,
    "day_part_number": Integer
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_TIME_TABLE} will be overwritten.")

dim_time_df.to_sql(
    name=DIM_TIME_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_time_df_dtype_map,
    method="multi",
    chunksize=1000
)



-87

In [24]:
activities_df['sport_type'].value_counts()

sport_type
Run               547
Walk              239
Ride              123
WeightTraining    105
Swim               35
VirtualRide        21
Squash             14
Yoga               14
Workout             8
RockClimbing        5
Hike                3
Soccer              1
Rowing              1
Name: count, dtype: int64

### Setup `gold.dim_sport_type`

In [25]:
dim_sport_type_df = (activities_df['sport_type']
                     .copy()
                     .drop_duplicates()
                     .to_frame()
                     .reset_index(drop=True))
dim_sport_type_df['sport_type_id'] = 1000 + np.arange(len(dim_sport_type_df))

In [26]:
dim_sport_type_df = dim_sport_type_df[['sport_type_id', 'sport_type']]
dim_sport_type_df = dim_sport_type_df.rename(columns={'sport_type_id': 'id'})

In [27]:
dim_sport_type_df['sport_type_summary'] = np.where(
    dim_sport_type_df['sport_type'].isin(['Ride', 'Run', 'Swim']),
    dim_sport_type_df['sport_type'],
    'Other'
)
mapping = {
    'Run': 1,
    'Ride': 2,
    'Swim': 3,
    'Other': 4
}

dim_sport_type_df['sport_type_summary_number'] = (
    dim_sport_type_df['sport_type_summary'].map(mapping)
)

In [28]:
dim_sport_type_df.head()

Unnamed: 0,id,sport_type,sport_type_summary,sport_type_summary_number
0,1000,WeightTraining,Other,4
1,1001,Run,Run,1
2,1002,Swim,Swim,3
3,1003,Walk,Other,4
4,1004,Ride,Ride,2


In [29]:
dim_sport_type_df_dtype_map = {
    "id": Integer,
    "sport_type": String,
    "sport_type_summary": String,
    "sport_type_summary_number": Integer
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_SPORT_TYPE_TABLE} will be overwritten.")

dim_sport_type_df.to_sql(
    name=DIM_SPORT_TYPE_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_sport_type_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_device`

In [30]:
dim_device_df = (activities_df['device_name']
                     .copy()
                     .drop_duplicates()
                     .to_frame()
                     .reset_index(drop=True))
dim_device_df['device_id'] = 1000 + np.arange(len(dim_device_df))

In [31]:
dim_device_df = dim_device_df[['device_id', 'device_name']]
dim_device_df = dim_device_df.rename(columns={'device_id': 'id'})
dim_device_df['device_name'] = dim_device_df['device_name'].fillna('No device')

In [32]:
dim_device_df.head()

Unnamed: 0,id,device_name
0,1000,Garmin Forerunner 970
1,1001,Garmin Edge 840
2,1002,Apple Watch SE
3,1003,No device
4,1004,Garmin Forerunner 945


In [33]:
dim_device_df_dtype_map = {
    "id": Integer,
    "device_name": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_DEVICE_TABLE} will be overwritten.")

dim_device_df.to_sql(
    name=DIM_DEVICE_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_device_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_location`

In [34]:
dim_location_df = locations_df.copy()

In [35]:
dim_location_df.head()

Unnamed: 0,id,country,region,locality
0,1000,Austria,Lower Austria,Waidhofen an der Ybbs
1,1001,Austria,Upper Austria,Garsten
2,1002,Austria,Upper Austria,Gr√ºnburg
3,1003,Austria,Upper Austria,Linz
4,1004,Austria,Upper Austria,St. Ulrich bei Steyr


In [36]:
dim_location_df_dtype_map = {
    "id": Integer,
    "locality": String,
    "region": String,
    "country": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_LOCATION_TABLE} will be overwritten.")

dim_location_df.to_sql(
    name=DIM_LOCATION_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_location_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_gear`

In [37]:
dim_gear_df = gear_df.copy()

In [38]:
dim_gear_df = dim_gear_df.sort_values(by='id')
dim_gear_df = dim_gear_df.reset_index(drop=True)
dim_gear_df['gear_type'] = dim_gear_df['id'].str[0].str.lower().map({'b': 'Bike', 'g': 'Shoes'}).fillna('Other')


In [39]:
retired_gear = [
  "Adidas Ultraboost 19",
  "New Balance 1080 v12",
  "Nike Invincible Run 3 Black",
  "Nike Invincible Run 3 Blueprint",
  "Nike Invincible Run 3 White",
  "Nike Pegasus 38",
  "Nike Zoom Fly 4"
]

dim_gear_df['status'] = np.where(
    dim_gear_df['name'].isin(retired_gear),
    "Retired",
    "Active"
)

In [40]:
dim_gear_df.head()

Unnamed: 0,id,name,distance_m,distance_km,gear_type,status
0,b12572672,Cube Nuroad Pro,3788632.0,3788.6,Bike,Active
1,b13100260,Cube Nuroad Pro Wirtualnie,520771.0,520.8,Bike,Active
2,g11165677,New Balance 1080 v12,1194617.0,1194.6,Shoes,Retired
3,g11783267,Nike Zoom Fly 4,272798.0,272.8,Shoes,Retired
4,g17673165,Nike Invincible Run 3 Black,666029.0,666.0,Shoes,Retired


In [41]:
dim_gear_df_dtype_map = {
    "id": String,
    "name": String,
    "distance_m": Float,
    "distance_km": Float,
    "gear_type": String,
    "status": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_GEAR_TABLE} will be overwritten.")

dim_gear_df.to_sql(
    name=DIM_GEAR_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_gear_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_segment`

In [42]:
dim_segment_df = segments_df.copy()

In [43]:
dim_segment_cols_clean = [
  'id',
  'name',
  'activity_type',
  'distance',
  'average_grade',
  'maximum_grade',
  'elevation_high',
  'elevation_low',
  'climb_category',
  'location_id'
]

In [44]:
dim_segment_df = dim_segment_df[dim_segment_cols_clean]
dim_segment_df = dim_segment_df.sort_values(by='id')
dim_segment_df = dim_segment_df.reset_index(drop=True)

In [45]:
dim_segment_df.head()

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,climb_category,location_id
0,1094192,Opatowitz 2and1/2m,Ride,294.308,1.6,2.7,120.7,115.8,0.0,1179
1,1137415,Passeig De Garcia F√†ria Climb,Run,361.742,1.0,18.1,17.2,-1.3,0.0,1231
2,1137416,Carretera De Montju√Øc Climb,Run,439.109,2.8,8.1,19.2,6.9,0.0,1231
3,1332451,Pasikurowice,Ride,11505.7,0.2,3.6,134.6,111.0,0.0,1179
4,1616501,Legnicka - Kleci≈Ñska,Ride,1481.65,0.4,2.9,121.0,113.3,0.0,1179


In [46]:
dim_segment_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "activity_type": String,
    "distance": Float,
    "average_grade": Float,
    "maximum_grade": Float,
    "elevation_high": Float,
    "elevation_low": Float,
    "climb_category": Float,
    "location_id": Integer
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_SEGMENT_TABLE} will be overwritten.")

dim_segment_df.to_sql(
    name=DIM_SEGMENT_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_segment_df_dtype_map,
    method="multi",
    chunksize=1000
)



-2

### Setup `gold.dim_effort_type`

In [47]:
dim_effort_type_df = (best_eff_df['name']
                     .copy()
                     .drop_duplicates()
                     .to_frame()
                     .reset_index(drop=True))
dim_effort_type_df['id'] = 1000 + np.arange(len(dim_effort_type_df))

In [48]:
dim_effort_type_df = dim_effort_type_df[['id', 'name']]

In [49]:
dim_effort_type_df.head()

Unnamed: 0,id,name
0,1000,400m
1,1001,1/2 mile
2,1002,1K
3,1003,1 mile
4,1004,2 mile


In [50]:
dim_effort_type_df_dtype_map = {
    "id": Integer,
    "name": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_EFFORT_TYPE_TABLE} will be overwritten.")

dim_effort_type_df.to_sql(
    name=DIM_EFFORT_TYPE_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_effort_type_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_workout_type`

In [51]:
def extract_timedelta(time: pd.Series) -> pd.Series:
  """
  Convert a Series of numeric values (seconds) into timedeltas.

  Parameters
  ----------
  time : pd.Series
      Series containing durations expressed in seconds (int/float). 
      Null values are preserved as None.

  Returns
  -------
  pd.Series
      Series of Python ``datetime.timedelta`` objects. 
      Each element corresponds to the given number of seconds or None if missing.
  """

  return pd.Series([(timedelta(seconds=int(t)) if pd.notnull(t) else None) for t in time], dtype="object")

In [52]:
dim_workout_type_df = pd.DataFrame([
    {"id": 0.0, "type": "Run - General"},
    {"id": 1.0, "type": "Run - Race"},
    {"id": 2.0, "type": "Run - Long Run"},
    {"id": 3.0, "type": "Run - Workout"},
    {"id": 10.0, "type": "Ride - General"},
    {"id": 11.0, "type": "Ride - Race"},
    {"id": 12.0, "type": "Ride - Workout"},
    {"id": 20.0, "type": "Other"}
])

In [53]:
dim_workout_type_df.head()

Unnamed: 0,id,type
0,0.0,Run - General
1,1.0,Run - Race
2,2.0,Run - Long Run
3,3.0,Run - Workout
4,10.0,Ride - General


In [54]:
dim_workout_type_df_dtype_map = {
    "id": Integer,
    "type": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_WORKOUT_TYPE_TABLE} will be overwritten.")

dim_workout_type_df.to_sql(
    name=DIM_WORKOUT_TYPE_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_workout_type_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.fact_activities`

In [55]:
def extract_timedelta(time: pd.Series) -> pd.Series:
  """
  Convert a Series of numeric values (seconds) into timedeltas.

  Parameters
  ----------
  time : pd.Series
      Series containing durations expressed in seconds (int/float). 
      Null values are preserved as None.

  Returns
  -------
  pd.Series
      Series of Python ``datetime.timedelta`` objects. 
      Each element corresponds to the given number of seconds or None if missing.
  """

  return pd.Series([(timedelta(seconds=int(t)) if pd.notnull(t) else None) for t in time], dtype="object")

In [56]:
fact_activities_df = activities_df.copy()

In [57]:
fact_activities_df = fact_activities_df.drop(columns=['moving_time_td', 'elapsed_time_td'])
fact_activities_df.loc[:, "moving_time_td"]  = extract_timedelta(fact_activities_df["moving_time"])
fact_activities_df.loc[:, "elapsed_time_td"] = extract_timedelta(fact_activities_df["elapsed_time"])

In [58]:
fact_activities_df = pd.merge(fact_activities_df, dim_sport_type_df, how='left', on='sport_type')
fact_activities_df = fact_activities_df.rename(columns={'id_y': 'sport_type_id', 'id_x': 'id'})
fact_activities_df = pd.merge(fact_activities_df, dim_device_df, how='left', on='device_name')
fact_activities_df = fact_activities_df.rename(columns={'id_y': 'device_id', 'id_x': 'id'})
fact_activities_df = pd.merge(fact_activities_df, relative_effort_df, how='left', left_on='id', right_on='activity_id')


In [59]:
fact_activities_df['date'] = fact_activities_df['start_date_local_dt'].dt.date
fact_activities_df['time'] = fact_activities_df['start_date_local_dt'].dt.time

In [60]:
fact_activities_df = fact_activities_df.rename(columns={"start_date_local_dt": "datetime"})

In [61]:
fact_activities_df['workout_type'] = fact_activities_df.apply(
    lambda r: 0 if pd.isna(r['workout_type']) and r['sport_type'] == 'Run'
              else 20 if pd.isna(r['workout_type'])
              else r['workout_type'],
    axis=1
)
fact_activities_df = fact_activities_df.rename(columns={'workout_type': 'workout_type_id'})

In [62]:
fact_activities_cols_clean = [
  'id',
  'name',
  'description',
  'date',
  'time',
  'datetime',
  'relative_effort',
  'distance',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'total_elevation_gain',
  'achievement_count',
  'kudos_count',
  'comment_count',
  'athlete_count',
  'photo_count',
  'commute',
  'manual',
  'visibility',
  'average_speed',
  'avg_pace_str',
  'avg_pace_float',
  'max_speed',
  'max_pace_str',
  'max_pace_float',
  'average_cadence',
  'average_watts',
  'max_watts',
  'weighted_average_watts',
  'has_heartrate',
  'average_heartrate',
  'max_heartrate',
  'pr_count',
  'suffer_score',
  'calories',
  'map_id',
  'gear_id',
  'location_id',
  'sport_type_id',
  'device_id',
  'workout_type_id'
]
fact_activities_df = fact_activities_df[fact_activities_cols_clean]

In [63]:
fact_activities_df.head()

Unnamed: 0,id,name,description,date,time,datetime,relative_effort,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,achievement_count,kudos_count,comment_count,athlete_count,photo_count,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,suffer_score,calories,map_id,gear_id,location_id,sport_type_id,device_id,workout_type_id
0,16052008939,Afternoon Weight Training,Reska9Ô∏è‚É£4Ô∏è‚É£,2025-10-06,17:00:37,2025-10-06 17:00:37,8.0,0.0,3616,1:00:16,3616,1:00:16,0.0,0,5,0,1,0,False,False,followers_only,0.0,,,0.0,,,,,,,True,95.9,156.0,0,8.0,299.0,a16052008939,x00000000,,1000,1000.0,20.0
1,16038944725,13km Long Run‚õ∞Ô∏è,13km Long Run with Runna ‚úÖ\n\nGorz√≥w klasyczni...,2025-10-05,12:24:17,2025-10-05 12:24:17,49.0,13061.6,4597,1:16:37,4749,1:19:09,139.0,2,9,0,1,0,False,False,everyone,2.841,5:52,5.866479,3.74,4:27,4.456328,169.0,367.6,546.0,361.0,True,143.5,151.0,2,49.0,1006.0,a16038944725,g24134620,1199.0,1001,1000.0,2.0
2,16033236291,Evening Swim,Gorzowski klasykü•∞,2025-10-04,20:50:53,2025-10-04 20:50:53,14.0,2000.0,2343,0:39:03,2873,0:47:53,0.0,0,5,0,1,0,False,False,followers_only,0.854,,,1.8,,,,,,,True,127.8,147.0,0,14.0,539.0,a16033236291,x00000000,,1002,1000.0,20.0
3,16020263348,Tempo 6kmüî•,Tempo 6km with Runna ‚úÖ\n\nOstatnie mocniejsze ...,2025-10-03,16:44:47,2025-10-03 16:44:47,57.0,9416.9,2943,0:49:03,2943,0:49:03,17.0,7,10,1,1,0,False,False,everyone,3.2,5:12,5.208333,4.52,3:41,3.687316,169.2,383.8,534.0,393.0,True,152.5,169.0,7,57.0,710.0,a16020263348,g23642256,1179.0,1001,1000.0,3.0
4,16009596787,9km Easy Runüåû,9km Easy Run with Runna ‚úÖ\n\nStandardowe klepa...,2025-10-02,16:40:09,2025-10-02 16:40:09,40.0,9120.3,3078,0:51:18,3078,0:51:18,13.0,0,10,1,1,0,False,False,everyone,2.963,5:37,5.62493,3.6,4:38,4.62963,170.0,358.2,447.0,357.0,True,145.3,153.0,0,40.0,706.0,a16009596787,g24134620,1179.0,1001,1000.0,0.0


In [64]:
fact_activities_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "description": Text,
    "date": Date,
    "time": Time,
    "datetime": DateTime,
    "relative_effort": Float,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "total_elevation_gain": Float,
    "achievement_count": Integer,
    "kudos_count": Integer,
    "comment_count": Integer,
    "athlete_count": Integer,
    "photo_count": Integer,
    "commute": Boolean,
    "manual": Boolean,
    "visibility": String,
    "average_speed": Float,
    "avg_pace_str": String,
    "avg_pace_float": Float,
    "max_speed": Float,
    "max_pace_str": String,
    "max_pace_float": Float,
    "average_cadence": Float,
    "average_watts": Float,
    "max_watts": Float,
    "weighted_average_watts": Float,
    "has_heartrate": Boolean,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_count": Integer,
    "suffer_score": Float,
    "calories": Float,
    "gear_id": String,
    "location_id": Integer,
    "sport_type_id": Integer,
    "device_id": Integer,
    "workout_type_id": Integer
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_ACTIVITIES_TABLE} will be overwritten.")

fact_activities_df.to_sql(
    name=FACT_ACTIVITIES_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_activities_df_dtype_map,
    method="multi",
    chunksize=1000
)



-2

### Setup `gold.fact_segment_efforts`

In [65]:
fact_segment_efforts_df = seg_eff_df.copy()

In [66]:
fact_segment_efforts_df = fact_segment_efforts_df.drop(columns=['moving_time_td', 'elapsed_time_td'])
fact_segment_efforts_df.loc[:, "moving_time_td"]  = extract_timedelta(fact_segment_efforts_df["moving_time"])
fact_segment_efforts_df.loc[:, "elapsed_time_td"] = extract_timedelta(fact_segment_efforts_df["elapsed_time"])

In [67]:
fact_segment_efforts_df['date'] = fact_segment_efforts_df['start_date_local_dt'].dt.date
fact_segment_efforts_df['time'] = fact_segment_efforts_df['start_date_local_dt'].dt.time

In [68]:
fact_segment_efforts_cols_clean = [
  'id',
  'date',
  'time',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'average_cadence',
  'device_watts',
  'average_watts',
  'average_heartrate',
  'max_heartrate',
  'pr_rank',
  'visibility',
  'kom_rank',
  'rank',
  'type',
  'segment_id',
  'activity_id'
]
fact_segment_efforts_df = fact_segment_efforts_df[fact_segment_efforts_cols_clean]

In [69]:
fact_segment_efforts_df.head()

Unnamed: 0,id,date,time,moving_time,moving_time_td,elapsed_time,elapsed_time_td,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,pr_rank,visibility,kom_rank,rank,type,segment_id,activity_id
0,3343296484852327920,2025-04-04,20:45:56,2692,0:44:52,2694,0:44:54,52.3,False,,94.9,107.0,,followers_only,,,,38033619,14080030310
1,3294936864762577520,2024-11-22,07:57:15,1449,0:24:09,1449,0:24:09,165.6,True,291.4,148.4,157.0,,everyone,,,,17455167,12956260994
2,3294936864763706992,2024-11-22,08:09:05,624,0:10:24,624,0:10:24,166.2,True,285.2,149.4,155.0,,everyone,,,,10082640,12956260994
3,3294936864763212400,2024-11-22,08:20:54,631,0:10:31,631,0:10:31,165.8,True,293.3,150.9,156.0,,everyone,,,,10082666,12956260994
4,3294936864765383280,2024-11-22,08:21:28,1444,0:24:04,1444,0:24:04,165.6,True,292.3,151.8,157.0,,everyone,,,,17455167,12956260994


In [70]:
fact_segment_efforts_df_dtype_map = {
    "id": BigInteger,
    "date": Date,
    "time": Time,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "average_cadence": Float,
    "device_watts": Boolean,
    "average_watts": Float,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_rank": Integer,
    "visibility": String,
    "kom_rank": Integer,
    "rank": Integer,
    "type": String,
    "activity_id": BigInteger,
    "segment_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_SEGMENTS_EFFORTS_TABLE} will be overwritten.")

fact_segment_efforts_df.to_sql(
    name=FACT_SEGMENTS_EFFORTS_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_segment_efforts_df_dtype_map,
    method="multi",
    chunksize=1000
)



-6

### Setup `gold.fact_best_efforts`

In [71]:
fact_best_efforts_df = best_eff_df.copy()

In [72]:
fact_best_efforts_df = pd.merge(fact_best_efforts_df, dim_effort_type_df, how='left', on='name')
fact_best_efforts_df = fact_best_efforts_df.rename(columns={'id_y': 'effort_type_id', 'id_x': 'id'})

In [73]:
fact_best_efforts_df = fact_best_efforts_df.drop(columns=['moving_time_td', 'elapsed_time_td'])
fact_best_efforts_df.loc[:, "moving_time_td"]  = extract_timedelta(fact_best_efforts_df["moving_time"])
fact_best_efforts_df.loc[:, "elapsed_time_td"] = extract_timedelta(fact_best_efforts_df["elapsed_time"])

In [74]:
fact_best_efforts_df['date'] = fact_best_efforts_df['start_date_local_dt'].dt.date
fact_best_efforts_df['time'] = fact_best_efforts_df['start_date_local_dt'].dt.time

In [75]:
fact_best_efforts_cols_clean = [
  'id',
  'date',
  'time',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'rank',
  'type',
  'effort_type_id',
  'activity_id'
]
fact_best_efforts_df = fact_best_efforts_df[fact_best_efforts_cols_clean]

In [76]:
fact_best_efforts_df.head()

Unnamed: 0,id,date,time,moving_time,moving_time_td,elapsed_time,elapsed_time_td,rank,type,effort_type_id,activity_id
0,55038759090,2024-11-22,07:54:04,147,0:02:27,147,0:02:27,,,1000,12956260994
1,55038759091,2024-11-22,07:54:04,301,0:05:01,301,0:05:01,,,1001,12956260994
2,55038759092,2024-11-22,07:54:04,376,0:06:16,376,0:06:16,,,1002,12956260994
3,55038759086,2024-11-22,07:54:04,612,0:10:12,612,0:10:12,,,1003,12956260994
4,55038759087,2024-11-22,07:54:04,1238,0:20:38,1238,0:20:38,,,1004,12956260994


In [77]:
fact_best_efforts_df_dtype_map = {
    "id": BigInteger,
    "date": Date,
    "time": Time,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "rank": Integer,
    "type": String,
    "effort_type_id": BigInteger,
    "activity_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_BEST_EFFORTS_TABLE} will be overwritten.")

fact_best_efforts_df.to_sql(
    name=FACT_BEST_EFFORTS_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_best_efforts_df_dtype_map,
    method="multi",
    chunksize=1000
)



-4

### Setup `gold.fact_kudos`

In [78]:
fact_kudos_df = kudos_df.copy()

In [79]:
fact_kudos_df.head()

Unnamed: 0,first_name,last_name,full_name,activity_id
0,Kacper,G.,Kacper G.,15716821076
1,Jan,K.,Jan K.,15716821076
2,Jacek,S.,Jacek S.,15716821076
3,Ola,≈Å.,Ola ≈Å.,15716821076
4,Kacper,K.,Kacper K.,15716821076


In [80]:
fact_kudos_df_dtype_map = {
    "first_name": String,
    "last_name": String,
    "full_name": String,
    "activity_id": BigInteger
}
with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_KUDOS_TABLE} will be overwritten.")

fact_kudos_df.to_sql(
    name=FACT_KUDOS_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_kudos_df_dtype_map,
    method="multi",
    chunksize=1000
)



-8

### Setup `gold.fact_laps`

In [81]:
fact_laps_df = laps_df.copy()

In [82]:
fact_laps_df = fact_laps_df.drop(columns=['moving_time_td', 'elapsed_time_td'])
fact_laps_df.loc[:, "moving_time_td"]  = extract_timedelta(fact_laps_df["moving_time"])
fact_laps_df.loc[:, "elapsed_time_td"] = extract_timedelta(fact_laps_df["elapsed_time"])

In [83]:
fact_laps_df['date'] = fact_laps_df['start_date_local_dt'].dt.date
fact_laps_df['time'] = fact_laps_df['start_date_local_dt'].dt.time

In [84]:
fact_laps_cols_clean = [
  'id',
  'name',
  'lap_index',
  'split',
  'date',
  'time',
  'distance',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'total_elevation_gain',
  'average_speed',
  'avg_pace_str',
  'avg_pace_float',
  'max_speed',
  'max_pace_str',
  'max_pace_float',
  'average_cadence',
  'average_heartrate',
  'max_heartrate',
  'activity_id'
]
fact_laps_df = fact_laps_df[fact_laps_cols_clean]

In [85]:
fact_laps_df.head()

Unnamed: 0,id,name,lap_index,split,date,time,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_heartrate,max_heartrate,activity_id
0,49980673377,Lap 1,1,1,2025-04-05,13:39:40,0.0,6066,1:41:06,6066,1:41:06,0.0,0.0,,,0.0,,,,106.9,149.0,14086094444
1,49956432295,Lap 1,1,1,2025-04-04,20:37:40,1000.0,811,0:13:31,811,0:13:31,8.4,1.23,,,3.4,,,51.7,94.8,107.0,14080030310
2,49956432299,Lap 2,2,2,2025-04-04,20:37:40,1000.0,745,0:12:25,745,0:12:25,3.8,1.34,,,1.814,,,52.1,95.0,107.0,14080030310
3,49956432304,Lap 3,3,3,2025-04-04,20:37:40,1000.0,704,0:11:44,704,0:11:44,3.4,1.42,,,1.709,,,52.9,95.4,106.0,14080030310
4,49956432306,Lap 4,4,4,2025-04-04,20:37:40,1000.0,720,0:12:00,720,0:12:00,0.0,1.39,,,1.791,,,52.1,94.1,100.0,14080030310


In [86]:
fact_laps_df_dtype_map = {
"id": BigInteger,
"name": String,
"lap_index": Integer,
"split": Integer,
"date": Date,
"time": Time,
"distance": Float,
"moving_time": Integer,
"moving_time_td": Interval,
"elapsed_time": Integer,
"elapsed_time_td": Interval,
"total_elevation_gain": Float,
"average_speed": Float,
"avg_pace_str": String,
"avg_pace_float": Float,
"max_speed": Float,
"max_pace_str": String,
"max_pace_float": Float,
"average_cadence": Float,
"average_heartrate": Float,
"max_heartrate": Float,
"activity_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_LAPS_TABLE} will be overwritten.")

fact_laps_df.to_sql(
    name=FACT_LAPS_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_laps_df_dtype_map,
    method="multi",
    chunksize=1000
)




-9

### Setup `gold.fact_zones`

In [87]:
fact_zones_df = zones_df.copy()

In [88]:
fact_zones_cols_clean = [
    'activity_id',
    'type',
    'zone_number',
    'zone_name',
    'time'
]
fact_zones_df = fact_zones_df[fact_zones_cols_clean]

In [89]:
fact_zones_df.head()

Unnamed: 0,activity_id,type,zone_number,zone_name,time
0,15923268347,heartrate,1,Z1 - Recovery,73.0
1,15923268347,heartrate,2,Z2 - Endurance,1706.0
2,15923268347,heartrate,3,Z3 - Tempo,1290.0
3,15923268347,heartrate,4,Z4 - Threshold,0.0
4,15923268347,heartrate,5,Z5 - Anaerobic,0.0


In [90]:
fact_zones_dtype_map = {
    "activity_id": BigInteger,
    "type": String,
    'zone_number': Integer,
    "zone_name": String,
    "time": Float
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_ZONES_TABLE} will be overwritten.")

fact_zones_df.to_sql(
    name=FACT_ZONES_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_zones_dtype_map,
    method="multi",
    chunksize=1000
)



-8

### Setup `gold.fact_maps`

In [91]:
fact_maps_df = maps_df.copy()

In [92]:
fact_maps_df = fact_maps_df.rename(columns={'id': 'map_id'})

In [93]:
fact_maps_df

Unnamed: 0,map_id,point_id,lat,lng
0,a14086094444,0,,
1,a14080030310,0,51.10761,17.12408
2,a14080030310,1,51.10761,17.12406
3,a14080030310,2,51.10756,17.12405
4,a14080030310,3,51.10744,17.12399
...,...,...,...,...
387621,a16038944725,436,52.73748,15.21183
387622,a16038944725,437,52.73764,15.21178
387623,a16038944725,438,52.73781,15.21179
387624,a16038944725,439,52.73852,15.21171


In [94]:
fact_maps_dtype_map = {
    "map_id": String,
    "point_id": Integer,
    'lat': Float,
    "lng": Float
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_MAPS_TABLE} will be overwritten.")

fact_maps_df.to_sql(
    name=FACT_MAPS_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_maps_dtype_map,
    method="multi",
    chunksize=1000
)



-388

### Primary keys definition

In [95]:
keys_instructions= [
    f"""CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};""",
    # ********** PRIMARY KEYS **********
    # --- dim_calendar ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_CALENDAR_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_CALENDAR_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_CALENDAR_TABLE}
          ADD CONSTRAINT {DIM_CALENDAR_TABLE}_pkey PRIMARY KEY (date);
      END IF;
    END $$;
    """,
    # --- dim_device ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_DEVICE_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_DEVICE_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_DEVICE_TABLE}
          ADD CONSTRAINT {DIM_DEVICE_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_effort_type ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_EFFORT_TYPE_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_EFFORT_TYPE_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_EFFORT_TYPE_TABLE}
          ADD CONSTRAINT {DIM_EFFORT_TYPE_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_gear ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_GEAR_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_GEAR_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_GEAR_TABLE}
          ADD CONSTRAINT {DIM_GEAR_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_location ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_LOCATION_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_LOCATION_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_LOCATION_TABLE}
          ADD CONSTRAINT {DIM_LOCATION_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_segment ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_SEGMENT_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_SEGMENT_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_SEGMENT_TABLE}
          ADD CONSTRAINT {DIM_SEGMENT_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_sport_type ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_SPORT_TYPE_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_SPORT_TYPE_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_SPORT_TYPE_TABLE}
          ADD CONSTRAINT {DIM_SPORT_TYPE_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_time ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_TIME_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_TIME_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_TIME_TABLE}
          ADD CONSTRAINT {DIM_TIME_TABLE}_pkey PRIMARY KEY (time);
      END IF;
    END $$;
    """,
    # --- dim_workout_type ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_WORKOUT_TYPE_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_WORKOUT_TYPE_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_WORKOUT_TYPE_TABLE}
          ADD CONSTRAINT {DIM_WORKOUT_TYPE_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- fact_activities ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{FACT_ACTIVITIES_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{FACT_ACTIVITIES_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{FACT_ACTIVITIES_TABLE}
          ADD CONSTRAINT {FACT_ACTIVITIES_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- fact_best_efforts ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{FACT_BEST_EFFORTS_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{FACT_BEST_EFFORTS_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{FACT_BEST_EFFORTS_TABLE}
          ADD CONSTRAINT {FACT_BEST_EFFORTS_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- fact_laps ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{FACT_LAPS_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{FACT_LAPS_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{FACT_LAPS_TABLE}
          ADD CONSTRAINT {FACT_LAPS_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """
]

In [96]:
with engine.begin() as conn:
    for sql in keys_instructions:
        conn.execute(text(sql))