### Import and config

In [1]:
# Imports
import os
import logging
from datetime import timedelta

from dotenv import load_dotenv

import pandas as pd
import numpy as np

from sqlalchemy import create_engine, text, Integer, Float, String, Boolean, Date, Interval, Text, BigInteger, Time


# Configuration
load_dotenv()

# DB
DB_URI = os.getenv('DB_URI')

# Silver tables
TARGET_S_SCHEMA = os.getenv('TARGET_S_SCHEMA')
ACTIVITIES_S_TABLE = os.getenv('ACTIVITIES_S_TABLE')
LOCATIONS_S_TABLE = os.getenv('LOCATIONS_S_TABLE')
GEAR_S_TABLE = os.getenv('GEAR_S_TABLE')
SEGMENTS_S_TABLE = os.getenv('SEGMENTS_S_TABLE')
BEST_EFFORTS_S_TABLE = os.getenv('BEST_EFFORTS_S_TABLE')
SEGMENTS_EFFORTS_S_TABLE = os.getenv('SEGMENTS_EFFORTS_S_TABLE')
LAPS_S_TABLE = os.getenv('LAPS_S_TABLE')
KUDOS_S_TABLE = os.getenv('KUDOS_S_TABLE')

# Gold tables
TARGET_G_SCHEMA = os.getenv('TARGET_G_SCHEMA')
FACT_ACTIVITIES_TABLE = os.getenv('FACT_ACTIVITIES_TABLE')
FACT_SEGMENTS_EFFORTS_TABLE = os.getenv('FACT_SEGMENTS_EFFORTS_TABLE')
FACT_BEST_EFFORTS_TABLE = os.getenv('FACT_BEST_EFFORTS_TABLE')
FACT_KUDOS_TABLE = os.getenv('FACT_KUDOS_TABLE')
FACT_LAPS_TABLE = os.getenv('FACT_LAPS_TABLE')

DIM_CALENDAR_TABLE = os.getenv('DIM_CALENDAR_TABLE')
DIM_TIME_TABLE = os.getenv('DIM_TIME_TABLE')
DIM_SPORT_TYPE_TABLE = os.getenv('DIM_SPORT_TYPE_TABLE')
DIM_DEVICE_TABLE = os.getenv('DIM_DEVICE_TABLE')
DIM_LOCATION_TABLE = os.getenv('DIM_LOCATION_TABLE')
DIM_GEAR_TABLE = os.getenv('DIM_GEAR_TABLE')
DIM_SEGMENT_TABLE = os.getenv('DIM_SEGMENT_TABLE')
DIM_EFFORT_TYPE_TABLE = os.getenv('DIM_EFFORT_TYPE_TABLE')
DIM_WORKOUT_TYPE_TABLE = os.getenv('DIM_WORKOUT_TYPE_TABLE')

# Other
LOG_LEVEL = os.getenv('LOG_LEVEL')

logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
    format="%(asctime)s | %(levelname)s | %(message)s"
)

pd.set_option('display.max_columns', None)

### DB names validation

In [2]:
REQUIRED_ENV = [
  'DB_URI',
  'TARGET_S_SCHEMA','TARGET_G_SCHEMA',
  'ACTIVITIES_S_TABLE','LOCATIONS_S_TABLE','GEAR_S_TABLE','SEGMENTS_S_TABLE','BEST_EFFORTS_S_TABLE','SEGMENTS_EFFORTS_S_TABLE','LAPS_S_TABLE','KUDOS_S_TABLE',
  'DIM_CALENDAR_TABLE','DIM_TIME_TABLE','DIM_DEVICE_TABLE','DIM_SPORT_TYPE_TABLE','DIM_LOCATION_TABLE','DIM_GEAR_TABLE','DIM_SEGMENT_TABLE','DIM_EFFORT_TYPE_TABLE','DIM_WORKOUT_TYPE_TABLE',
  'FACT_ACTIVITIES_TABLE','FACT_SEGMENTS_EFFORTS_TABLE','FACT_BEST_EFFORTS_TABLE','FACT_LAPS_TABLE','FACT_KUDOS_TABLE'
]
missing = [k for k in REQUIRED_ENV if not os.getenv(k)]
if missing:
    raise RuntimeError(f"Missing env variables: {', '.join(missing)}")


### Request data from `silver` layer

In [3]:
engine = create_engine(
  DB_URI, 
  pool_pre_ping=True, 
  pool_size=5, 
  max_overflow=10
)
logging.info("Connection established")

2025-09-23 10:30:24,901 | INFO | Connection established


In [4]:
with engine.begin() as conn:
  activities_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE}"), conn)
  locations_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{LOCATIONS_S_TABLE}"), conn)
  gear_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{GEAR_S_TABLE}"), conn)
  segments_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{SEGMENTS_S_TABLE}"), conn)
  best_eff_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{BEST_EFFORTS_S_TABLE}"), conn)
  seg_eff_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{SEGMENTS_EFFORTS_S_TABLE}"), conn)
  laps_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{LAPS_S_TABLE}"), conn)
  kudos_df = pd.read_sql(text(f"SELECT * FROM {TARGET_S_SCHEMA}.{KUDOS_S_TABLE}"), conn)
logging.info(f"Data from {TARGET_S_SCHEMA}.{ACTIVITIES_S_TABLE} downloaded.")

2025-09-23 10:30:25,348 | INFO | Data from silver.activities downloaded.


In [5]:
activities_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,elev_low,elev_high,type,sport_type,workout_type,achievement_count,kudos_count,comment_count,athlete_count,photo_count,trainer,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,total_photo_count,suffer_score,description,calories,device_name,start_lat,start_lng,map_id,gear_id,location_id
0,15855640218,K200süèéÔ∏è,2025-09-18 17:30:30,2025-09-18 19:30:30,Etc/GMT-2.0,9915.6,3272,0 days 00:54:32,3314,0 days 00:55:14,10.0,115.6,122.8,Run,Run,3.0,0,9,0,1,0,False,False,False,everyone,3.03,5:30,5.50055,4.82,3:27,3.457815,164.4,365.2,546.0,382.0,True,151.7,176.0,0,0,60.0,K200s with Runna ‚úÖ\n\nKilometr√≥wki z dwusetkam...,765.0,Garmin Forerunner 970,51.107177,17.123797,a15855640218,g24134620,1179.0
1,15843349072,9km Easy RunüëΩ,2025-09-17 16:08:34,2025-09-17 18:08:34,Etc/GMT-2.0,9051.8,3095,0 days 00:51:35,3142,0 days 00:52:22,16.0,114.8,123.0,Run,Run,,0,6,0,1,0,False,False,False,everyone,2.925,5:42,5.698006,4.12,4:03,4.045307,169.4,367.5,513.0,364.0,True,145.1,152.0,0,0,41.0,9km Easy Run with Runna ‚úÖ\n\nWyjƒÖtkowo ≈ºwawe e...,699.0,Garmin Forerunner 970,51.107162,17.123739,a15843349072,g24134620,1179.0
2,15831049874,Afternoon Weight Training,2025-09-16 15:01:07,2025-09-16 17:01:07,Etc/GMT-2.0,0.0,3825,0 days 01:03:45,3825,0 days 01:03:45,0.0,0.0,0.0,Workout,WeightTraining,,0,5,1,1,0,True,False,False,followers_only,0.0,,,0.0,,,,,,,True,94.0,222.0,0,0,9.0,Reska8Ô∏è‚É£8Ô∏è‚É£\nBench press PR: 85kgü•≥,254.0,Garmin Forerunner 970,,,a15831049874,x00000000,
3,15820198827,Tempo 2kmü•µ,2025-09-15 16:23:21,2025-09-15 18:23:21,Etc/GMT-2.0,9521.9,3241,0 days 00:54:01,3241,0 days 00:54:01,13.0,115.4,125.0,Run,Run,3.0,0,8,0,1,0,False,False,False,everyone,2.938,5:40,5.672793,4.94,3:22,3.373819,167.8,353.5,493.0,369.0,True,153.1,178.0,0,0,66.0,Tempo 2km Repeats with Runna ‚úÖ\n\nWysz≈Ço troch...,735.0,Garmin Forerunner 970,51.107301,17.124098,a15820198827,g24134620,1179.0
4,15805849875,15km Long Run‚òîÔ∏è,2025-09-14 09:59:25,2025-09-14 11:59:25,Etc/GMT-2.0,15059.0,5461,0 days 01:31:01,5488,0 days 01:31:28,31.0,114.6,125.4,Run,Run,2.0,0,4,0,1,0,False,False,False,everyone,2.758,6:03,6.043026,3.44,4:51,4.844961,172.0,335.5,455.0,334.0,True,144.8,153.0,0,0,66.0,15km Long Run with Runna ‚úÖ\n\nOkrutny beton po...,1170.0,Garmin Forerunner 970,51.107336,17.124136,a15805849875,g24134620,1179.0


In [6]:
locations_df.head()

Unnamed: 0,id,country,region,locality
0,1000,Austria,Lower Austria,Waidhofen an der Ybbs
1,1001,Austria,Upper Austria,Garsten
2,1002,Austria,Upper Austria,Gr√ºnburg
3,1003,Austria,Upper Austria,Linz
4,1004,Austria,Upper Austria,St. Ulrich bei Steyr


In [7]:
gear_df.head()

Unnamed: 0,id,name,distance_m,distance_km
0,g24134620,ASICS Novablast 5,333012.0,333.0
1,x00000000,No gear,0.0,0.0
2,b12572672,Cube Nuroad Pro,3734608.0,3734.6
3,g23642256,Adidas EVO SL,146967.0,147.0
4,g19800575,Nike Invincible Run 3 White,430102.0,430.1


In [8]:
segments_df.head()

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,elevation_profile,elevation_profiles,climb_category,private,hazardous,starred,start_lat,start_lng,end_lat,end_lng,location_id
0,38033619,BƒÖczek counterclockwise,Walk,3722.4,0.0,8.0,123.8,118.2,,,0.0,False,False,False,51.104196,17.124846,51.104209,17.124837,1179
1,17455167,Pƒôtla od ≈õluzy,Run,3886.4,0.0,10.5,120.0,112.9,,,0.0,False,False,False,51.104195,17.124249,51.104082,17.124274,1179
2,10082640,Most Chrobrego- most Bartoszowicki,Run,1582.0,0.0,2.5,118.2,115.5,,,0.0,False,False,False,51.113182,17.108164,51.102416,17.122673,1179
3,10082666,Swojczycki - Sluza revers,Run,1580.4,0.0,2.7,118.8,114.6,,,0.0,False,False,False,51.103481,17.12464,51.114135,17.109978,1179
4,22997595,Po kostce do Grobli,Run,456.5,0.2,7.5,117.0,116.0,,,0.0,False,False,False,51.104347,17.125358,51.101168,17.129389,1179


In [9]:
best_eff_df.head()

Unnamed: 0,id,name,start_date_utc_dt,start_date_local_dt,local_timezone,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,rank,type,activity_id
0,55038759090,400m,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,400.0,147,0 days 00:02:27,147,0 days 00:02:27,,,12956260994
1,55038759091,1/2 mile,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,805.0,301,0 days 00:05:01,301,0 days 00:05:01,,,12956260994
2,55038759092,1K,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,1000.0,376,0 days 00:06:16,376,0 days 00:06:16,,,12956260994
3,55038759086,1 mile,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,1609.0,612,0 days 00:10:12,612,0 days 00:10:12,,,12956260994
4,55038759087,2 mile,2024-11-22 06:54:04,2024-11-22 07:54:04,Etc/GMT-1.0,3219.0,1238,0 days 00:20:38,1238,0 days 00:20:38,,,12956260994


### Setup `gold.dim_calendar`

In [10]:
activities_df['start_date_local_dt'].dt.date.min()

datetime.date(2016, 7, 11)

In [11]:
dim_calendar_df = pd.DataFrame(
  {'date': pd.date_range(
    activities_df['start_date_local_dt'].dt.date.min(), 
    activities_df['start_date_local_dt'].dt.date.max(), 
    freq="D")}
)
dim_calendar_df = dim_calendar_df.sort_values(by='date', ascending=False).reset_index(drop=True)

In [12]:
#Year
dim_calendar_df['year'] = dim_calendar_df['date'].dt.year
dim_calendar_df['year_start_date'] = dim_calendar_df['date'].dt.to_period('Y').dt.start_time

# Month
dim_calendar_df['month'] = dim_calendar_df['date'].dt.month
dim_calendar_df['month_year'] = dim_calendar_df['date'].dt.to_period('M').astype('str')
dim_calendar_df['month_start_date'] = dim_calendar_df['date'].dt.to_period('M').dt.start_time
dim_calendar_df['month_name_year'] = dim_calendar_df['date'].dt.strftime('%b-%Y')
dim_calendar_df['month_name'] = dim_calendar_df['date'].dt.strftime('%B') 

# Week
dim_calendar_df['week'] = dim_calendar_df['date'].dt.isocalendar().week
dim_calendar_df['week_start_date'] = dim_calendar_df['date'].dt.to_period('W-MON').dt.start_time

# Day
dim_calendar_df['day'] = dim_calendar_df['date'].dt.day
dim_calendar_df['day_of_year'] = dim_calendar_df['date'].dt.day_of_year
dim_calendar_df['day_of_week'] = dim_calendar_df['date'].dt.weekday + 1
dim_calendar_df['day_of_week_name'] = dim_calendar_df['date'].dt.day_name()
dim_calendar_df['is_weekend'] = dim_calendar_df['date'].dt.weekday + 1 >= 6

In [13]:
dim_calendar_df.head()

Unnamed: 0,date,year,year_start_date,month,month_year,month_start_date,month_name_year,month_name,week,week_start_date,day,day_of_year,day_of_week,day_of_week_name,is_weekend
0,2025-09-18,2025,2025-01-01,9,2025-09,2025-09-01,Sep-2025,September,38,2025-09-16,18,261,4,Thursday,False
1,2025-09-17,2025,2025-01-01,9,2025-09,2025-09-01,Sep-2025,September,38,2025-09-16,17,260,3,Wednesday,False
2,2025-09-16,2025,2025-01-01,9,2025-09,2025-09-01,Sep-2025,September,38,2025-09-16,16,259,2,Tuesday,False
3,2025-09-15,2025,2025-01-01,9,2025-09,2025-09-01,Sep-2025,September,38,2025-09-09,15,258,1,Monday,False
4,2025-09-14,2025,2025-01-01,9,2025-09,2025-09-01,Sep-2025,September,37,2025-09-09,14,257,7,Sunday,True


In [14]:
dim_calendar_df_dtype_map = {
    "date": Date,
    "year": Integer,
    "year_start_date": Date,
    "month": Integer,
    "month_year": String,
    "month_start_date": Date,
    "month_name_year": String,
    "month_name": String,
    "week": Integer,
    "week_start_date": Date,
    "day": Integer,
    "day_of_year": Integer,
    "day_of_week": Integer,
    "day_of_week_name": String,
    "is_weekend": Boolean
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_CALENDAR_TABLE} will be overwritten.")

dim_calendar_df.to_sql(
    name=DIM_CALENDAR_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_calendar_df_dtype_map,
    method="multi",
    chunksize=1000
)



-4

### Setup `gold.dim_time`

In [15]:
times = pd.date_range("1900-01-01 00:00:00", "1900-01-01 23:59:59", freq="s")
dim_time_df = pd.DataFrame({"datetime": times})

In [16]:
dim_time_df['time'] = dim_time_df['datetime'].dt.time
dim_time_df['hour'] = dim_time_df['datetime'].dt.hour
dim_time_df['minute'] = dim_time_df['datetime'].dt.minute
dim_time_df['second'] = dim_time_df['datetime'].dt.second
dim_time_df['hour_minute'] = dim_time_df['datetime'].dt.strftime("%H:%M")
dim_time_df['hour_label'] = dim_time_df['datetime'].dt.strftime("%H:00")
dim_time_df['day_part'] = pd.cut(dim_time_df['hour'], 
                                 bins=[-1,5,11,13,17,23],
                                 labels=['night','morning','lunch','afternoon','evening'],
                                 include_lowest=True)
dim_time_df = dim_time_df.drop(columns='datetime')

In [17]:
dim_time_df.head()

Unnamed: 0,time,hour,minute,second,hour_minute,hour_label,day_part
0,00:00:00,0,0,0,00:00,00:00,night
1,00:00:01,0,0,1,00:00,00:00,night
2,00:00:02,0,0,2,00:00,00:00,night
3,00:00:03,0,0,3,00:00,00:00,night
4,00:00:04,0,0,4,00:00,00:00,night


In [18]:
dim_time_df_dtype_map = {
    "time": Time,
    "hour": Integer,
    "minute": Integer,
    "second": Integer,
    "hour_minute": String,
    "hour_label": String,
    "day_part": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_TIME_TABLE} will be overwritten.")

dim_time_df.to_sql(
    name=DIM_TIME_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_time_df_dtype_map,
    method="multi",
    chunksize=1000
)



-87

### Setup `gold.dim_sport_type`

In [19]:
dim_sport_type_df = (activities_df['sport_type']
                     .copy()
                     .drop_duplicates()
                     .to_frame()
                     .reset_index(drop=True))
dim_sport_type_df['sport_type_id'] = 1000 + np.arange(len(dim_sport_type_df))

In [20]:
dim_sport_type_df = dim_sport_type_df[['sport_type_id', 'sport_type']]
dim_sport_type_df = dim_sport_type_df.rename(columns={'sport_type_id': 'id'})

In [21]:
dim_sport_type_df.head()

Unnamed: 0,id,sport_type
0,1000,Run
1,1001,WeightTraining
2,1002,Ride
3,1003,Walk
4,1004,Workout


In [22]:
dim_sport_type_df_dtype_map = {
    "id": Integer,
    "sport_type": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_SPORT_TYPE_TABLE} will be overwritten.")

dim_sport_type_df.to_sql(
    name=DIM_SPORT_TYPE_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_sport_type_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_device`

In [23]:
dim_device_df = (activities_df['device_name']
                     .copy()
                     .drop_duplicates()
                     .to_frame()
                     .reset_index(drop=True))
dim_device_df['device_id'] = 1000 + np.arange(len(dim_device_df))

In [24]:
dim_device_df = dim_device_df[['device_id', 'device_name']]
dim_device_df = dim_device_df.rename(columns={'device_id': 'id'})
dim_device_df['device_name'] = dim_device_df['device_name'].fillna('No device')

In [25]:
dim_device_df.head()

Unnamed: 0,id,device_name
0,1000,Garmin Forerunner 970
1,1001,Garmin Edge 840
2,1002,No device
3,1003,Apple Watch SE
4,1004,Garmin Forerunner 945


In [26]:
dim_device_df_dtype_map = {
    "id": Integer,
    "device_name": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_DEVICE_TABLE} will be overwritten.")

dim_device_df.to_sql(
    name=DIM_DEVICE_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_device_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_location`

In [27]:
dim_location_df = locations_df.copy()

In [28]:
dim_location_df.head()

Unnamed: 0,id,country,region,locality
0,1000,Austria,Lower Austria,Waidhofen an der Ybbs
1,1001,Austria,Upper Austria,Garsten
2,1002,Austria,Upper Austria,Gr√ºnburg
3,1003,Austria,Upper Austria,Linz
4,1004,Austria,Upper Austria,St. Ulrich bei Steyr


In [29]:
dim_location_df_dtype_map = {
    "id": Integer,
    "locality": String,
    "region": String,
    "country": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_LOCATION_TABLE} will be overwritten.")

dim_location_df.to_sql(
    name=DIM_LOCATION_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_location_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_gear`

In [30]:
dim_gear_df = gear_df.copy()

In [31]:
dim_gear_df = dim_gear_df.sort_values(by='id')
dim_gear_df = dim_gear_df.reset_index(drop=True)

In [32]:
dim_gear_df.head()

Unnamed: 0,id,name,distance_m,distance_km
0,b12572672,Cube Nuroad Pro,3734608.0,3734.6
1,b13100260,Cube Nuroad Pro Wirtualnie,520771.0,520.8
2,g11165677,New Balance 1080 v12,1194617.0,1194.6
3,g11783267,Nike Zoom Fly 4,272798.0,272.8
4,g17673165,Nike Invincible Run 3 Black,666029.0,666.0


In [33]:
dim_gear_df_dtype_map = {
    "id": String,
    "name": String,
    "distance_m": Float,
    "distance_km": Float,
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_GEAR_TABLE} will be overwritten.")

dim_gear_df.to_sql(
    name=DIM_GEAR_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_gear_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_segment`

In [34]:
dim_segment_df = segments_df.copy()

In [35]:
dim_segment_cols_clean = [
  'id',
  'name',
  'activity_type',
  'distance',
  'average_grade',
  'maximum_grade',
  'elevation_high',
  'elevation_low',
  'climb_category',
  'location_id'
]

In [36]:
dim_segment_df = dim_segment_df[dim_segment_cols_clean]
dim_segment_df = dim_segment_df.sort_values(by='id')
dim_segment_df = dim_segment_df.reset_index(drop=True)

In [37]:
dim_segment_df.head()

Unnamed: 0,id,name,activity_type,distance,average_grade,maximum_grade,elevation_high,elevation_low,climb_category,location_id
0,1094192,Opatowitz 2and1/2m,Ride,294.308,1.6,2.7,120.7,115.8,0.0,1179
1,1137415,Passeig De Garcia F√†ria Climb,Run,361.742,1.0,18.1,17.2,-1.3,0.0,1230
2,1137416,Carretera De Montju√Øc Climb,Run,439.109,2.8,8.1,19.2,6.9,0.0,1230
3,1332451,Pasikurowice,Ride,11505.7,0.2,3.6,134.6,111.0,0.0,1179
4,1616501,Legnicka - Kleci≈Ñska,Ride,1481.65,0.4,2.9,121.0,113.3,0.0,1179


In [38]:
dim_segment_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "activity_type": String,
    "distance": Float,
    "average_grade": Float,
    "maximum_grade": Float,
    "elevation_high": Float,
    "elevation_low": Float,
    "climb_category": Float,
    "location_id": Integer
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_SEGMENT_TABLE} will be overwritten.")

dim_segment_df.to_sql(
    name=DIM_SEGMENT_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_segment_df_dtype_map,
    method="multi",
    chunksize=1000
)



-2

### Setup `gold.dim_effort_type`

In [39]:
dim_effort_type_df = (best_eff_df['name']
                     .copy()
                     .drop_duplicates()
                     .to_frame()
                     .reset_index(drop=True))
dim_effort_type_df['id'] = 1000 + np.arange(len(dim_effort_type_df))

In [40]:
dim_effort_type_df = dim_effort_type_df[['id', 'name']]

In [41]:
dim_effort_type_df.head()

Unnamed: 0,id,name
0,1000,400m
1,1001,1/2 mile
2,1002,1K
3,1003,1 mile
4,1004,2 mile


In [42]:
dim_effort_type_df_dtype_map = {
    "id": Integer,
    "name": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_EFFORT_TYPE_TABLE} will be overwritten.")

dim_effort_type_df.to_sql(
    name=DIM_EFFORT_TYPE_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_effort_type_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.dim_workout_type`

In [43]:
def extract_timedelta(time: pd.Series) -> pd.Series:
  """
  Convert a Series of numeric values (seconds) into timedeltas.

  Parameters
  ----------
  time : pd.Series
      Series containing durations expressed in seconds (int/float). 
      Null values are preserved as None.

  Returns
  -------
  pd.Series
      Series of Python ``datetime.timedelta`` objects. 
      Each element corresponds to the given number of seconds or None if missing.
  """

  return pd.Series([(timedelta(seconds=int(t)) if pd.notnull(t) else None) for t in time], dtype="object")

In [44]:
dim_workout_type_df = pd.DataFrame([
    {"id": 0.0, "type": "Run - General"},
    {"id": 1.0, "type": "Run - Race"},
    {"id": 2.0, "type": "Run - Long Run"},
    {"id": 3.0, "type": "Run - Workout"},
    {"id": 10.0, "type": "Ride - General"},
    {"id": 11.0, "type": "Ride - Race"},
    {"id": 12.0, "type": "Ride - Workout"},
    {"id": 20.0, "type": "Other"}
])

In [45]:
dim_workout_type_df.head()

Unnamed: 0,id,type
0,0.0,Run - General
1,1.0,Run - Race
2,2.0,Run - Long Run
3,3.0,Run - Workout
4,10.0,Ride - General


In [46]:
dim_workout_type_df_dtype_map = {
    "id": Integer,
    "type": String
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{DIM_WORKOUT_TYPE_TABLE} will be overwritten.")

dim_workout_type_df.to_sql(
    name=DIM_WORKOUT_TYPE_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=dim_workout_type_df_dtype_map,
    method="multi",
    chunksize=1000
)



-1

### Setup `gold.fact_activities`

In [47]:
def extract_timedelta(time: pd.Series) -> pd.Series:
  """
  Convert a Series of numeric values (seconds) into timedeltas.

  Parameters
  ----------
  time : pd.Series
      Series containing durations expressed in seconds (int/float). 
      Null values are preserved as None.

  Returns
  -------
  pd.Series
      Series of Python ``datetime.timedelta`` objects. 
      Each element corresponds to the given number of seconds or None if missing.
  """

  return pd.Series([(timedelta(seconds=int(t)) if pd.notnull(t) else None) for t in time], dtype="object")

In [48]:
fact_activities_df = activities_df.copy()

In [49]:
fact_activities_df = fact_activities_df.drop(columns=['moving_time_td', 'elapsed_time_td'])
fact_activities_df.loc[:, "moving_time_td"]  = extract_timedelta(fact_activities_df["moving_time"])
fact_activities_df.loc[:, "elapsed_time_td"] = extract_timedelta(fact_activities_df["elapsed_time"])

In [50]:
fact_activities_df = pd.merge(fact_activities_df, dim_sport_type_df, how='left', on='sport_type')
fact_activities_df = fact_activities_df.rename(columns={'id_y': 'sport_type_id', 'id_x': 'id'})
fact_activities_df = pd.merge(fact_activities_df, dim_device_df, how='left', on='device_name')
fact_activities_df = fact_activities_df.rename(columns={'id_y': 'device_id', 'id_x': 'id'})


In [51]:
fact_activities_df['date'] = fact_activities_df['start_date_local_dt'].dt.date
fact_activities_df['time'] = fact_activities_df['start_date_local_dt'].dt.time

In [52]:
fact_activities_df['workout_type'] = fact_activities_df.apply(
    lambda r: 0 if pd.isna(r['workout_type']) and r['sport_type'] == 'Run'
              else 20 if pd.isna(r['workout_type'])
              else r['workout_type'],
    axis=1
)
fact_activities_df = fact_activities_df.rename(columns={'workout_type': 'workout_type_id'})

In [53]:
fact_activities_cols_clean = [
  'id',
  'name',
  'description',
  'date',
  'time',
  'distance',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'total_elevation_gain',
  'achievement_count',
  'kudos_count',
  'comment_count',
  'athlete_count',
  'photo_count',
  'commute',
  'manual',
  'visibility',
  'average_speed',
  'avg_pace_str',
  'avg_pace_float',
  'max_speed',
  'max_pace_str',
  'max_pace_float',
  'average_cadence',
  'average_watts',
  'max_watts',
  'weighted_average_watts',
  'has_heartrate',
  'average_heartrate',
  'max_heartrate',
  'pr_count',
  'suffer_score',
  'calories',
  'gear_id',
  'location_id',
  'sport_type_id',
  'device_id',
  'workout_type_id'
]
fact_activities_df = fact_activities_df[fact_activities_cols_clean]

In [54]:
fact_activities_df.head()

Unnamed: 0,id,name,description,date,time,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,achievement_count,kudos_count,comment_count,athlete_count,photo_count,commute,manual,visibility,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_watts,max_watts,weighted_average_watts,has_heartrate,average_heartrate,max_heartrate,pr_count,suffer_score,calories,gear_id,location_id,sport_type_id,device_id,workout_type_id
0,15855640218,K200süèéÔ∏è,K200s with Runna ‚úÖ\n\nKilometr√≥wki z dwusetkam...,2025-09-18,19:30:30,9915.6,3272,0:54:32,3314,0:55:14,10.0,0,9,0,1,0,False,False,everyone,3.03,5:30,5.50055,4.82,3:27,3.457815,164.4,365.2,546.0,382.0,True,151.7,176.0,0,60.0,765.0,g24134620,1179.0,1000,1000.0,3.0
1,15843349072,9km Easy RunüëΩ,9km Easy Run with Runna ‚úÖ\n\nWyjƒÖtkowo ≈ºwawe e...,2025-09-17,18:08:34,9051.8,3095,0:51:35,3142,0:52:22,16.0,0,6,0,1,0,False,False,everyone,2.925,5:42,5.698006,4.12,4:03,4.045307,169.4,367.5,513.0,364.0,True,145.1,152.0,0,41.0,699.0,g24134620,1179.0,1000,1000.0,0.0
2,15831049874,Afternoon Weight Training,Reska8Ô∏è‚É£8Ô∏è‚É£\nBench press PR: 85kgü•≥,2025-09-16,17:01:07,0.0,3825,1:03:45,3825,1:03:45,0.0,0,5,1,1,0,False,False,followers_only,0.0,,,0.0,,,,,,,True,94.0,222.0,0,9.0,254.0,x00000000,,1001,1000.0,20.0
3,15820198827,Tempo 2kmü•µ,Tempo 2km Repeats with Runna ‚úÖ\n\nWysz≈Ço troch...,2025-09-15,18:23:21,9521.9,3241,0:54:01,3241,0:54:01,13.0,0,8,0,1,0,False,False,everyone,2.938,5:40,5.672793,4.94,3:22,3.373819,167.8,353.5,493.0,369.0,True,153.1,178.0,0,66.0,735.0,g24134620,1179.0,1000,1000.0,3.0
4,15805849875,15km Long Run‚òîÔ∏è,15km Long Run with Runna ‚úÖ\n\nOkrutny beton po...,2025-09-14,11:59:25,15059.0,5461,1:31:01,5488,1:31:28,31.0,0,4,0,1,0,False,False,everyone,2.758,6:03,6.043026,3.44,4:51,4.844961,172.0,335.5,455.0,334.0,True,144.8,153.0,0,66.0,1170.0,g24134620,1179.0,1000,1000.0,2.0


In [55]:
fact_activities_df_dtype_map = {
    "id": BigInteger,
    "name": String,
    "description": Text,
    "date": Date,
    "time": Time,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "total_elevation_gain": Float,
    "achievement_count": Integer,
    "kudos_count": Integer,
    "comment_count": Integer,
    "athlete_count": Integer,
    "photo_count": Integer,
    "commute": Boolean,
    "manual": Boolean,
    "visibility": String,
    "average_speed": Float,
    "avg_pace_str": String,
    "avg_pace_float": Float,
    "max_speed": Float,
    "max_pace_str": String,
    "max_pace_float": Float,
    "average_cadence": Float,
    "average_watts": Float,
    "max_watts": Float,
    "weighted_average_watts": Float,
    "has_heartrate": Boolean,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_count": Integer,
    "suffer_score": Float,
    "calories": Float,
    "gear_id": String,
    "location_id": Integer,
    "sport_type_id": Integer,
    "device_id": Integer,
    "workout_type_id": Integer
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_ACTIVITIES_TABLE} will be overwritten.")

fact_activities_df.to_sql(
    name=FACT_ACTIVITIES_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_activities_df_dtype_map,
    method="multi",
    chunksize=1000
)



-2

### Setup `gold.fact_segment_efforts`

In [56]:
fact_segment_efforts_df = seg_eff_df.copy()

In [57]:
fact_segment_efforts_df = fact_segment_efforts_df.drop(columns=['moving_time_td', 'elapsed_time_td'])
fact_segment_efforts_df.loc[:, "moving_time_td"]  = extract_timedelta(fact_segment_efforts_df["moving_time"])
fact_segment_efforts_df.loc[:, "elapsed_time_td"] = extract_timedelta(fact_segment_efforts_df["elapsed_time"])

In [58]:
fact_segment_efforts_df['date'] = fact_segment_efforts_df['start_date_local_dt'].dt.date
fact_segment_efforts_df['time'] = fact_segment_efforts_df['start_date_local_dt'].dt.time

In [59]:
fact_segment_efforts_cols_clean = [
  'id',
  'date',
  'time',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'average_cadence',
  'device_watts',
  'average_watts',
  'average_heartrate',
  'max_heartrate',
  'pr_rank',
  'visibility',
  'kom_rank',
  'rank',
  'type',
  'segment_id',
  'activity_id'
]
fact_segment_efforts_df = fact_segment_efforts_df[fact_segment_efforts_cols_clean]

In [60]:
fact_segment_efforts_df.head()

Unnamed: 0,id,date,time,moving_time,moving_time_td,elapsed_time,elapsed_time_td,average_cadence,device_watts,average_watts,average_heartrate,max_heartrate,pr_rank,visibility,kom_rank,rank,type,segment_id,activity_id
0,3343296484852327920,2025-04-04,20:45:56,2692,0:44:52,2694,0:44:54,52.3,False,,94.9,107.0,,followers_only,,,,38033619,14080030310
1,3294936864762577520,2024-11-22,07:57:15,1449,0:24:09,1449,0:24:09,165.6,True,291.4,148.4,157.0,,everyone,,,,17455167,12956260994
2,3294936864763706992,2024-11-22,08:09:05,624,0:10:24,624,0:10:24,166.2,True,285.2,149.4,155.0,,everyone,,,,10082640,12956260994
3,3294936864763212400,2024-11-22,08:20:54,631,0:10:31,631,0:10:31,165.8,True,293.3,150.9,156.0,,everyone,,,,10082666,12956260994
4,3294936864765383280,2024-11-22,08:21:28,1444,0:24:04,1444,0:24:04,165.6,True,292.3,151.8,157.0,,everyone,,,,17455167,12956260994


In [61]:
fact_segment_efforts_df_dtype_map = {
    "id": BigInteger,
    "date": Date,
    "time": Time,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "average_cadence": Float,
    "device_watts": Boolean,
    "average_watts": Float,
    "average_heartrate": Float,
    "max_heartrate": Float,
    "pr_rank": Integer,
    "visibility": String,
    "kom_rank": Integer,
    "rank": Integer,
    "type": String,
    "activity_id": BigInteger,
    "segment_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_SEGMENTS_EFFORTS_TABLE} will be overwritten.")

fact_segment_efforts_df.to_sql(
    name=FACT_SEGMENTS_EFFORTS_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_segment_efforts_df_dtype_map,
    method="multi",
    chunksize=1000
)



-6

### Setup `gold.fact_best_efforts`

In [62]:
fact_best_efforts_df = best_eff_df.copy()

In [63]:
fact_best_efforts_df = pd.merge(fact_best_efforts_df, dim_effort_type_df, how='left', on='name')
fact_best_efforts_df = fact_best_efforts_df.rename(columns={'id_y': 'effort_type_id', 'id_x': 'id'})

In [64]:
fact_best_efforts_df = fact_best_efforts_df.drop(columns=['moving_time_td', 'elapsed_time_td'])
fact_best_efforts_df.loc[:, "moving_time_td"]  = extract_timedelta(fact_best_efforts_df["moving_time"])
fact_best_efforts_df.loc[:, "elapsed_time_td"] = extract_timedelta(fact_best_efforts_df["elapsed_time"])

In [65]:
fact_best_efforts_df['date'] = fact_best_efforts_df['start_date_local_dt'].dt.date
fact_best_efforts_df['time'] = fact_best_efforts_df['start_date_local_dt'].dt.time

In [66]:
fact_best_efforts_cols_clean = [
  'id',
  'date',
  'time',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'rank',
  'type',
  'effort_type_id',
  'activity_id'
]
fact_best_efforts_df = fact_best_efforts_df[fact_best_efforts_cols_clean]

In [67]:
fact_best_efforts_df.head()

Unnamed: 0,id,date,time,moving_time,moving_time_td,elapsed_time,elapsed_time_td,rank,type,effort_type_id,activity_id
0,55038759090,2024-11-22,07:54:04,147,0:02:27,147,0:02:27,,,1000,12956260994
1,55038759091,2024-11-22,07:54:04,301,0:05:01,301,0:05:01,,,1001,12956260994
2,55038759092,2024-11-22,07:54:04,376,0:06:16,376,0:06:16,,,1002,12956260994
3,55038759086,2024-11-22,07:54:04,612,0:10:12,612,0:10:12,,,1003,12956260994
4,55038759087,2024-11-22,07:54:04,1238,0:20:38,1238,0:20:38,,,1004,12956260994


In [68]:
fact_best_efforts_df_dtype_map = {
    "id": BigInteger,
    "date": Date,
    "time": Time,
    "moving_time": Integer,
    "moving_time_td": Interval,
    "elapsed_time": Integer,
    "elapsed_time_td": Interval,
    "rank": Integer,
    "type": String,
    "effort_type_id": BigInteger,
    "activity_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_BEST_EFFORTS_TABLE} will be overwritten.")

fact_best_efforts_df.to_sql(
    name=FACT_BEST_EFFORTS_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_best_efforts_df_dtype_map,
    method="multi",
    chunksize=1000
)



-4

### Setup `gold.fact_kudos`

In [69]:
fact_kudos_df = kudos_df.copy()

In [70]:
fact_kudos_df.head()

Unnamed: 0,first_name,last_name,full_name,activity_id
0,Filip,C.,Filip C.,15831049874
1,Ola,≈Å.,Ola ≈Å.,15831049874
2,Mal,C.,Mal C.,15831049874
3,Agnieszka,G.,Agnieszka G.,15831049874
4,Wies≈Çawa,C.,Wies≈Çawa C.,15831049874


In [71]:
fact_kudos_df_dtype_map = {
    "first_name": String,
    "last_name": String,
    "full_name": String,
    "activity_id": BigInteger
}
with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_KUDOS_TABLE} will be overwritten.")

fact_kudos_df.to_sql(
    name=FACT_KUDOS_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_kudos_df_dtype_map,
    method="multi",
    chunksize=1000
)



-8

### Setup `gold.fact_laps`

In [72]:
fact_laps_df = laps_df.copy()

In [73]:
fact_laps_df = fact_laps_df.drop(columns=['moving_time_td', 'elapsed_time_td'])
fact_laps_df.loc[:, "moving_time_td"]  = extract_timedelta(fact_laps_df["moving_time"])
fact_laps_df.loc[:, "elapsed_time_td"] = extract_timedelta(fact_laps_df["elapsed_time"])

In [74]:
fact_laps_df['date'] = fact_laps_df['start_date_local_dt'].dt.date
fact_laps_df['time'] = fact_laps_df['start_date_local_dt'].dt.time

In [75]:
fact_laps_cols_clean = [
  'id',
  'name',
  'lap_index',
  'split',
  'date',
  'time',
  'distance',
  'moving_time',
  'moving_time_td',
  'elapsed_time',
  'elapsed_time_td',
  'total_elevation_gain',
  'average_speed',
  'avg_pace_str',
  'avg_pace_float',
  'max_speed',
  'max_pace_str',
  'max_pace_float',
  'average_cadence',
  'average_heartrate',
  'max_heartrate',
  'activity_id'
]
fact_laps_df = fact_laps_df[fact_laps_cols_clean]

In [76]:
fact_laps_df.head()

Unnamed: 0,id,name,lap_index,split,date,time,distance,moving_time,moving_time_td,elapsed_time,elapsed_time_td,total_elevation_gain,average_speed,avg_pace_str,avg_pace_float,max_speed,max_pace_str,max_pace_float,average_cadence,average_heartrate,max_heartrate,activity_id
0,49980673377,Lap 1,1,1,2025-04-05,13:39:40,0.0,6066,1:41:06,6066,1:41:06,0.0,0.0,,,0.0,,,,106.9,149.0,14086094444
1,49956432295,Lap 1,1,1,2025-04-04,20:37:40,1000.0,811,0:13:31,811,0:13:31,8.4,1.23,,,3.4,,,51.7,94.8,107.0,14080030310
2,49956432299,Lap 2,2,2,2025-04-04,20:37:40,1000.0,745,0:12:25,745,0:12:25,3.8,1.34,,,1.814,,,52.1,95.0,107.0,14080030310
3,49956432304,Lap 3,3,3,2025-04-04,20:37:40,1000.0,704,0:11:44,704,0:11:44,3.4,1.42,,,1.709,,,52.9,95.4,106.0,14080030310
4,49956432306,Lap 4,4,4,2025-04-04,20:37:40,1000.0,720,0:12:00,720,0:12:00,0.0,1.39,,,1.791,,,52.1,94.1,100.0,14080030310


In [77]:
fact_laps_df_dtype_map = {
"id": BigInteger,
"name": String,
"lap_index": Integer,
"split": Integer,
"date": Date,
"time": Time,
"distance": Float,
"moving_time": Integer,
"moving_time_td": Interval,
"elapsed_time": Integer,
"elapsed_time_td": Interval,
"total_elevation_gain": Float,
"average_speed": Float,
"avg_pace_str": String,
"avg_pace_float": Float,
"max_speed": Float,
"max_pace_str": String,
"max_pace_float": Float,
"average_cadence": Float,
"average_heartrate": Float,
"max_heartrate": Float,
"activity_id": BigInteger
}

with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};"))

logging.warning(f"Whole table {TARGET_G_SCHEMA}.{FACT_LAPS_TABLE} will be overwritten.")

fact_laps_df.to_sql(
    name=FACT_LAPS_TABLE,
    schema=TARGET_G_SCHEMA,
    con=engine,
    if_exists="replace",
    index=False,
    dtype=fact_laps_df_dtype_map,
    method="multi",
    chunksize=1000
)




-9

### Primary keys definition

In [78]:
keys_instructions= [
    f"""CREATE SCHEMA IF NOT EXISTS {TARGET_G_SCHEMA};""",
    # ********** PRIMARY KEYS **********
    # --- dim_calendar ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_CALENDAR_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_CALENDAR_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_CALENDAR_TABLE}
          ADD CONSTRAINT {DIM_CALENDAR_TABLE}_pkey PRIMARY KEY (date);
      END IF;
    END $$;
    """,
    # --- dim_device ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_DEVICE_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_DEVICE_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_DEVICE_TABLE}
          ADD CONSTRAINT {DIM_DEVICE_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_effort_type ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_EFFORT_TYPE_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_EFFORT_TYPE_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_EFFORT_TYPE_TABLE}
          ADD CONSTRAINT {DIM_EFFORT_TYPE_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_gear ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_GEAR_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_GEAR_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_GEAR_TABLE}
          ADD CONSTRAINT {DIM_GEAR_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_location ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_LOCATION_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_LOCATION_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_LOCATION_TABLE}
          ADD CONSTRAINT {DIM_LOCATION_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_segment ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_SEGMENT_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_SEGMENT_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_SEGMENT_TABLE}
          ADD CONSTRAINT {DIM_SEGMENT_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_sport_type ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_SPORT_TYPE_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_SPORT_TYPE_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_SPORT_TYPE_TABLE}
          ADD CONSTRAINT {DIM_SPORT_TYPE_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- dim_time ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_TIME_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_TIME_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_TIME_TABLE}
          ADD CONSTRAINT {DIM_TIME_TABLE}_pkey PRIMARY KEY (time);
      END IF;
    END $$;
    """,
    # --- dim_workout_type ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{DIM_WORKOUT_TYPE_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{DIM_WORKOUT_TYPE_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{DIM_WORKOUT_TYPE_TABLE}
          ADD CONSTRAINT {DIM_WORKOUT_TYPE_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- fact_activities ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{FACT_ACTIVITIES_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{FACT_ACTIVITIES_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{FACT_ACTIVITIES_TABLE}
          ADD CONSTRAINT {FACT_ACTIVITIES_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- fact_best_efforts ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{FACT_BEST_EFFORTS_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{FACT_BEST_EFFORTS_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{FACT_BEST_EFFORTS_TABLE}
          ADD CONSTRAINT {FACT_BEST_EFFORTS_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """,
    # --- fact_laps ---
    # PK
    f"""
    DO $$
    BEGIN
      IF to_regclass('{TARGET_G_SCHEMA}.{FACT_LAPS_TABLE}') IS NOT NULL
         AND NOT EXISTS (
           SELECT 1 FROM pg_constraint
           WHERE conrelid = to_regclass('{TARGET_G_SCHEMA}.{FACT_LAPS_TABLE}')
             AND contype = 'p'
         )
      THEN
        ALTER TABLE {TARGET_G_SCHEMA}.{FACT_LAPS_TABLE}
          ADD CONSTRAINT {FACT_LAPS_TABLE}_pkey PRIMARY KEY (id);
      END IF;
    END $$;
    """
]

In [79]:
with engine.begin() as conn:
    for sql in keys_instructions:
        conn.execute(text(sql))