In [23]:
from dotenv import load_dotenv
import os
import pandas as pd
from sqlalchemy import Text, Integer, BigInteger, Float, Boolean
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import JSONB
# Lift pandas' display limit on the number of columns (None = no limit) so wide
# frames are shown in full when rendered.
pd.set_option('display.max_columns', None)

### Load env variables

In [10]:
# Load key/value pairs from a .env file into the process environment so that
# later os.getenv(...) lookups can see them.
load_dotenv()

True

### SQL setup

In [11]:
# Build the SQLAlchemy engine from the DB_URI environment variable.
# os.environ[...] is used instead of os.getenv(...) so that a missing variable
# fails right here with `KeyError: 'DB_URI'`, rather than passing None into
# create_engine() and surfacing as a less obvious argument error.
engine = create_engine(os.environ['DB_URI'])

## Request data

### All activities Dataframe

In [12]:
# Read the whole bronze.activities table into a DataFrame.
activitie_list_query = "SELECT * FROM bronze.activities"
activities_list_df = pd.read_sql(sql=activitie_list_query, con=engine)

### All activities with details Dataframe

In [13]:
# Read the whole bronze.activities_details table into a DataFrame.
activities_details_query = "SELECT * FROM bronze.activities_details"
activities_details_df = pd.read_sql(sql=activities_details_query, con=engine)

### All kudos Dataframe

In [14]:
# Read the whole bronze.kudos table into a DataFrame.
kudos_query = "SELECT * FROM bronze.kudos"
kudos_df = pd.read_sql(sql=kudos_query, con=engine)

### Separate tables setup

In [16]:
# Column whitelist per output table: key = table name, value = ordered list of
# column names to keep for that table.
# Each list must contain unique names: selecting a DataFrame with a repeated
# label produces duplicate columns. 'gear_id' used to be listed twice under
# 'activities'; it is now listed once.
# NOTE(review): the 'map_', 'gear_' and 'segment_' prefixes presumably come
# from flattening nested objects with '_' as separator - confirm upstream.
dataframe_columns = {
  'activities' : [
    'id',
    'name',
    'distance',
    'moving_time',
    'elapsed_time',
    'total_elevation_gain',
    'type',
    'sport_type',
    'workout_type',
    'start_date',
    'start_date_local',
    'timezone',
    'utc_offset',
    'location_city',
    'location_state',
    'location_country',
    'achievement_count',
    'kudos_count',
    'comment_count',
    'athlete_count',
    'photo_count',
    'trainer',
    'commute',
    'manual',
    'private',
    'visibility',
    'flagged',
    'gear_id',
    'start_latlng',
    'end_latlng',
    'average_speed',
    'max_speed',
    'average_cadence',
    'average_watts',
    'max_watts',
    'weighted_average_watts',
    'device_watts',
    'kilojoules',
    'has_heartrate',
    'average_heartrate',
    'max_heartrate',
    'heartrate_opt_out',
    'display_hide_heartrate_option',
    'elev_high',
    'elev_low',
    'upload_id',
    'upload_id_str',
    'external_id',
    'from_accepted_tag',
    'pr_count',
    'total_photo_count',
    'has_kudoed',
    'suffer_score',
    'description',
    'calories',
    'perceived_exertion',
    'prefer_perceived_exertion',
    'hide_from_home',
    'device_name',
    'embed_token',
    'available_zones',
    'map_id'],
  'maps' : [
    'map_id',
    'map_polyline',
    'map_resource_state',
    'map_summary_polyline'],
  'gear' : [
    'gear_id',
    'gear_primary',
    'gear_name',
    'gear_nickname',
    'gear_resource_state',
    'gear_retired',
    'gear_distance',
    'gear_converted_distance'],
  'segment_efforts' : [
    'id',
    'resource_state',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'start_index',
    'end_index',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'pr_rank',
    'achievements',
    'visibility',
    'kom_rank',
    'hidden',
    'activity_id',
    'segment_id'],
  'segments' : [
    'segment_id',
    'segment_resource_state',
    'segment_name',
    'segment_activity_type',
    'segment_distance',
    'segment_average_grade',
    'segment_maximum_grade',
    'segment_elevation_high',
    'segment_elevation_low',
    'segment_start_latlng',
    'segment_end_latlng',
    'segment_elevation_profile',
    'segment_elevation_profiles',
    'segment_climb_category',
    'segment_city',
    'segment_state',
    'segment_country',
    'segment_private',
    'segment_hazardous',
    'segment_starred'],
  'laps' : [
    'id',
    'resource_state',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'average_speed',
    'max_speed',
    'lap_index',
    'split',
    'start_index',
    'end_index',
    'total_elevation_gain',
    'average_cadence',
    'device_watts',
    'average_watts',
    'average_heartrate',
    'max_heartrate',
    'pace_zone',
    'activity_id'],
  'best_efforts' : [
    'id',
    'activity_id',
    'resource_state',
    'name',
    'elapsed_time',
    'moving_time',
    'start_date',
    'start_date_local',
    'distance',
    'pr_rank',
    'achievements',
    'start_index',
    'end_index']
}

### Splitting data into tables

In [76]:
# Split the activity-details frame into one DataFrame per target table.
# For every table only the columns named in `dataframe_columns` are kept;
# names that are not present in the data are skipped instead of raising.
#
# NOTE(review): exploding a list column yields a NaN entry for activities whose
# list is empty or missing, which json_normalize may turn into an all-NaN row.
# Only best_efforts drops such rows below - confirm whether segment_efforts,
# segments and laps should do the same.

# Activities
activities_cols = dataframe_columns['activities']
activities_df = activities_details_df[[c for c in activities_cols if c in activities_details_df.columns]].copy()

# Maps
maps_cols = dataframe_columns['maps']
maps_df = activities_details_df[[c for c in maps_cols if c in activities_details_df.columns]].copy()

# Gear: exact duplicate rows are collapsed
gear_cols = dataframe_columns['gear']
gear_df = activities_details_df[[c for c in gear_cols if c in activities_details_df.columns]].copy()
gear_df = gear_df.drop_duplicates()

# Segment efforts: one row per element of the `segment_efforts` list column,
# nested keys flattened with '_' as separator. explode() already returns a new
# frame, so no defensive copy of the source is needed.
seg_eff_cols = dataframe_columns['segment_efforts']
segments_eff_exploded_df = activities_details_df.explode('segment_efforts').reset_index(drop=True)
segment_efforts_flat_df = pd.json_normalize(segments_eff_exploded_df['segment_efforts'], sep='_')
segments_eff_df = segment_efforts_flat_df[[c for c in seg_eff_cols if c in segment_efforts_flat_df.columns]]

# Segments: taken from the same flattened segment-effort records, so the
# explode/normalize work above is reused rather than repeated.
seg_cols = dataframe_columns['segments']
segments_exploded_df = segments_eff_exploded_df  # name kept for any later cell that uses it
segments_df = segment_efforts_flat_df[[c for c in seg_cols if c in segment_efforts_flat_df.columns]]

# Laps
lap_cols = dataframe_columns['laps']
laps_exploded_df = activities_details_df.explode('laps').reset_index(drop=True)
laps_df = pd.json_normalize(laps_exploded_df['laps'], sep='_')
laps_df = laps_df[[c for c in lap_cols if c in laps_df.columns]]

# Best efforts: filtered with the best-efforts column list. The laps list was
# used here before, which silently dropped 'pr_rank' and 'achievements'.
best_eff_cols = dataframe_columns['best_efforts']
best_eff_exploded_df = activities_details_df.explode('best_efforts').reset_index(drop=True)
best_eff_df = pd.json_normalize(best_eff_exploded_df['best_efforts'], sep='_')
best_eff_df = best_eff_df[[c for c in best_eff_cols if c in best_eff_df.columns]].dropna(how="all")

# All dataframes in a dictionary, keyed by target table name
dataframes = {
    "activities": activities_df,
    "maps": maps_df,
    "gear": gear_df,
    "segment_efforts": segments_eff_df,
    "segments": segments_df,
    "laps": laps_df,
    "best_efforts": best_eff_df,
    "kudos" : kudos_df
}