In [12]:
import altair as alt
import polars as pl
from polars import col
from datetime import datetime

In [13]:
# Quick look at the raw strava.parquet file: one line per column with its
# dtype and up to 10 sample values.
(
    pl.scan_parquet('strava.parquet')
    .collect()
    .glimpse(max_items_per_column=10)
)

Rows: 1000
Columns: 53
$ resource_state                      <i64> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
$ name                                <str> 'Evening Run', 'Lunch Run', 'Afternoon Run', 'pre-run snack', 'POV of the mixed shawarma in my stomach:', 'Afternoon Run', 'ffs pressed save by accident', 'Lunch Run', 'Running Man 2024 lol', 'late for yoga'
$ distance                            <f64> 8803.0, 13518.3, 10001.9, 14008.1, 7752.5, 2876.2, 8767.0, 8011.2, 1474.2, 1180.0
$ moving_time                         <i64> 3402, 4598, 3744, 5319, 2862, 1006, 2700, 2864, 1211, 346
$ elapsed_time                        <i64> 3498, 4648, 3859, 5659, 3216, 1015, 2946, 2948, 5367, 349
$ total_elevation_gain                <f64> 33.9, 59.1, 41.5, 43.4, 34.6, 12.7, 41.5, 27.7, 0.0, 0.0
$ type                                <str> 'Run', 'Run', 'Run', 'Run', 'Run', 'Run', 'Run', 'Run', 'Walk', 'Run'
$ sport_type                          <str> 'Run', 'Run', 'Run', 'Run', 'Run', 'Run', 'Run', 'Run', 'Walk',

In [14]:
# dataframe for analysis - 2024 activities
#
# Lazy query over strava.parquet that every later cell builds on:
#   * keeps only the columns the analysis uses,
#   * rescales `distance` by 1/1000 (later charts label it in km),
#   * truncates `start_date_local` to a calendar date,
#   * keeps activities dated within calendar year 2024,
#   * sorts chronologically (the cumulative sum and the monthly
#     group_by_dynamic further down rely on this ordering).
lf = (
    pl.scan_parquet('strava.parquet')
    .select(
        col("name"),
        col("type"),
        col("distance"),
        col("moving_time"),
        col("elapsed_time"),
        col("total_elevation_gain"),
        col("start_date_local"),
        col("kudos_count"),
        col("comment_count"),
        col("photo_count"),
        col("elev_high"),
    )
    .with_columns(
        col("distance").cast(pl.Float32) / 1000,
        col("start_date_local").cast(pl.Datetime).cast(pl.Date),
    )
    .filter(
        # Bound the year on both sides: with only a lower bound, a refreshed
        # export containing 2025+ activities would leak into the "2024" charts
        # and into the month-name-only labels used later.
        (col("start_date_local") >= datetime(2024, 1, 1))
        & (col("start_date_local") < datetime(2025, 1, 1))
    )
    .sort("start_date_local")
)

In [15]:
# Running total of elevation gain, one row per activity.
# `lf` is already sorted by start_date_local, so cum_sum accumulates in
# chronological order.
cumulative_gain = lf.select(
    col("total_elevation_gain"),
    col("start_date_local"),
).with_columns(
    cumulative_elevation_gain=col("total_elevation_gain").cum_sum()
).collect()

# building a cumulative gain chart
cumulative_gain_chart = alt.Chart(cumulative_gain).mark_line().encode(
    alt.X("start_date_local:T").title("date"),
    alt.Y("cumulative_elevation_gain:Q").title("elevation gain"),
    tooltip=["start_date_local:T", "total_elevation_gain:Q", "cumulative_elevation_gain:Q"],
).properties(
    width=800,
    height=400,
    title="Cumulative Elevation Gain (metres) per day - 2024"
)

# Hand-placed text labels. Each row positions one label at (date, cumulative gain).
# Dates must be zero-padded ISO-8601 (YYYY-MM-DD): anything else is parsed in a
# browser-dependent way when the chart renders, which can shift or drop the label.
annotations = alt.Chart(pl.DataFrame([
    {"start_date_local": "2024-03-15", "cumulative_elevation_gain": 1800, "text": "12,215 bananas >"},
    {"start_date_local": "2024-05-27", "cumulative_elevation_gain": 7500, "text": "1,600 male giraffes >"},
    {"start_date_local": "2024-08-02", "cumulative_elevation_gain": 13500, "text": "70 elizabeth line trains >"},
])).mark_text(align="left", dx=5, dy=-10).encode(
    alt.X("start_date_local:T"),
    alt.Y("cumulative_elevation_gain:Q"),
    text="text:N"
)

# Layer the labels on top of the line and display the result.
c = cumulative_gain_chart + annotations
c

In [16]:
# kudos per activity
# Total kudos, number of activities with a kudos value, and their ratio.
kudos_total = pl.sum("kudos_count")
activity_count = pl.count("kudos_count")

lf.select(
    sum=kudos_total,
    count=activity_count,
    kudos_per_activity=kudos_total / activity_count,
).collect()

sum,count,kudos_per_activity
i64,u32,f64
5733,437,13.118993


In [17]:
# kudos per km
# Runs only. Note: despite the `_lf` suffix this is a collected (eager) DataFrame.
run_lf = lf.filter(col("type") == "Run").collect()

run_kudos_total = pl.sum("kudos_count")
run_km_total = pl.sum("distance")  # `distance` was rescaled by 1/1000 when `lf` was built

run_lf.select(
    kudos_sum=run_kudos_total,
    km_sum=run_km_total,
    kudos_per_km=run_kudos_total / run_km_total,
)

kudos_sum,km_sum,kudos_per_km
i64,f32,f64
3089,1458.220703,2.118335


In [38]:
# scatterplot analysis - all sports
# Activity names treated as "not custom": every "<time of day> <activity>" pair
# below (presumably the titles Strava assigns automatically - verify).
default_captions = [
    f"{time_of_day} {activity}"
    for activity in ("Run", "Ride", "Workout")
    for time_of_day in ("Morning", "Lunch", "Afternoon", "Evening")
]

# 1 when the activity name is NOT one of the default captions, otherwise 0.
is_custom_caption = (~col("name").is_in(default_captions)).cast(pl.Int32)

caption_analysis = lf.select(
    "name",
    "type",
    "kudos_count",
    "start_date_local",
    "moving_time",
    "elapsed_time",
    "distance",
).with_columns(custom_caption_flag=is_custom_caption)

In [39]:
# Materialise the run-only subset for plotting.
caption_analysis_run = caption_analysis.filter(col("type") == "Run").collect()

# building the chart - run
# Colour channel: the 0/1 flag is relabelled to 'Yes'/'No' by the calculate
# transform below, and the scale pins each label to a fixed colour.
run_caption_colour = alt.Color(
    "custom_caption_flag:N",
    title="custom caption used?",
    scale=alt.Scale(domain=["Yes", "No"], range=["#ff6a00", "#9cd0f7"]),
)

chart_run = (
    alt.Chart(caption_analysis_run)
    .mark_point()
    .encode(
        x=alt.X("distance:Q").title("distance ran (km)"),
        y=alt.Y("kudos_count:Q").title("number of kudos"),
        color=run_caption_colour,
        tooltip="start_date_local:T",
    )
    .transform_calculate(
        custom_caption_flag="datum.custom_caption_flag == 1 ? 'Yes':'No'"
    )
    .configure_point(size=50, filled=True)
    .properties(width=400, height=400)
    .interactive()
)

chart_run

In [32]:
# Pearson correlation between run distance and kudos received.
# The resulting single column takes the name of the first input ("distance").
run_distance_vs_kudos = pl.corr("distance", "kudos_count", method="pearson")
corr = caption_analysis_run.select(run_distance_vs_kudos)
corr

distance
f64
0.561919


In [40]:
# Materialise the ride-only subset for plotting.
caption_analysis_bike = caption_analysis.filter(col("type") == "Ride").collect()

# building the chart - bike
# Colour channel: the 0/1 flag is relabelled to 'Yes'/'No' by the calculate
# transform below, and the scale pins each label to a fixed colour.
bike_caption_colour = alt.Color(
    "custom_caption_flag:N",
    title="custom caption used?",
    scale=alt.Scale(domain=["Yes", "No"], range=["#ff6a00", "#9cd0f7"]),
)

chart_bike = (
    alt.Chart(caption_analysis_bike)
    .mark_point()
    .encode(
        x=alt.X("distance:Q").title("distance biked (km)"),
        y=alt.Y("kudos_count:Q").title("number of kudos"),
        color=bike_caption_colour,
        tooltip="start_date_local:T",
    )
    .transform_calculate(
        custom_caption_flag="datum.custom_caption_flag == 1 ? 'Yes':'No'"
    )
    .configure_point(size=50, filled=True)
    .properties(width=400, height=400)
    .interactive()
)

chart_bike

In [33]:
# Pearson correlation between ride distance and kudos received.
# The resulting single column takes the name of the first input ("distance").
bike_distance_vs_kudos = pl.corr("distance", "kudos_count", method="pearson")
corr = caption_analysis_bike.select(bike_distance_vs_kudos)
corr

distance
f64
0.689147


In [75]:
# Monthly totals of moving vs. elapsed time for runs.
# group_by_dynamic buckets rows into calendar-month windows on start_date_local;
# the frame is already in date order because it derives from the sorted `lf`.
monthly_time_totals = [
    col("moving_time").sum(),
    col("elapsed_time").sum(),
]
elapsed_analysis = caption_analysis_run.group_by_dynamic(
    "start_date_local", every="1mo"
).agg(monthly_time_totals)

# delta = time spent not moving (elapsed minus moving), in the same unit as the
# inputs (presumably seconds - confirm against the export).
# Month is shown by full name only.
elapsed_analysis.select(
    "moving_time",
    "elapsed_time",
    delta=col("elapsed_time") - col("moving_time"),
    activity_month=col("start_date_local").dt.strftime("%B"),
)

moving_time,elapsed_time,delta,activity_month
i64,i64,i64,str
40853,42458,1605,"""January"""
28758,31452,2694,"""February"""
51606,53934,2328,"""March"""
51819,56676,4857,"""April"""
89808,97391,7583,"""May"""
…,…,…,…
35158,37341,2183,"""August"""
32736,34990,2254,"""September"""
51638,56206,4568,"""October"""
57262,61422,4160,"""November"""
