In [97]:
import altair as alt
import polars as pl
from polars import col
from datetime import datetime

In [None]:
# Quick look at the raw file: glimpse prints one line per column,
# capped here at 10 sample values each.
(
    pl.scan_parquet('strava.parquet')
    .collect()
    .glimpse(max_items_per_column=10)
)

In [139]:
# dataframe for analysis - 2024 activities
# Lazy query: nothing is read from disk until a later cell calls .collect().
lf = (
    pl.scan_parquet('strava.parquet')
    .select(
        "name",
        "type",
        "distance",
        "moving_time",
        "elapsed_time",
        "total_elevation_gain",
        "start_date_local",
        "kudos_count",
        "comment_count",
        "photo_count",
        "elev_high",
    )
    .with_columns(
        # divided by 1000 so later cells can label it as km
        # (presumably stored in metres in the export - TODO confirm)
        col("distance").cast(pl.Float32) / 1000,
        # drop the time-of-day part so rows are keyed by calendar date
        col("start_date_local").cast(pl.Datetime).cast(pl.Date),
    )
    .filter(
        # Bound both ends of the year: with only a lower bound, a refreshed
        # export would silently pull later years into the "2024" analysis.
        (col("start_date_local") >= datetime(2024, 1, 1))
        & (col("start_date_local") < datetime(2025, 1, 1))
    )
    .sort("start_date_local")
)

In [223]:
# Running total of elevation gain, one row per activity, in the order lf yields them.
cumulative_gain = (
    lf.select(col("total_elevation_gain"), col("start_date_local"))
    .with_columns(
        cumulative_elevation_gain=col("total_elevation_gain").cum_sum()
    )
    .collect()
)

# building a cumulative gain chart
cumulative_gain_chart = alt.Chart(cumulative_gain).mark_line().encode(
    alt.X("start_date_local:T").title("date"),
    alt.Y("cumulative_elevation_gain:Q").title("elevation gain"),
    tooltip=["start_date_local:T", "total_elevation_gain:Q", "cumulative_elevation_gain:Q"],
).properties(
    width=800,
    height=400,
    title="Cumulative Elevation Gain (metres) per day - 2024"
)

# Hand-placed text labels. The dates are plain strings parsed by the chart
# renderer, so they must be full zero-padded ISO-8601 dates: a value such as
# "2024-08-2" is not ISO and is parsed in an implementation-dependent way
# (different time zone, or not at all), unlike the other rows.
annotation_rows = [
    {"start_date_local": "2024-03-15", "cumulative_elevation_gain": 1800, "text": "12,215 bananas >"},
    {"start_date_local": "2024-05-27", "cumulative_elevation_gain": 7500, "text": "1,600 male giraffes >"},
    {"start_date_local": "2024-08-02", "cumulative_elevation_gain": 13500, "text": "70 elizabeth line trains >"},
]
annotations = alt.Chart(pl.DataFrame(annotation_rows)).mark_text(
    align="left", dx=5, dy=-10
).encode(
    alt.X("start_date_local:T"),
    alt.Y("cumulative_elevation_gain:Q"),
    text="text:N"
)

# layer the labels over the line chart and display the result
c = cumulative_gain_chart + annotations
c

In [124]:
# kudos per activity
# count() only counts non-null kudos values, so the ratio is the mean over
# activities that actually have a kudos_count.
lf.select(
    sum=col("kudos_count").sum(),
    count=col("kudos_count").count(),
    kudos_per_activity=col("kudos_count").sum() / col("kudos_count").count(),
).collect()

sum,count,kudos_per_activity
i64,u32,f64
5733,437,13.118993


In [123]:
# kudos per km
# Only runs are considered. Note: despite its name, run_lf is an eager
# DataFrame because the query is collected here.
run_lf = (
    lf.filter(col("type") == "Run")
    .collect(streaming=True)
)
run_lf.select(
    kudos_sum=col("kudos_count").sum(),
    km_sum=col("distance").sum(),
    kudos_per_km=col("kudos_count").sum() / col("distance").sum(),
)

kudos_sum,km_sum,kudos_per_km
i64,f32,f64
3089,1458.220703,2.118335


In [None]:
# scatterplot analysis - all sports
# custom_caption_flag is 1 when the activity name is NOT one of the listed
# time-of-day titles (presumably Strava's auto-generated defaults), else 0.
caption_analysis = (
    lf.select("name", "type", "kudos_count", "start_date_local", "distance")
    .with_columns(
        custom_caption_flag=(
            ~col("name").is_in([
                "Morning Run", "Lunch Run", "Afternoon Run", "Evening Run",
                "Morning Ride", "Lunch Ride", "Afternoon Ride", "Evening Ride",
                "Morning Workout", "Lunch Workout", "Afternoon Workout", "Evening Workout",
            ])
        ).cast(pl.Int32)
    )
)

In [125]:
# materialise the run-only slice of the caption analysis
caption_analysis_run = (
    caption_analysis
    .filter(col("type") == "Run")
    .collect(streaming=True)
)

# building the chart - run
# The 0/1 flag is rewritten to 'Yes'/'No' inside the chart so the legend and
# the colour scale domain line up.
chart_run = (
    alt.Chart(caption_analysis_run)
    .transform_calculate(
        custom_caption_flag="datum.custom_caption_flag == 1 ? 'Yes':'No'"
    )
    .mark_point()
    .encode(
        alt.X("distance:Q", title="distance ran (km)"),
        alt.Y("kudos_count:Q", title="number of kudos"),
        alt.Color("custom_caption_flag:N")
        .title("custom caption used?")
        .scale(domain=["Yes", "No"], range=["#ff6a00", "#9cd0f7"]),
        tooltip="start_date_local:T",
    )
    .properties(width=400, height=400)
    .configure_point(size=50, filled=True)
    .interactive()
)

chart_run

In [103]:
# Pearson correlation between distance and kudos over every activity in lf
# (all sports, not just runs).
corr = (
    lf.select(
        pl.corr(col("distance"), col("kudos_count"), method="pearson")
    )
    .collect()
)
corr

distance
f64
0.301658
