In [16]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [17]:
# Location of the results CSV; out_dir is also where later cells save their figures.
out_dir = "output"
input_file_path = os.path.join(out_dir, "example-results-01.csv")

# Load the raw results; the analysis cells below work on a copy of this frame.
df_orig = pd.read_csv(filepath_or_buffer=input_file_path)

FileNotFoundError: [Errno 2] No such file or directory: 'output/example-results-01.csv'

In [None]:
# Quick look at the raw data: summary statistics, schema, then the first rows.
display(df_orig.describe())
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display() (that would render a stray "None" after the report).
df_orig.info()
display(df_orig.head(10))

In [None]:
# Build the elevation-focused view of the results.

# Rank runners by elevation gain (1 = most gain). sort_values() returns a new
# frame, so df_orig itself is left untouched.
df = df_orig.sort_values(by="elevation_gain_ft", ascending=False)
df["elevation_place"] = range(1, len(df) + 1)

# Keep only the columns of interest, in the order they should be displayed.
columns = [
    "place",
    "elevation_place",
    "elevation_gain_ft",
    "distance_miles",
    "name",
    "gender",
    "city",
    "state",
    "country",
    "pace",
    "clock_time",
    "age",
]
df = df[columns]

In [None]:
# df is already sorted by elevation gain (descending), so the first 15 rows
# are the 15 runners with the most elevation.
display(df.head(15))

## Elevation gain histograms

In [None]:
# Histogram of elevation gain over every runner, written to out_dir as a PNG.
out_file_path = os.path.join(out_dir, "output-elevation-distribution-all.png")

fig, ax = plt.subplots()
# histplot draws onto (and returns) the axes passed in, so no reassignment is needed.
sns.histplot(data=df, x="elevation_gain_ft", ax=ax)
ax.set_title("Elevation Dist for everyone", fontsize=18)
ax.set_xlabel("Elevation Gain (ft)", fontsize=16)
ax.set_ylabel("Count", fontsize=16)

fig.savefig(out_file_path, dpi=300)

In [None]:
# Histogram of elevation gain restricted to runners above 5000 ft, saved as a PNG.
out_file_path = os.path.join(out_dir, "output-elevation-distribution-gt-5k-ft.png")

fig, ax = plt.subplots()
# Boolean mask keeps only rows whose elevation gain is strictly greater than 5000 ft.
sns.histplot(
    data=df.loc[df["elevation_gain_ft"] > 5000],
    x="elevation_gain_ft",
    ax=ax,
)
ax.set_title("Elevation Dist for those with > 5k ft", fontsize=18)
ax.set_xlabel("Elevation Gain (ft)", fontsize=16)
ax.set_ylabel("Count", fontsize=16)

fig.savefig(out_file_path, dpi=300)

## Distance

In [None]:
# Histogram of distance covered by every runner, saved as a PNG.
out_file_path = os.path.join(out_dir, "output-distance-distribution.png")

fig, ax = plt.subplots()
# histplot draws onto (and returns) the axes passed in, so no reassignment is needed.
sns.histplot(data=df, x="distance_miles", ax=ax)
ax.set_title("Distance distribution", fontsize=18)
ax.set_xlabel("Distance (miles)", fontsize=16)
ax.set_ylabel("Count", fontsize=16)

fig.savefig(out_file_path, dpi=300)

## Age

In [None]:
# Histogram of runner ages in 5-year bins, saved to out_dir as a PNG.
fig, ax = plt.subplots()
# Pass ax explicitly (as the other plotting cells do) so the plot is guaranteed
# to land on the figure that is saved below, not on whatever axes is current.
ax = sns.histplot(df, x="age", binwidth=5, ax=ax)
ax.set_title("Age Distribution", fontsize=18)
ax.set_xlabel("Age (years)", fontsize=16)
ax.set_ylabel("Count", fontsize=16)

out_file_path = os.path.join(out_dir, "output-age-distribution.png")
fig.savefig(out_file_path, dpi=300)

## Location

In [None]:
# Two-letter country code -> full country name (codes looked up on https://countrycode.org/)
mapping = {
    "US" : "United States",
    "GB" : "Great Britain",
    "CA" : "Canada",
    "AU" : "Australia",
    "DE" : "Germany",
    "AL" : "Albania",
    "CO" : "Colombia",
    "FR" : "France",
    "IE" : "Ireland",
    "IL" : "Israel",
    "CN" : "China",
    "JP" : "Japan",
    "KE" : "Kenya",
    "PR" : "Puerto Rico",
    "KR" : "South Korea",
    "PT" : "Portugal",
    "SE" : "Sweden",
    "SG" : "Singapore",
    "VE" : "Venezuela"
}

# Count runners per country; dropna=False keeps runners without a country as their own row.
df_country = pd.DataFrame(df[["country"]].value_counts(dropna=False))

# value_counts() moves the country codes into the index, so they are translated
# from there rather than from a "country" column. Codes that are not in
# `mapping` (and missing countries) come out as NaN.
df_country["country_full_name"] = (
    df_country.index.get_level_values("country").map(mapping)
)
display(df_country)

In [None]:
# Runner counts per country (missing values included), shown as a one-column frame.
df["country"].value_counts(dropna=False).to_frame()

In [None]:
# Per-state runner counts for US entries; runners with no state keep their own row.
df_us = df.loc[df["country"] == "US"]
df_state = df_us[["state"]].value_counts(dropna=False).to_frame()

# NOTE(review): the denominator is the number of ALL runners, not only US
# runners, so the fractions do not sum to 1 unless everyone is from the US.
# Confirm this is the intended share.
total = len(df)
df_state["fraction"] = df_state["count"].div(total)
display(df_state)

## Gender

In [None]:
# Runner counts per gender (missing values kept as their own row), plus each
# group's share of all runners.
df_gender = df[["gender"]].value_counts(dropna=False).to_frame()
total = len(df)
df_gender["fraction"] = df_gender["count"].div(total)
display(df_gender)