## Importing Data using pandas

In [1]:
import pandas as pd

# Read the complete results table into memory; the bare `df` on the last
# line lets the notebook render a truncated preview of the frame.
csv_path = "NYC Marathon Results.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,Year,Race,Name,Gender,Age,State,Country,Overall,Finish Time,Finish
0,2024,NYC Marathon,Abdi Nageeye,M,35,,NLD,1,02:07:39,7659
1,2024,NYC Marathon,Evans Chebet,M,35,-0,KEN,2,02:07:45,7665
2,2024,NYC Marathon,Albert Korir,M,30,,KEN,3,02:08:00,7680
3,2024,NYC Marathon,Tamirat Tola,M,33,,ETH,4,02:08:12,7692
4,2024,NYC Marathon,Geoffrey Kamworor,M,31,-0,KEN,5,02:08:50,7730
...,...,...,...,...,...,...,...,...,...,...
1460281,1979,NYC Marathon,Alan Skriloff,M,31,NJ,USA,10473,7:11:30,25890
1460282,1979,NYC Marathon,Richard Traum,M,38,NY,USA,10474,7:21:31,26491
1460283,1979,NYC Marathon,Melissa Morrow,W,27,AZ,USA,10475,7:21:45,26505
1460284,1979,NYC Marathon,Linda Galbraith,W,32,NY,USA,10476,7:22:00,26520


## Filtering the data to 2020 - 2025 because the full dataset is too big (1,460,286 rows)
#### (this also allows a direct comparison with the hard-way calculation below)

In [3]:

# Keep only the rows whose Year lies in the inclusive range 2020..2025.
year_column = df["Year"]
in_window = (year_column >= 2020) & (year_column <= 2025)
nyc_marathon = df[in_window]
nyc_marathon

Unnamed: 0,Year,Race,Name,Gender,Age,State,Country,Overall,Finish Time,Finish
0,2024,NYC Marathon,Abdi Nageeye,M,35,,NLD,1,02:07:39,7659
1,2024,NYC Marathon,Evans Chebet,M,35,-0,KEN,2,02:07:45,7665
2,2024,NYC Marathon,Albert Korir,M,30,,KEN,3,02:08:00,7680
3,2024,NYC Marathon,Tamirat Tola,M,33,,ETH,4,02:08:12,7692
4,2024,NYC Marathon,Geoffrey Kamworor,M,31,-0,KEN,5,02:08:50,7730
...,...,...,...,...,...,...,...,...,...,...
1280717,2023,NYC Marathon,Rajni Singh,W,45,NJ,USA,35594,5:03:33,18213
1280718,2023,NYC Marathon,Bridget Schwartz,W,25,NY,USA,35595,5:03:34,18214
1280719,2023,NYC Marathon,Debora Pelosi,W,53,,ITA,35596,5:03:34,18214
1280720,2023,NYC Marathon,Briana Goewert,W,26,NY,USA,35597,5:03:34,18214


## Statistics using pandas

In [4]:
# Central-tendency measures of the Age column, computed with pandas built-ins.
age_column = nyc_marathon["Age"]
mean_age = age_column.mean()
median_age = age_column.median()
# mode() returns a Series of the most frequent value(s); entry 0 is used.
mode_age = age_column.mode()[0]

report_lines = [
    "Overall Age Statistics:",
    f"Mean Age: {mean_age:.1f} years",
    f"Median Age: {median_age:.0f} years",
    f"Mode Age: {mode_age:.0f} years",
]
print("\n".join(report_lines))

Overall Age Statistics:
Mean Age: 40.7 years
Median Age: 40 years
Mode Age: 30 years


## Calculating the statistics the hard way

In [5]:
import csv

# Re-compute the age statistics without pandas, using only the csv module
# and plain Python loops. Since the data is too big, only the results from
# 2020 - 2025 are used (the same window as the pandas calculation).
filename = "NYC Marathon Results.csv"
target_years = {"2020", "2021", "2022", "2023", "2024", "2025"}


def load_ages(path, years):
    """Return the ages (as ints) of all rows in `path` whose Year is in `years`.

    Args:
        path: Path of a CSV file that has "Year" and "Age" columns.
        years: Collection of year strings to keep (compared as text).

    Returns:
        A list of integer ages in file order. Rows whose Age is empty or not
        an integer, or that lack the Year/Age columns, are skipped.

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    collected = []
    # newline="" is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the csv parser rather than by open().
    with open(path, mode="r", encoding="utf-8", newline="") as file:
        for row in csv.DictReader(file):
            try:
                if row["Year"] in years:
                    collected.append(int(row["Age"]))
            except (ValueError, KeyError, TypeError):
                # Best effort: an unusable row is ignored, not fatal.
                continue
    return collected


def mean_of(values):
    """Return the arithmetic mean of a non-empty list, summing by hand."""
    if not values:
        raise ValueError("mean_of() needs at least one value")
    total_sum = 0
    for value in values:
        total_sum += value
    return total_sum / len(values)


def median_of(values):
    """Return the median of a non-empty list.

    For an odd number of values this is the middle element of the sorted
    list; for an even number it is the average of the two middle elements.
    """
    if not values:
        raise ValueError("median_of() needs at least one value")
    ordered = sorted(values)
    middle = len(ordered) // 2
    if len(ordered) % 2 == 1:  # odd
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2  # even


def mode_of(values):
    """Return `(mode, occurrences)` for a non-empty list.

    When several values share the highest count, the smallest one is
    returned; this matches taking element 0 of pandas' sorted `mode()`.
    """
    if not values:
        raise ValueError("mode_of() needs at least one value")
    tally = {}
    for value in values:
        tally[value] = tally.get(value, 0) + 1

    best_value = None
    best_count = 0
    for value, occurrences in tally.items():
        is_more_frequent = occurrences > best_count
        is_smaller_tie = occurrences == best_count and value < best_value
        if is_more_frequent or is_smaller_tie:
            best_value = value
            best_count = occurrences
    return best_value, best_count


ages = []
file_found = True

try:
    ages = load_ages(filename, target_years)
except FileNotFoundError:
    # Report the problem and skip the calculation. exit() is deliberately not
    # called here: inside a notebook it shuts the kernel down and throws away
    # every variable created by earlier cells.
    file_found = False
    print(f"Error: The file '{filename}' was not found.")
    print("Please make sure the file is in the same directory as the script,")
    print("or provide the full file path.")

# Make the years appear in sequence
sorted_years_list = sorted(target_years)
years_str = ", ".join(sorted_years_list)

# Calculate Statistics The Hard Way
if file_found and not ages:
    # The file was readable but no row survived the year/age filtering.
    print(f"No valid age data found for the years {years_str}.")
elif ages:
    print(f"Calculating statistics for {len(ages):,} runners from {years_str}")

    count = len(ages)
    mean_age = mean_of(ages)
    median_age = median_of(ages)
    mode_age, max_count = mode_of(ages)

    # Print the Results
    print("\nOverall Age Statistics")
    print(f"Mean Age: {mean_age:.1f} years")
    print(f"Median Age: {median_age} years")
    print(f"Mode Age: {mode_age} years (appeared {max_count} times)")

Calculating statistics for 179,509 runners from 2020, 2021, 2022, 2023, 2024, 2025

Overall Age Statistics
Mean Age: 40.7 years
Median Age: 40 years
Mode Age: 30 years (appeared 6044 times)


##### Note: The number of rows in the pandas DataFrame for 2020-2025 and the number of runners in the hard-way calculation are exactly the same. Therefore, we can assume that both calculations are using the same data.

## Simple Visualization

In [11]:
# Whole-year versions of the three statistics; each one becomes the length
# of its bar below.
rounded_mean, rounded_median, rounded_mode = (
    round(value) for value in (mean_age, median_age, mode_age)
)

print("\nAge Statistics Sparklines:")
print("Each ❤️ represents 1 year")

# One heart per year of age for every statistic.
heart = "❤️"
mean_bar, median_bar, mode_bar = (
    heart * years for years in (rounded_mean, rounded_median, rounded_mode)
)

sparkline_rows = (
    ("Mean", mean_bar, rounded_mean),
    ("Median", median_bar, rounded_median),
    ("Mode", mode_bar, rounded_mode),
)
for label, bar, years in sparkline_rows:
    # Labels are left-aligned to six characters so the bars line up.
    print(f"{label:<6} {bar} ({years} years)")


Age Statistics Sparklines:
Each ❤️ represents 1 year
Mean   ❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️ (41 years)
Median ❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️ (40 years)
Mode   ❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️ (30 years)


##### Using hearts to illustrate age is inspired by video games, which usually show a character's lives with heart symbols.