# The big dataset of ultra-marathon running
# Exploratory Data Analysis
https://www.kaggle.com/datasets/aiaiaidavid/the-big-dataset-of-ultra-marathon-running

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the complete race-results table from the CSV export.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
csv_path = "TWO_CENTURIES_OF_UM_RACES.csv"
data = pd.read_csv(csv_path, sep=",", low_memory=False)
rows_and_columns = data.shape
print(rows_and_columns)
data.head()

## Rename Columns

In [None]:
# Normalise every column name to snake_case (lowercase, spaces -> underscores)
# and give the distance column the shorter name "race_length".
snake_case_names = (name.lower().replace(' ', '_') for name in data.columns)
data.columns = [
    "race_length" if name == "event_distance/length" else name
    for name in snake_case_names
]
data.head()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes
data.info()

This dataset is enormous. My intention is to show my skills in this portfolio, so let's select a **subset of data**:
- USA
- 50 km or 50 mi
- 2020

I'm going to use two different pandas techniques to do the same job:
- == operator
- isin()

In [None]:
# SKILLS
# == operator: one boolean mask per distance, combined with OR
is_50km = data["race_length"] == "50km"
is_50mi = data["race_length"] == "50mi"
data[is_50km | is_50mi]

In [None]:
# SKILLS
# isin(): a single membership test replaces the chained comparisons
wanted_distances = ["50km", "50mi"]
data.loc[data["race_length"].isin(wanted_distances)]

In [None]:
# Narrow the selection further: 50km/50mi races held in 2020
is_wanted_distance = data["race_length"].isin(["50km", "50mi"])
is_2020 = data["year_of_event"] == 2020
data[is_wanted_distance & is_2020]

There is no column with the country, but we do have the "Event name" column, and all event names follow the same pattern. Let's work with it.

In [None]:
# Extract the country from the event name with a regex.
# The pattern is anchored to the end of the string so that the LAST "(...)"
# group is captured; a name with an earlier parenthesised part no longer
# yields that part instead of the country. Names without a trailing "(...)"
# become NaN.
# NOTE(review): assumes the country code is the trailing "(...)" group - confirm
# against the printed unique values.
# expand=False returns a Series (one capture group) rather than a DataFrame.
data["event_country"] = data["event_name"].str.extract(r"\(([^()]*)\)\s*$", expand=False)
print(data["event_country"].unique())
data.head()

In [None]:
# use a plain Python helper function applied to every row
def extract_country(event):
    """Return the text inside the last pair of parentheses in ``event``.

    The last "(...)" group is used so that an earlier parenthesised part of
    the name is not returned by mistake.

    Args:
        event: The event name, e.g. ``"Some Race (USA)"``.

    Returns:
        The text between the last "(" and the last ")" that follows it, or
        ``None`` when ``event`` is not a string (e.g. NaN) or contains no
        such pair.
    """
    if not isinstance(event, str):
        return None
    end_index = event.rfind(')')
    if end_index == -1:
        return None
    # Only look for the opening parenthesis to the left of the closing one.
    start_index = event.rfind('(', 0, end_index)
    if start_index == -1:
        return None
    return event[start_index + 1:end_index]

# Run the helper on every event name and list the distinct results
extracted_countries = data["event_name"].apply(extract_country)
data["event_country"] = extracted_countries
print(data["event_country"].unique())
data.head()

The extraction does not work perfectly for every event name, but it is good enough to select the events held in the USA.

In [None]:
# Now, let's obtain the next subset: 2020, USA, 50km/50mi.
# The distance column was renamed to "race_length" in the rename step, so the
# original "event_distance/length" label no longer exists and must not be used.
is_wanted_distance = data["race_length"].isin(["50km", "50mi"])
is_2020 = data["year_of_event"] == 2020
is_usa = data["event_country"] == "USA"
data_usa = data[is_wanted_distance & is_2020 & is_usa]
# Every remaining row is a USA event, so the country column carries no information.
data_usa = data_usa.drop("event_country", axis=1)
print(data_usa.shape)
data_usa.head()

In [None]:
# Remove the "(USA)" tag from the event name.
# Only the country tag (and the whitespace before it) is removed, so other
# parenthesised parts of a name are kept and no trailing space is left behind.
data_usa["event_name"] = (
    data_usa["event_name"]
    .str.replace(r"\s*\(USA\)", "", regex=True)
    .str.strip()
)
data_usa.head()

Now let's look at the column "Athlete age category". Let's clean it up.

In [None]:
# Add the athlete's age in the year of the event.
# The event year is read from the data instead of hard-coding 2020, so the
# calculation stays correct if the year filter is ever changed.
data_usa["athlete_age"] = data_usa["year_of_event"] - data_usa["athlete_year_of_birth"]
data_usa.head()

In [None]:
# Remove the " h" unit marker from the performance strings.
# str.strip(" h") removes any run of spaces and "h" characters at BOTH ends.
raw_performance = data_usa["athlete_performance"]
data_usa["athlete_performance"] = raw_performance.str.strip(" h")
data_usa.head()

I consider the following columns relatively useless:
- "Athlete club"
- "Athlete country"
- "Athlete age category"
- "Athlete year of birth"

That is why I will drop them.

In [None]:
# Drop the columns that are not needed for the rest of the analysis
unused_columns = [
    "athlete_club",
    "athlete_country",
    "athlete_age_category",
    "athlete_year_of_birth",
]
data_usa = data_usa.drop(columns=unused_columns)
data_usa.head()

## Clean up null values

In [None]:
# Count the missing values in every column
missing_mask = data_usa.isna()
missing_mask.sum()

In [None]:
# Discard the rows whose athlete age could not be computed
data_usa = data_usa.dropna(subset=["athlete_age"])

In [None]:
# Re-count the missing values per column to confirm the rows were removed
remaining_missing = data_usa.isna().sum()
remaining_missing

## Check duplicates

In [None]:
# Show every row that is an exact repeat of an earlier row.
# duplicated() already returns a boolean mask, so it can index the frame directly.
duplicate_rows = data_usa.duplicated()
data_usa[duplicate_rows]

There are no duplicates in the dataset, so let's reset the index.

## Reset index
Because I selected a subset and then deleted rows, the index is no longer consecutive.

Therefore, I am going to fix it.

In [None]:
# reset_index returns a NEW frame, so the result must be assigned back;
# drop=True discards the old index instead of keeping it as a column.
data_usa = data_usa.reset_index(drop=True)
data_usa

## Fix types

In [None]:
# Inspect the dtype of every column before fixing the types
data_usa.dtypes

In [None]:
# Ages are whole numbers, so store the column as integers
age_as_int = data_usa["athlete_age"].astype(int)
data_usa["athlete_age"] = age_as_int

In [None]:
# The average speed is numeric data, so convert it from text to float
speed_as_float = data_usa["athlete_average_speed"].astype(float)
data_usa["athlete_average_speed"] = speed_as_float

In [None]:
# Convert the athlete performance from a time string to a float number of hours.
# Step 1: parse the "H:MM:SS" strings into timedeltas.
performance_delta = pd.to_timedelta(data_usa['athlete_performance'])

# Step 2: dividing by a one-hour timedelta yields the duration in hours.
data_usa['athlete_performance(hours)'] = performance_delta / pd.Timedelta(hours=1)

# Step 3: the original text column is no longer needed.
data_usa = data_usa.drop(columns=["athlete_performance"])

In [None]:
# Check the column dtypes again after the conversions
data_usa.dtypes

Well done.

## Graphs and Charts

### 1. Histograms

In [None]:
# Number of results per race length
race_length_axes = sns.histplot(data_usa["race_length"])
race_length_axes.set_title("Race length")
plt.show()

In [None]:
# Number of results per race length, split by athlete gender
gender_axes = sns.histplot(data_usa, x="race_length", hue="athlete_gender")
gender_axes.set_title("Race length by gender")
plt.show()

It appears that 50km races are distributed equally between both sexes, but 50 mile races are predominantly chosen by men.

In [None]:
# Distribution of the average speed, restricted to the 50km races
is_50km_race = data_usa["race_length"] == "50km"
speeds_50km = data_usa.loc[is_50km_race, "athlete_average_speed"]
sns.displot(speeds_50km)
plt.title("Average athlete speed per 50km races")
plt.show()

An observation: an average speed of 9 km/h looks faster than most runners in this plot. But speeds are not directly comparable, because different 50 km races do not have the same difficulty. Running in the mountains is not the same as a street race.

### 2. Violin plots

In [None]:
# Average speed per race length, with one violin per gender
sns.violinplot(
    data=data_usa,
    x="race_length",
    y="athlete_average_speed",
    hue="athlete_gender",
);

In [None]:
# Better version of the same graph: both genders share one split violin,
# with the quartiles drawn inside it.
split_violin_options = {
    "x": "race_length",
    "y": "athlete_average_speed",
    "hue": "athlete_gender",
    "split": True,
    "inner": "quart",
    "linewidth": 1,
}
sns.violinplot(data=data_usa, **split_violin_options);

### 3. Linear Model plot

In [None]:
# Linear fit of average speed against age, one regression line per gender
sns.lmplot(
    data=data_usa,
    x="athlete_age",
    y="athlete_average_speed",
    hue="athlete_gender",
);

## Questions from the data

Differences in speed between male and female athletes in the 50 km and 50 mi races

In [None]:
# Mean average speed for every race length / gender combination.
# The grouping key must be "race_length" (the column's actual name).
mean_speed = data_usa.groupby(["race_length", "athlete_gender"])["athlete_average_speed"].mean()
print(mean_speed)

# Same comparison as a split violin plot with quartile lines
sns.violinplot(data= data_usa, x="race_length", y="athlete_average_speed", hue="athlete_gender", split=True, inner="quart", linewidth=1);

Which age groups are fastest in the 50 km race? And in the 50 mile race?

In [None]:
# Mean speed and number of results per age in the 50km races, fastest first.
# agg() returns a DataFrame, so sort_values needs an explicit column to sort by.
# The "count" column shows how many results each mean is based on.
(
    data_usa.query("race_length == '50km'")
    .groupby("athlete_age")["athlete_average_speed"]
    .agg(["mean", "count"])
    .sort_values("mean", ascending=False)
)