In [26]:
import pandas as pd

In [27]:
marathon15 = pd.read_csv('marathon_results_2015.csv')
marathon16 = pd.read_csv('marathon_results_2016.csv')
marathon17 = pd.read_csv('marathon_results_2017.csv')

In [28]:
marathon15['Year'] = 2015
marathon16['Year'] = 2016
marathon17['Year'] = 2017

# Combine the dataframes
df_combined = pd.concat([marathon15, marathon16, marathon17], ignore_index=True)

# Now df_combined is your combined dataframe with the 'Year' column
df_combined.head()

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division,Year,Unnamed: 8
0,0.0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,,,...,1:47:59,2:02:39,0:04:56,-,2:09:17,1,1,1,2015,
1,1.0,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,,,...,1:47:59,2:02:42,0:04:58,-,2:09:48,2,2,2,2015,
2,2.0,8,"Chebet, Wilson",29,M,Marakwet,,KEN,,,...,1:47:59,2:03:01,0:04:59,-,2:10:22,3,3,3,2015,
3,3.0,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,,,...,1:48:03,2:03:47,0:05:00,-,2:10:47,4,4,4,2015,
4,4.0,10,"Korir, Wesley",32,M,Kitale,,KEN,,,...,1:47:59,2:03:27,0:05:00,-,2:10:49,5,5,5,2015,


In [29]:
# Columns to be dropped
columns_to_drop = ['Unnamed: 0', 'State', 'Citizen', 'Unnamed: 9', 'Proj Time', 'Bib', 'Unnamed: 8']

# Drop the specified columns from the dataframe
df = df_combined.drop(columns=columns_to_drop, errors='ignore')  # errors='ignore' to avoid error if a column is not present

# Display the first 5 rows to confirm the columns are dropped
df

Unnamed: 0,Name,Age,M/F,City,Country,5K,10K,15K,20K,Half,25K,30K,35K,40K,Pace,Official Time,Overall,Gender,Division,Year
0,"Desisa, Lelisa",25,M,Ambo,ETH,0:14:43,0:29:43,0:44:57,1:00:29,1:04:02,1:16:07,1:32:00,1:47:59,2:02:39,0:04:56,2:09:17,1,1,1,2015
1,"Tsegay, Yemane Adhane",30,M,Addis Ababa,ETH,0:14:43,0:29:43,0:44:58,1:00:28,1:04:01,1:16:07,1:31:59,1:47:59,2:02:42,0:04:58,2:09:48,2,2,2,2015
2,"Chebet, Wilson",29,M,Marakwet,KEN,0:14:43,0:29:43,0:44:57,1:00:29,1:04:02,1:16:07,1:32:00,1:47:59,2:03:01,0:04:59,2:10:22,3,3,3,2015
3,"Kipyego, Bernard",28,M,Eldoret,KEN,0:14:43,0:29:44,0:45:01,1:00:29,1:04:02,1:16:07,1:32:00,1:48:03,2:03:47,0:05:00,2:10:47,4,4,4,2015
4,"Korir, Wesley",32,M,Kitale,KEN,0:14:43,0:29:44,0:44:58,1:00:28,1:04:01,1:16:07,1:32:00,1:47:59,2:03:27,0:05:00,2:10:49,5,5,5,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79633,"Steinbach, Paula Eyvonne",61,F,Ontario,USA,0:46:44,1:35:41,2:23:35,3:12:44,3:23:31,4:12:06,5:03:08,5:55:18,6:46:57,0:16:24,7:09:39,26407,11972,344,2017
79634,"Avelino, Andrew R.",25,M,Fayetteville,USA,0:32:03,1:05:33,1:52:17,2:49:41,3:00:26,3:50:19,4:50:01,5:53:48,6:54:21,0:16:40,7:16:59,26408,14436,4774,2017
79635,"Hantel, Johanna",57,F,Malvern,USA,0:53:11,1:43:36,2:32:36,-,3:36:24,4:15:21,5:06:37,6:00:33,6:54:38,0:16:47,7:19:37,26409,11973,698,2017
79636,"Reilly, Bill",64,M,New York,USA,0:40:34,1:27:19,2:17:17,3:11:40,3:22:30,4:06:10,5:07:09,6:06:07,6:56:08,0:16:49,7:20:44,26410,14437,1043,2017


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Assuming df is your cleaned dataframe and 'Official Time' and '10K' are the columns of interest.

# Convert '10K' and 'Official Time' to total seconds for the model.
def time_to_seconds(time_str):
    # Function to convert time string to seconds.
    try:
        h, m, s = time_str.split(':')
        return int(h) * 3600 + int(m) * 60 + int(s)
    except ValueError: # if conversion fails, return NaN
        return np.nan

df['10K_seconds'] = df['10K'].apply(time_to_seconds)
df['Official_Time_seconds'] = df['Official Time'].apply(time_to_seconds)

# Drop rows with NaN values that resulted from conversion errors
df = df.dropna(subset=['10K_seconds', 'Official_Time_seconds'])

# Define your feature and target variables
X = df[['10K_seconds']]  # Features
y = df['Official_Time_seconds']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Print the performance metric
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 1027124.8881184158


In [31]:
from sklearn.metrics import mean_absolute_error, r2_score

# Convert 'Age' and 'M/F' to numerical values for the model
df['Gender'] = df['M/F'].map({'M': 1, 'F': 0}) 

# Now define your features with 'Age' and 'Gender'
X = df[['10K_seconds', 'Age', 'Gender']]  # Features

# Continue with the same steps as before for splitting the data
# Initialize and train the multiple regression model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error, mean absolute error, and R-squared score
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the performance metrics
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared Score: {r2}')


Mean Squared Error: 1027124.8881184158
Mean Absolute Error: 738.036675796353
R-squared Score: 0.8357552274303383


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Gender'] = df['M/F'].map({'M': 1, 'F': 0})


In [32]:
# Convert 'Half' to total seconds for the model
# Instead of directly setting df['Half_seconds'], use the .loc method
df.loc[:, 'Half_seconds'] = df['Half'].apply(time_to_seconds)

# Drop rows with NaN values that resulted from conversion errors
# Drop rows with NaN values that resulted from conversion errors
df = df.dropna(subset=['10K_seconds', 'Half_seconds', 'Official_Time_seconds'])

# Define your feature and target variables
# Now including 'Half_seconds' as a predictor
X = df[['10K_seconds', 'Age', 'Gender', 'Half_seconds']]  # Features
y = df['Official_Time_seconds']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the multiple regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error, mean absolute error, and R-squared score
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the performance metrics
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared Score: {r2}')


Mean Squared Error: 521207.3630973528
Mean Absolute Error: 473.05386345166096
R-squared Score: 0.915532329936932


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'Half_seconds'] = df['Half'].apply(time_to_seconds)


In [33]:
# Assuming df is your cleaned dataframe and 'Half' is the column with the half marathon times.

# Convert 'Half' to total seconds for the model, using the same function as before.
df['Half_seconds'] = df['Half'].apply(time_to_seconds)

# Drop rows with NaN values that resulted from conversion errors
df = df.dropna(subset=['Half_seconds'])

# Now define your features with '10K_seconds', 'Age', 'Gender', and 'Half_seconds'
X = df[['10K_seconds', 'Age', 'Gender', 'Half_seconds']]  # Features

# Split the data into training and testing sets as before
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the multiple regression model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model using the previous metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the updated performance metrics
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared Score: {r2}')


Mean Squared Error: 521207.3630973528
Mean Absolute Error: 473.05386345166096
R-squared Score: 0.915532329936932


In [34]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Assuming you've already prepared your DataFrame `df` and selected features and target
X = df[['10K_seconds', 'Age', 'Gender', 'Half_seconds']]  # Features
y = df['Official_Time_seconds']  # Target

# Scaling features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize KNN with a specific number of neighbors, e.g., 5
knn = KNeighborsRegressor(n_neighbors=5)

# Train the model
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error for KNN: {mse}')


Mean Squared Error for KNN: 567183.7121066868


In [35]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Define the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)  # Output layer
])

# Compile the model
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32)

# Evaluate the model
model.evaluate(X_test, y_test)


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1590/1590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 139646528.0000 - val_loss: 5520304.0000
Epoch 2/100
[1m1590/1590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 3748120.2500 - val_loss: 1224089.7500
Epoch 3/100
[1m1590/1590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 999136.4375 - val_loss: 624632.0625
Epoch 4/100
[1m1590/1590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 594009.6875 - val_loss: 491761.0938
Epoch 5/100
[1m1590/1590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 511817.5938 - val_loss: 457470.9062
Epoch 6/100
[1m1590/1590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 497801.4688 - val_loss: 453306.1250
Epoch 7/100
[1m1590/1590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 475927.7500 - val_loss: 449975.8750
Epoch 8/100
[1m1590/1590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s

525067.4375