In [2]:
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
#print(os.listdir("../input"))

In [3]:
import pandas as pd
from matplotlib import pyplot

In [4]:
# Read the hikr.org GPX track export (path is relative to the working directory).
df = pd.read_csv('gpx-tracks-from-hikr.org.csv')
# Peek at the first two tracks to see which columns are available.
df.head(2)

Unnamed: 0,_id,length_3d,user,start_time,max_elevation,bounds,uphill,moving_time,end_time,max_speed,gpx,difficulty,min_elevation,url,downhill,name,length_2d
0,5afb229e8f80884aaad9c6ea,10832.953016,Bergfritz,2018-05-11 07:37:40,1934.47,"{'min': {'type': 'Point', 'coordinates': [13.2...",612.88,12155.0,2018-05-11 11:38:23,1.595493,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T2 - Mountain hike,1322.96,http://www.hikr.org/tour/post131855.html,609.67,"Remsteinkopf, 1945 m",10832.953016
1,5afb229e8f80884aaad9c6eb,12259.376315,Bergfritz,2018-05-12 07:25:08,2186.21,"{'min': {'type': 'Point', 'coordinates': [13.1...",614.753,13876.0,2018-05-12 12:08:28,1.39432,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T3 - Difficult Mountain hike,1266.4,http://www.hikr.org/tour/post131856.html,1193.733,"Schuhflicker, 2214 m",12259.376315


## Adding additional features
The difficulty rating can be changed to a numeric value for easier processing.
Many estimators and models won't work with text values. We can simply extract the second character of the rating (the digit in e.g. `T2`), which results in an ordinal encoding. For categorical data which cannot be transformed that easily you may want to look into some built-in helpers like http://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.CategoricalEncoder.html. Keras also has a util for one-hot encoding: https://keras.io/utils/#to_categorical

In [5]:
# Average speed over the time spent moving. Tracks with a moving time of 0
# produce inf/NaN here; they are filtered out in a later cell.
df['avg_speed'] = df['length_3d'].div(df['moving_time'])
# Ordinal difficulty: the second character of the rating string is its digit
# (e.g. "T2 - Mountain hike" -> 2).
df['difficulty_num'] = df['difficulty'].str[1].astype('int32')

## Removing Outliers

In [6]:
# Summary statistics of the numeric columns; the min/max rows are what expose
# the implausible values discussed below.
df.describe()

Unnamed: 0,length_3d,max_elevation,uphill,moving_time,max_speed,min_elevation,downhill,length_2d,avg_speed,difficulty_num
count,12141.0,10563.0,12141.0,12141.0,12141.0,10563.0,12141.0,12141.0,11613.0,12141.0
mean,18747.71,1934.281708,942.184362,12848.445268,1.746356,1003.33115,879.145539,18747.71,inf,2.867886
std,409309.8,784.968353,1065.498993,11599.792248,5.394065,813.001041,1028.618856,409309.8,,1.162571
min,0.0,-1.0,0.0,0.0,0.0,-32768.0,0.0,0.0,0.4465059,1.0
25%,8254.129,1382.275,420.142,5260.0,1.078841,560.02,256.519,8254.129,0.7513163,2.0
50%,12005.77,1986.7,882.0,12990.0,1.36702,960.09,823.199002,12005.77,0.8679202,3.0
75%,16458.13,2498.455848,1301.005,18514.0,1.604181,1389.485,1266.923,16458.13,1.225013,3.0
max,31891800.0,5633.462891,35398.006781,189380.0,192.768748,4180.0,52379.2,31891800.0,inf,6.0


### Suspicious speed values
Looking at the min and max values, it is apparent that there are some tracks which we want to exclude from our data set. An infinite average speed, or a min elevation of more than 30 km below sea level, just doesn't seem right. We can remove the extremes at both ends and remove all rows where there are null values.

In [7]:
# Drop rows with missing values in the columns the analysis relies on.
# `dropna` returns a new frame, so the result must be assigned back -- the
# previous bare `df.dropna()` call discarded it and removed nothing.
# The subset keeps a missing value in an unrelated text column (e.g. name, url)
# from throwing away an otherwise usable track.
df = df.dropna(subset=['avg_speed', 'min_elevation', 'max_elevation',
                       'moving_time', 'length_3d', 'uphill', 'downhill'])
# The comparison also removes the infinite average speeds seen in describe().
df = df[df['avg_speed'] < 2.5] # an avg of > 2.5m/s is probably not a hiking activity

### Min elevation
A min elevation of -32km doesn't seem right.

In [8]:
def retain_values(df, column, min_quartile, max_quartile):
    """Keep only the rows whose ``column`` value lies strictly between two quantiles.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Name of the column the quantiles are computed on.
        min_quartile: Lower quantile as a fraction in [0, 1].
        max_quartile: Upper quantile as a fraction in [0, 1].

    Returns:
        A new DataFrame with the rows where
        ``lower quantile < value < upper quantile``. Both bounds are exclusive,
        so rows equal to either quantile value -- and rows where the value is
        NaN -- are dropped.
    """
    values = df[column]
    lower, upper = values.quantile([min_quartile, max_quartile])
    print("Keeping values between {} and {} of column {}".format(lower, upper, column))
    inside = values.gt(lower) & values.lt(upper)
    return df[inside]

# Trim elevation outliers: keep rows strictly above the 1% quantile of
# min_elevation and strictly below its maximum (quantile 1).
df = retain_values(df, column='min_elevation', min_quartile=0.01, max_quartile=1)

Keeping values between 0.0 and 4180.0 of column min_elevation


## Correlations
We expect altitude and distance to be highly correlated with the moving time as these two features are used in most estimation formulas in use [citation needed].

In [9]:
# Correlate the numeric columns only. The frame still carries text columns
# (ids, user names, URLs, raw GPX, ...); pandas >= 2.0 no longer drops them
# silently in `DataFrame.corr` and raises instead. Selecting numeric dtypes
# first works on every pandas version.
df.select_dtypes(include='number').corr()

Unnamed: 0,length_3d,max_elevation,uphill,moving_time,max_speed,min_elevation,downhill,length_2d,avg_speed,difficulty_num
length_3d,1.0,0.173294,0.591355,0.915491,0.213945,-0.128962,0.540192,1.0,0.242597,0.035156
max_elevation,0.173294,1.0,0.350041,0.293025,-0.034976,0.783448,0.282179,0.173294,-0.30973,0.460304
uphill,0.591355,0.350041,1.0,0.616879,0.067408,-0.003322,0.888202,0.591355,-0.013105,0.23052
moving_time,0.915491,0.293025,0.616879,1.0,0.073278,-0.045245,0.557498,0.915491,-0.09371,0.095606
max_speed,0.213945,-0.034976,0.067408,0.073278,1.0,-0.095708,0.072022,0.213945,0.360828,-0.004895
min_elevation,-0.128962,0.783448,-0.003322,-0.045245,-0.095708,1.0,-0.004718,-0.128962,-0.250249,0.23804
downhill,0.540192,0.282179,0.888202,0.557498,0.072022,-0.004718,1.0,0.540192,0.02772,0.183401
length_2d,1.0,0.173294,0.591355,0.915491,0.213945,-0.128962,0.540192,1.0,0.242597,0.035156
avg_speed,0.242597,-0.30973,-0.013105,-0.09371,0.360828,-0.250249,0.02772,0.242597,1.0,-0.15202
difficulty_num,0.035156,0.460304,0.23052,0.095606,-0.004895,0.23804,0.183401,0.035156,-0.15202,1.0


As expected, changes in altitude and the distance have the highest correlations with the moving time. Max elevation also shows a low but positive correlation, as the terrain at higher altitudes can be more challenging than at lower altitudes. Interestingly, the difficulty score doesn't seem to correlate as much with the moving time. This might be due to several reasons: the difficulty score of a whole tour is based on its most difficult section; it is set by users and thus varies due to subjectivity; and a difficult track may be exposed and only suitable for experienced hikers without automatically being terrain which slows one down.

## Building the models

### A strong baseline
Before putting too much time into a sophisticated model it is important to develop a simple baseline which serves as an anchor point to benchmark any other model against. For many problems these simple baselines are already hard to beat, and they allow us to identify approaches which can be discarded early. Given the nature of the problem, we will use a linear regression model to predict the moving time based on the most correlated fields (`length_3d`, `uphill`, `downhill` and `max_elevation`).

In [10]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

# Target and features, each given a fresh 0..n-1 index so rows stay aligned.
y = df['moving_time'].reset_index(drop=True)
x = df[['downhill', 'uphill', 'length_3d', 'max_elevation']].reset_index(drop=True)

# Hold out 20% of the tracks for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Linear baseline with L1 regularisation at scikit-learn's default settings.
lasso = Lasso().fit(x_train, y_train)
print("Coefficients: {}".format(lasso.coef_))

Coefficients: [0.0751353  0.6106501  0.90925421 1.66314048]


In [11]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Predict the moving time of the held-out test rows with the fitted Lasso baseline.
y_pred_lasso = lasso.predict(x_test)

In [12]:
# Score the Lasso baseline on the held-out split.
r2 = r2_score(y_true=y_test, y_pred=y_pred_lasso)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_lasso)

print("r2:\t{}\nMSE: \t{}".format(r2, mse))

r2:	0.8222421988676842
MSE: 	16144395.218590213


### GradientBoostingRegressor

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# Seed the estimator so repeated runs give the same trees and the same scores;
# this matches the seeded train/test split above. Without a random_state the
# feature permutation at each split varies from run to run.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(x_train, y_train)
# Predictions for the held-out rows, reused later in the ensemble.
y_pred_gbr = gbr.predict(x_test)

In [14]:
# Score the gradient-boosted trees on the held-out split.
r2 = r2_score(y_true=y_test, y_pred=y_pred_gbr)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_gbr)

print("r2:\t{}\nMSE: \t{}".format(r2, mse))

r2:	0.8754732770443745
MSE: 	11309819.41644879


### Regression with Keras

In [19]:
from keras import Sequential
#from keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop, Adam
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Best-effort reproducibility: seed TensorFlow's global RNG before any layer
# is created so weight initialisation does not change on every run.
tf.random.set_seed(42)

# Small fully connected regressor: 4 input features -> 12 -> 5 -> 1 output.
# Only the first layer declares the input shape; the later layers infer theirs
# from the layer before, so the redundant `input_shape` on the second layer
# was removed.
# NOTE(review): no layer has an activation, so the stack collapses to a single
# linear map and behaves like plain linear regression. Consider
# activation='relu' on the hidden layers if a non-linear model is intended.
model = Sequential()
model.add(Dense(12, input_shape=(4,)))
model.add(Dense(5))
model.add(Dense(1))
model.compile(optimizer=Adam(0.001), loss='mse')

2022-12-10 15:04:14.600090: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
# Train with 15% of the training rows held back for validation.
hist = model.fit(
    x_train, y_train,
    epochs=50, batch_size=10, validation_split=0.15,
    callbacks=[
        # Write the model to disk whenever the monitored metric improves, so
        # the best epoch can be restored after training.
        ModelCheckpoint(filepath='./keras-model.h5', save_best_only=True),
        # Stop once there has been no improvement for two epochs in a row.
        EarlyStopping(patience=2),
        # Lower the learning rate after one stagnant epoch. The default
        # patience (10) is larger than the early-stopping patience, so with
        # the defaults this callback could never trigger before training ended.
        ReduceLROnPlateau(patience=1),
    ],
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


In [21]:
# Restore the checkpoint written during training before scoring.
model.load_weights(filepath='./keras-model.h5')
y_pred_keras = model.predict(x_test)

# Score the Keras network on the held-out split.
r2 = r2_score(y_true=y_test, y_pred=y_pred_keras)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_keras)

print("r2:\t{}\nMSE: \t{}".format(r2, mse))

r2:	0.8252966344692565
MSE: 	15866983.958958302


## Ensemble results

In [22]:
import numpy as np

# Weighted blend of the two models: the boosted trees count twice as much as
# the Keras network. `ravel()` flattens the (n, 1) network output to length n.
combined = (y_pred_keras.ravel() + 2 * y_pred_gbr) / 3.0

# Score the blended prediction on the held-out split.
r2 = r2_score(y_true=y_test, y_pred=combined)
mse = mean_squared_error(y_true=y_test, y_pred=combined)

print("r2:\t{}\nMSE: \t{}".format(r2, mse))

r2:	0.8761505005122296
MSE: 	11248312.33632791


In [23]:
# Put every model's predictions next to the true values, one column per source.
# column_stack pairs the arrays by position, so the shuffled index of y_test
# plays no role.
c = pd.DataFrame(
    np.column_stack([combined, y_pred_keras[:, 0], y_pred_lasso, y_pred_gbr, y_test]),
    columns=['combined', 'keras', 'lasso', 'tree', 'test'],
)
# Error of the blended prediction, converted from seconds to minutes.
c['diff_minutes'] = (c['test'] - c['combined']) / 60
c.describe()

Unnamed: 0,combined,keras,lasso,tree,test,diff_minutes
count,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0
mean,16192.306574,16032.737937,16310.101931,16272.090893,16189.69085,-0.043595
std,9264.011649,9043.230425,9265.306021,9497.151138,9532.661133,55.912627
min,1290.15332,850.055786,404.868331,1214.176635,361.0,-466.404725
25%,11371.253909,11224.562012,11381.652809,11498.387112,10840.0,-21.549979
50%,15203.74898,14688.055664,14896.290843,15428.656594,15250.0,3.997588
75%,19438.544099,18831.654297,19349.833452,19724.331516,20176.0,27.474544
max,165380.884215,148323.078125,151458.190823,176593.677885,181590.0,407.698357
