# Final analysis

### Importing libraries

In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
from scipy import spatial
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import GradientBoostingRegressor
from tensorflow.keras.optimizers import RMSprop, Adam
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

import warnings
warnings.filterwarnings('ignore') # ignore warnings

2023-01-16 18:45:08.558614: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Load the cleaned data.

In [3]:
df = pd.read_csv('df_cleaned.csv')

## Correlations
We expect altitude and distance to be highly correlated with the moving time as these two features are used in most estimation formulas in use.

In [4]:
df.corr()

Unnamed: 0,length_3d,max_elevation,uphill,moving_time,max_speed,min_elevation,downhill,length_2d,avg_speed,difficulty_num
length_3d,1.0,0.162568,0.264588,0.852642,0.075354,-0.109997,0.273401,1.0,0.130308,0.086269
max_elevation,0.162568,1.0,0.320926,0.361493,-0.076644,0.809108,0.220951,0.162568,-0.382237,0.487529
uphill,0.264588,0.320926,1.0,0.330333,-0.027443,0.047296,0.854022,0.264588,-0.104035,0.25548
moving_time,0.852642,0.361493,0.330333,1.0,-0.090105,0.031861,0.30067,0.852642,-0.349202,0.177689
max_speed,0.075354,-0.076644,-0.027443,-0.090105,1.0,-0.082146,-0.01404,0.075354,0.329579,0.008756
min_elevation,-0.109997,0.809108,0.047296,0.031861,-0.082146,1.0,0.04116,-0.109997,-0.259089,0.271046
downhill,0.273401,0.220951,0.854022,0.30067,-0.01404,0.04116,1.0,0.273401,-0.035203,0.172832
length_2d,1.0,0.162568,0.264588,0.852642,0.075354,-0.109997,0.273401,1.0,0.130308,0.086269
avg_speed,0.130308,-0.382237,-0.104035,-0.349202,0.329579,-0.259089,-0.035203,0.130308,1.0,-0.182321
difficulty_num,0.086269,0.487529,0.25548,0.177689,0.008756,0.271046,0.172832,0.086269,-0.182321,1.0


As expected, changes in altitude and the distance have the highest correlations with the moving time. Max elevation also shows low correlation as the terrain in higher altitudes can be more challenging than in lower altitudes. Interestingly the difficulty score doesn't seem to correlate as much with the moving time. This might be due to several reasons: The difficulty score of a whole tour is based on the most difficult section, it is set by users and thus varies due to subjectivity, a difficult track may be exposed and only for experienced hikers, but it is not automatically terrain which slows one down.

## Recommendation system

In [5]:
df.head()

Unnamed: 0,_id,length_3d,user,start_time,max_elevation,bounds,uphill,moving_time,end_time,max_speed,gpx,difficulty,min_elevation,url,downhill,name,length_2d,avg_speed,difficulty_num,country
0,5afb229e8f80884aaad9c6ea,10832.953016,Bergfritz,2018-05-11 07:37:40,1934.47,"{'min': {'type': 'Point', 'coordinates': [13.2...",612.88,12155.0,2018-05-11 11:38:23,1.595493,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T2 - Mountain hike,1322.96,http://www.hikr.org/tour/post131855.html,609.67,"Remsteinkopf, 1945 m",10832.953016,0.891234,2,Österreich
1,5afb229e8f80884aaad9c6eb,12259.376315,Bergfritz,2018-05-12 07:25:08,2186.21,"{'min': {'type': 'Point', 'coordinates': [13.1...",614.753,13876.0,2018-05-12 12:08:28,1.39432,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T3 - Difficult Mountain hike,1266.4,http://www.hikr.org/tour/post131856.html,1193.733,"Schuhflicker, 2214 m",12259.376315,0.883495,3,Österreich
2,5afb229e8f80884aaad9c6ee,19581.273819,rkroebl,2018-05-11 05:44:58,697.57,"{'min': {'type': 'Point', 'coordinates': [8.61...",310.662,18197.0,2018-05-11 12:54:25,1.542405,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T2 - Mountain hike,438.5,http://www.hikr.org/tour/post131845.html,305.372,Waldstätterweg: Buochs - Beckenried und Gersau...,19581.273819,1.076072,2,Switzerland
3,5afb229e8f80884aaad9c6ef,8927.813277,siso,2018-05-12 04:28:16,2613.96,"{'min': {'type': 'Point', 'coordinates': [8.83...",922.87,10905.0,2018-05-12 13:46:34,3.859908,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T1 - Valley hike,1685.33,http://www.hikr.org/tour/post131818.html,927.19,Pizzo d’Era (2618 m) – Skitour,8927.813277,0.81869,1,Switzerland
4,5afb229e8f80884aaad9c6f0,8925.37885,ivanbutti,2018-05-12 05:08:25,1666.58,"{'min': {'type': 'Point', 'coordinates': [9.44...",1032.625,14660.0,2018-05-12 10:04:34,4.073263,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T3+ - Difficult Mountain hike,682.73,http://www.hikr.org/tour/post131816.html,1009.965,Mattinata sul Due Mani,8925.37885,0.608825,3,Italia


In [6]:
def find_array(user):
    user_array = []
    # first number the main of length_3d of all the hikes of the user
    user_array.append(df[df['user'] == user]['length_3d'].mean())
    # second number the main of max_elevation of all the hikes of the user
    user_array.append(df[df['user'] == user]['max_elevation'].mean())
    # third number the main of min_elevation of all the hikes of the user
    user_array.append(df[df['user'] == user]['min_elevation'].mean())
    # fourth number the main of moving_time of all the hikes of the user
    user_array.append(df[df['user'] == user]['moving_time'].mean())
    # fifth number the main of difficulty_num of all the hikes of the user
    user_array.append(df[df['user'] == user]['difficulty_num'].mean())

    return user_array

Here is a example of the reccomenation system in action. For the user (for the default case 'siso') is calculated the similarity between the user and all other users. The similarity is calculated by the cosine similarity of the user's and the other user's ratings. The ratings are the difficulty scores of the tours the user has completed. The similarity is then used to calculate the weighted average of the difficulty scores of the tours the other users have completed. The weighted average is weighted by the similarity of the other user to the user. The weighted average is then sorted and the top 1 tour is recommended to the user.

In [7]:
#input_user = input('Enter your name: ')
input_user = 'siso'

users = df['user'].unique()

input_array = find_array(input_user)

distances = []

if input_user in users:
    for user in users:
        # if the user and the imput user have not the same country we don't need to compare them (we suppose that the country is the same for all the hikes of the user)
        if user == input_user or df[df['user'] == user]['country'].values[0] != df[df['user'] == input_user]['country'].values[0]:
            continue
        else:
            user_array = find_array(user)
            distance_user = 1 - spatial.distance.cosine(user_array, input_array)
            obj = [distance_user, user]
            distances.append(obj)

distances.sort(key=lambda x: x[0])
distances.reverse()

best_match_user = distances[0][1]
print('The best match user is: ', best_match_user)

The best match user is:  schmidi87


In [8]:
# find the best track from the best match user
def find_best_track(input_user, best_match_user, df):
    df_input_user = df[df['user'] == input_user]
    df_best_match_user = df[df['user'] == best_match_user]
    
    # filter the colums that I need, just length_3d, max_elevation, min_elevation, moving_time, difficulty_num
    df_input_user = df_input_user[['length_3d', 'max_elevation', 'min_elevation', 'moving_time', 'difficulty_num']]
    df_best_match_user = df_best_match_user[['length_3d', 'max_elevation', 'min_elevation', 'moving_time', 'difficulty_num']]

    # calculate the score for each track based on the distance of the corresponding columns of the input_array
    df_best_match_user['score'] = 0
    for index, row in df_best_match_user.iterrows():
        score = 0
        for i in range(len(input_array)):
            score += 1 - spatial.distance.cosine([input_array[i]], [row[i]])
        df_best_match_user.loc[index, 'score'] = score
    
    # return the index of the track with the highest score
    return df_best_match_user['score'].idxmax()

In [9]:
reccomended_path_index = find_best_track(input_user, best_match_user, df)
print('The best track for you is:')
df.iloc[reccomended_path_index]

The best track for you is:


_id                                        5afb255c8f80884aaad9ec6b
length_3d                                              18285.970975
user                                                      schmidi87
start_time                                      2015-05-30 08:36:00
max_elevation                                                   NaN
bounds            {'min': {'type': 'Point', 'coordinates': [7.61...
uphill                                                          0.0
moving_time                                                 20173.0
end_time                                        2015-05-30 16:04:46
max_speed                                                  1.615608
gpx               <?xml version="1.0" encoding="UTF-8"?>\n<gpx x...
difficulty                             T4+ - High-level Alpine hike
min_elevation                                                   NaN
url                         http://www.hikr.org/tour/post94872.html
downhill                                        

In this section we will try to build a regression model to find out the realtionship between the difficulty score and the other features. We will use the linear regression model from sklearn.

## Building the models

Let's build a model to predict the moving time based on the features we have. As that is a baseline example, we will use a simple linear regression model. The field on wich we wanto to base our prediction are (`length_3d`, `uphill`, `downhill` and `max_elevation`). We will use the `sklearn` and `Keras` libraries to build the model.

We will use lasso regression because it is a good baseline model for this type of data. For simple subsets of variables, it provides greater prediction accuracy than other regression models. The lasso procedure encourages simple, sparse models.

In [15]:
# Do some data cleaning
# adding the neccecery columns
df['avg_speed'] = df['length_3d']/df['moving_time']
df['difficulty_num'] = df['difficulty'].map(lambda x: int(x[1])).astype('int32')
# drop outliers
df.dropna()
df = df[df['avg_speed'] < 2.5]

def retain_values(df, column, min_quartile, max_quartile):
    q_min, q_max = df[column].quantile([min_quartile, max_quartile])
    print("Keeping values between {} and {} of column {}".format(q_min, q_max, column))
    return df[(df[column] > q_min) & (df[column] < q_max)]

# drop elevation outliers
df = retain_values(df, 'min_elevation', 0.01, 1)

Keeping values between 8.778500000000001 and 3625.74 of column min_elevation


In [16]:
y = df.reset_index()['moving_time']
x = df.reset_index()[['downhill', 'uphill', 'length_3d', 'max_elevation']]

# split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

lasso = Lasso()
lasso.fit(x_train, y_train)
print ('Lasso R2: ', r2_score(y_test, lasso.predict(x_test)))

Lasso R2:  0.6851939601255553


In [17]:
y_pred_lasso = lasso.predict(x_test)

MSE represents the residual error which is nothing but sum of squared difference between actual values and the predicted / estimated values divided by total number of records. R-Squared represents the fraction of variance captured by the regression model.

In [18]:
r2 = r2_score(y_test, y_pred_lasso)
mse = mean_squared_error(y_test, y_pred_lasso)

print('R2: ', r2)
print('MSE: ', mse)

R2:  0.6851939601255553
MSE:  8886157.550095418


Let's now calculate the so called residual. Gradient boosting Regression calculates the difference between the current prediction and the known correct target value. This difference is called the residual.

In [19]:
gbr = GradientBoostingRegressor()
gbr.fit(x_train, y_train)
y_pred_gbr = gbr.predict(x_test)

In [20]:
r2 = r2_score(y_test, y_pred_gbr)
mse = mean_squared_error(y_test, y_pred_gbr)

print('R2: ', r2)
print('MSE: ', mse)

R2:  0.8260670098957209
MSE:  4909676.935811803


Here we can see that the model is not very accurate. The R2 score is 0.5 which is not very good. We can try to improve the model by using a different model. We will use a neural network to see if we can improve the results.

In [21]:
model = Sequential()
model.add(Dense(12, input_shape=(4,)))
model.add(Dense(5, input_shape=(4,)))
model.add(Dense(1))
model.compile(optimizer=Adam(0.001), loss='mse')

2023-01-16 18:52:43.748068: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
hist = model.fit(x_train, y_train, epochs=50, batch_size=10, validation_split=0.15, 
          callbacks=[
            ModelCheckpoint(filepath='./keras-model.h5', save_best_only=True),
            EarlyStopping(patience=2),
            ReduceLROnPlateau()
          ],
          verbose=1
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


In [23]:
# load the weights from the already trained model
model.load_weights(filepath='./keras-model.h5')
y_pred_keras = model.predict(x_test)

r2 = r2_score(y_test, y_pred_keras)
mse = mean_squared_error(y_test, y_pred_keras)

print('R2: ', r2)
print('MSE: ', mse)

R2:  0.7085038611746751
MSE:  8228179.535181139


Ensemble results

We saw that the model created with the gradient boosting regressor was better than the one created with the lasso regression but still not very accurate. We will now try to improve the results by combining the results of the gbr model and the neural network model. We will use the mean of the two models as the final result.

In [24]:
combined = (y_pred_keras[:,0] + y_pred_gbr * 2) / 3.0
r2 = r2_score(y_test, combined)
mse = mean_squared_error(y_test, combined)

print('R2: ', r2)
print('MSE: ', mse)

R2:  0.8129701503133924
MSE:  5279367.294060537


In order to have a better view on the results we will insert them in a dataframe. And study the descriptive statistics of the results.

In [25]:
c = pd.DataFrame([combined, y_pred_keras[:,0], y_pred_lasso, y_pred_gbr, y_test]).transpose()
c.columns = ['combined', 'keras', 'lasso', 'tree', 'test']
c['diff_minutes'] = (c['test'] - c['combined']) / 60
c.describe()

Unnamed: 0,combined,keras,lasso,tree,test,diff_minutes
count,1101.0,1101.0,1101.0,1101.0,1101.0,1101.0
mean,14316.750231,14333.09543,14395.092828,14308.577632,14262.384196,-0.906101
std,4648.805957,4536.48498,4805.596916,4820.038045,5315.36134,38.301466
min,1423.933081,3229.308838,2432.130807,-1252.553748,1850.0,-201.624038
25%,10877.890705,10940.536133,10807.15198,10795.237911,10402.0,-22.4339
50%,14527.348327,14416.668945,14411.058603,14477.113318,14312.0,-0.329926
75%,17566.922851,17586.236328,17741.614495,17766.908691,18080.0,23.576289
max,24377.802248,34146.910156,39130.817434,25047.053957,29009.0,119.64565


From the results above we can see that the model are quite accurate. The R2 score is 0.81 for the combined model which is good. We can try to improve the results by using a different model.