# Final analysis

### Importing libraries

In [1]:
import pandas as pd
from scipy import spatial
import warnings
warnings.filterwarnings('ignore') # ignore warnings

Load the cleaned data.

In [2]:
df = pd.read_csv('df_cleaned.csv')

## Correlations
We expect altitude and distance to be highly correlated with the moving time as these two features are used in most estimation formulas in use.

In [3]:
df.corr()

Unnamed: 0,length_3d,max_elevation,uphill,moving_time,max_speed,min_elevation,downhill,length_2d,avg_speed,difficulty_num
length_3d,1.0,0.162568,0.264588,0.852642,0.075354,-0.109997,0.273401,1.0,0.130308,0.086269
max_elevation,0.162568,1.0,0.320926,0.361493,-0.076644,0.809108,0.220951,0.162568,-0.382237,0.487529
uphill,0.264588,0.320926,1.0,0.330333,-0.027443,0.047296,0.854022,0.264588,-0.104035,0.25548
moving_time,0.852642,0.361493,0.330333,1.0,-0.090105,0.031861,0.30067,0.852642,-0.349202,0.177689
max_speed,0.075354,-0.076644,-0.027443,-0.090105,1.0,-0.082146,-0.01404,0.075354,0.329579,0.008756
min_elevation,-0.109997,0.809108,0.047296,0.031861,-0.082146,1.0,0.04116,-0.109997,-0.259089,0.271046
downhill,0.273401,0.220951,0.854022,0.30067,-0.01404,0.04116,1.0,0.273401,-0.035203,0.172832
length_2d,1.0,0.162568,0.264588,0.852642,0.075354,-0.109997,0.273401,1.0,0.130308,0.086269
avg_speed,0.130308,-0.382237,-0.104035,-0.349202,0.329579,-0.259089,-0.035203,0.130308,1.0,-0.182321
difficulty_num,0.086269,0.487529,0.25548,0.177689,0.008756,0.271046,0.172832,0.086269,-0.182321,1.0


As expected, changes in altitude and the distance have the highest correlations with the moving time. Max elevation also shows low correlation as the terrain in higher altitudes can be more challenging than in lower altitudes. Interestingly the difficulty score doesn't seem to correlate as much with the moving time. This might be due to several reasons: The difficulty score of a whole tour is based on the most difficult section, it is set by users and thus varies due to subjectivity, a difficult track may be exposed and only for experienced hikers, but it is not automatically terrain which slows one down.

## Recommendation system

In [4]:
df.head()

Unnamed: 0,_id,length_3d,user,start_time,max_elevation,bounds,uphill,moving_time,end_time,max_speed,gpx,difficulty,min_elevation,url,downhill,name,length_2d,avg_speed,difficulty_num,country
0,5afb229e8f80884aaad9c6ea,10832.953016,Bergfritz,2018-05-11 07:37:40,1934.47,"{'min': {'type': 'Point', 'coordinates': [13.2...",612.88,12155.0,2018-05-11 11:38:23,1.595493,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T2 - Mountain hike,1322.96,http://www.hikr.org/tour/post131855.html,609.67,"Remsteinkopf, 1945 m",10832.953016,0.891234,2,Österreich
1,5afb229e8f80884aaad9c6eb,12259.376315,Bergfritz,2018-05-12 07:25:08,2186.21,"{'min': {'type': 'Point', 'coordinates': [13.1...",614.753,13876.0,2018-05-12 12:08:28,1.39432,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T3 - Difficult Mountain hike,1266.4,http://www.hikr.org/tour/post131856.html,1193.733,"Schuhflicker, 2214 m",12259.376315,0.883495,3,Österreich
2,5afb229e8f80884aaad9c6ee,19581.273819,rkroebl,2018-05-11 05:44:58,697.57,"{'min': {'type': 'Point', 'coordinates': [8.61...",310.662,18197.0,2018-05-11 12:54:25,1.542405,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T2 - Mountain hike,438.5,http://www.hikr.org/tour/post131845.html,305.372,Waldstätterweg: Buochs - Beckenried und Gersau...,19581.273819,1.076072,2,Switzerland
3,5afb229e8f80884aaad9c6ef,8927.813277,siso,2018-05-12 04:28:16,2613.96,"{'min': {'type': 'Point', 'coordinates': [8.83...",922.87,10905.0,2018-05-12 13:46:34,3.859908,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T1 - Valley hike,1685.33,http://www.hikr.org/tour/post131818.html,927.19,Pizzo d’Era (2618 m) – Skitour,8927.813277,0.81869,1,Switzerland
4,5afb229e8f80884aaad9c6f0,8925.37885,ivanbutti,2018-05-12 05:08:25,1666.58,"{'min': {'type': 'Point', 'coordinates': [9.44...",1032.625,14660.0,2018-05-12 10:04:34,4.073263,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",T3+ - Difficult Mountain hike,682.73,http://www.hikr.org/tour/post131816.html,1009.965,Mattinata sul Due Mani,8925.37885,0.608825,3,Italia


Do the clustering

In [5]:
def find_array(user):
    user_array = []
    # first number the main of length_3d of all the hikes of the user
    user_array.append(df[df['user'] == user]['length_3d'].mean())
    # second number the main of max_elevation of all the hikes of the user
    user_array.append(df[df['user'] == user]['max_elevation'].mean())
    # third number the main of min_elevation of all the hikes of the user
    user_array.append(df[df['user'] == user]['min_elevation'].mean())
    # fourth number the main of moving_time of all the hikes of the user
    user_array.append(df[df['user'] == user]['moving_time'].mean())
    # fifth number the main of difficulty_num of all the hikes of the user
    user_array.append(df[df['user'] == user]['difficulty_num'].mean())

    return user_array

In [35]:
input_user = input('Enter your name: ')
input_user = 'siso'

users = df['user'].unique()

input_array = find_array(input_user)

distances = []

if input_user in users:
    for user in users:
        if user == input_user:
            continue
        else:
            user_array = find_array(user)
            distance_user = 1 - spatial.distance.cosine(user_array, input_array)
            obj = [distance_user, user]
            distances.append(obj)

distances.sort(key=lambda x: x[0])
distances.reverse()

best_match_user = distances[0][1]

The best match user is:  schmidi87


In [38]:
# find the best track from the best match user
def find_best_track(input_user, best_match_user, df):
    df_input_user = df[df['user'] == input_user]
    df_best_match_user = df[df['user'] == best_match_user]
    
    # filter the colums that I need, just length_3d, max_elevation, min_elevation, moving_time, difficulty_num
    df_input_user = df_input_user[['length_3d', 'max_elevation', 'min_elevation', 'moving_time', 'difficulty_num']]
    df_best_match_user = df_best_match_user[['length_3d', 'max_elevation', 'min_elevation', 'moving_time', 'difficulty_num']]

    # calculate the score for each track based on the distance of the corresponding columns of the input_array
    df_best_match_user['score'] = 0
    for index, row in df_best_match_user.iterrows():
        score = 0
        for i in range(len(input_array)):
            score += 1 - spatial.distance.cosine([input_array[i]], [row[i]])
        df_best_match_user.loc[index, 'score'] = score
    
    # return the index of the track with the highest score
    return df_best_match_user['score'].idxmax()

In [39]:
reccomended_path_index = find_best_track(input_user, best_match_user, df)
print('The best track for you is:')
df.iloc[reccomended_path_index]

The best track for you is:


_id                                        5afb255c8f80884aaad9ec6b
length_3d                                              18285.970975
user                                                      schmidi87
start_time                                      2015-05-30 08:36:00
max_elevation                                                   NaN
bounds            {'min': {'type': 'Point', 'coordinates': [7.61...
uphill                                                          0.0
moving_time                                                 20173.0
end_time                                        2015-05-30 16:04:46
max_speed                                                  1.615608
gpx               <?xml version="1.0" encoding="UTF-8"?>\n<gpx x...
difficulty                             T4+ - High-level Alpine hike
min_elevation                                                   NaN
url                         http://www.hikr.org/tour/post94872.html
downhill                                        