## Exploratory Data Analysis (EDA)

Goal - Create a similarity metric based on just the numerical data (elevation gain, distance, star rating)

In [148]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

### READ IN PICKLE DATA FILE

In [149]:
#Read in pickle data file
data=pd.read_pickle('../data/alltrails_ontario.pkl')
data1=pd.read_pickle('../data/alltrails_ontario.pkl')
# data1=

### CURATION
1. Change trail attributes to string for TF-IDF
2. Remove nan from elevation data

In [150]:
#Replace nan with 0. - I had to do this since elevation had a few rows with nan
data.elevation[np.isnan(data.elevation)]=0.
data_copy=data.copy()
# data.distance.astype('float')
# data.dtypes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [151]:
#Curate trail attributes to create single words for each array element
trail_attributes=data.trail_attributes
hattrib=[]
for l1 in trail_attributes:
    hattrib.append([l2.replace(' ', '') for l2 in l1])
print (hattrib[0])

# Add the curated attibutes as a new column ('tags') in data
data['tags']=pd.Series(hattrib)
#convert list of strings to text - this is necessary for running TFIDF
data['tags']=[' '.join(tag) for tag in data.tags.values]
# data.elevation[628]
data.tags[0]
# data[data['elevation']==0]
data.head(5)

['dogsonleash', 'wheelchairfriendly', 'kidfriendly', 'strollerfriendly', 'hiking', 'mountainbiking', 'naturetrips', 'snowshoeing', 'trailrunning', 'walking', 'forest', 'paved', 'views', 'snow']


Unnamed: 0,difficulty,distance,elevation,name,nreviews,review,route_type,stars,trail_attributes,tags
0,MODERATE,5.6,32.0,Taylor Creek Trail,23,"Great for strollers, bikes etc. Opposite side ...",Out & Back,3.7,"[dogs on leash, wheelchair friendly, kid frien...",dogsonleash wheelchairfriendly kidfriendly str...
1,MODERATE,4.7,64.0,Hilton Falls Trail,238,What a gem! I was so pleasantly surprised by ...,Loop,4.3,"[dogs on leash, cross country skiing, fishing,...",dogsonleash crosscountryskiing fishing hiking ...
2,MODERATE,2.3,100.0,Niagara Glen Trail,135,Beautiful area with several trails. Loved exp...,Loop,4.7,"[dogs on leash, kid friendly, hiking, nature t...",dogsonleash kidfriendly hiking naturetrips wal...
3,MODERATE,7.2,128.0,Nassagaweya and Bruce Trail Loop from Rattlesn...,170,Great views! We went in January so there weren...,Loop,4.2,"[dogs on leash, kid friendly, hiking, nature t...",dogsonleash kidfriendly hiking naturetrips sno...
4,MODERATE,15.3,427.0,Lion's Head Loop Via Bruce Trail,117,Amazing trail with stunning lookouts. Hiked it...,Loop,4.8,"[dogs on leash, hiking, nature trips, walking,...",dogsonleash hiking naturetrips walking birdwat...


In [152]:
#Convert difficulty rating to a 2D array (Easy-[1,0], moderate-[1,1], hard-[0,1])
difficulty=data.difficulty
arr=[]
for i in difficulty:
    if i=='EASY':
        arr.append([1,0])
    elif i=='MODERATE':
        arr.append([1,1])
    elif i=='HARD':
        arr.append([0,1])
# arr
data['num_difficulty']=arr
data.head(5)

Unnamed: 0,difficulty,distance,elevation,name,nreviews,review,route_type,stars,trail_attributes,tags,num_difficulty
0,MODERATE,5.6,32.0,Taylor Creek Trail,23,"Great for strollers, bikes etc. Opposite side ...",Out & Back,3.7,"[dogs on leash, wheelchair friendly, kid frien...",dogsonleash wheelchairfriendly kidfriendly str...,"[1, 1]"
1,MODERATE,4.7,64.0,Hilton Falls Trail,238,What a gem! I was so pleasantly surprised by ...,Loop,4.3,"[dogs on leash, cross country skiing, fishing,...",dogsonleash crosscountryskiing fishing hiking ...,"[1, 1]"
2,MODERATE,2.3,100.0,Niagara Glen Trail,135,Beautiful area with several trails. Loved exp...,Loop,4.7,"[dogs on leash, kid friendly, hiking, nature t...",dogsonleash kidfriendly hiking naturetrips wal...,"[1, 1]"
3,MODERATE,7.2,128.0,Nassagaweya and Bruce Trail Loop from Rattlesn...,170,Great views! We went in January so there weren...,Loop,4.2,"[dogs on leash, kid friendly, hiking, nature t...",dogsonleash kidfriendly hiking naturetrips sno...,"[1, 1]"
4,MODERATE,15.3,427.0,Lion's Head Loop Via Bruce Trail,117,Amazing trail with stunning lookouts. Hiked it...,Loop,4.8,"[dogs on leash, hiking, nature trips, walking,...",dogsonleash hiking naturetrips walking birdwat...,"[1, 1]"


In [153]:
data.to_pickle('alltrails_ontario_curated.pkl')

### Check numerical data


In [154]:
# Create scaling for numerical data
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder ,LabelBinarizer
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

### Euclidean distance metric for numerical data

In [155]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.spatial.distance import cdist

#Create Euclidean distance based scaling based for numerical data
numerical_data=data[['elevation','distance','stars']]
# scaler=StandardScaler().fit_transform(numerical_data)

#Test input from user
tst = [[100,10,4]]
distance=euclidean_distances(numerical_data,tst)
dft_sort=pd.DataFrame.from_records(distance).sort_values(by=0)
top10=dft_sort[:10].index.values
print (top10[0])
df_out=data.iloc[dft_sort[:10].index.values][['name', 'distance', 'elevation', 'stars', 'trail_attributes']]
df_out=data.iloc[top10][['name', 'distance', 'elevation', 'stars', 'trail_attributes']]
df_out
# dft_sort
# print (top10)
# data.iloc[dft_sort[:10].index.values, ][['name']]
# data[data['elevation']>1000]

703


Unnamed: 0,name,distance,elevation,stars,trail_attributes
703,"Bruce, Woodland, and Escarpment Trails Loop fr...",8.5,99.0,4.5,"[dogs on leash, kid friendly, hiking, nature t..."
1203,Cedar Lake South Loop Trail,9.0,102.0,4.0,"[camping, hiking, nature trips, trail running,..."
94,Mer Bleue Conservation Area Loop Trail,7.1,100.0,3.8,"[dogs on leash, cross country skiing, hiking, ..."
95,Moore Park Ravine to David Balfour Park Loop,7.6,102.0,4.5,"[dogs on leash, kid friendly, hiking, nature t..."
218,Foley Mountain Loop Trail,6.6,100.0,4.0,"[dogs on leash, kid friendly, hiking, nature t..."
631,Seguin Recreational Trail,7.9,103.0,4.3,"[dog friendly, kid friendly, hiking, mountain ..."
261,Happy Valley Via Oak Ridges Trail,6.4,99.0,4.2,"[dogs on leash, kid friendly, hiking, walking]"
280,Speyside Trail,13.7,101.0,3.9,"[dog friendly, hiking, nature trips, trail run..."
60,Newtonbrook Creek Pathway,9.8,96.0,4.2,"[dogs on leash, wheelchair friendly, stroller ..."
223,Hanlon Creek Trail,6.0,98.0,4.1,"[dog friendly, cross country skiing, hiking, n..."


In [156]:
scaler=MinMaxScaler()
numerical_data=data[['elevation','distance','stars']]
# scaled_num[['elevation','distance','stars']]
scaled_num=scaler.fit_transform(numerical_data[['elevation','distance','stars']])
# numerical_data
scaled_num

array([[0.00701908, 0.02830189, 0.74      ],
       [0.01403817, 0.02358491, 0.86      ],
       [0.02193463, 0.01100629, 0.94      ],
       ...,
       [0.01162536, 0.01415094, 0.66      ],
       [0.00263216, 0.00890985, 0.2       ],
       [0.01030928, 0.01100629, 0.8       ]])

In [157]:
trail_names=data['name'].str.lower()
# trail_names
names=[name for name in trail_names]
# names

In [158]:
#Numerical data normalization
numerical_data=data[['elevation','distance','stars']]
elev=data['elevation']
scaled_elev=MinMaxScaler(elev)
print (max(elev))

scaler=RobustScaler()
scaled_mnd=scaler.fit_transform(numerical_data)
scaled_mnd=pd.DataFrame(scaled_mnd, columns=['elevation','distance','stars'])

#Cosine similarity for numerical data
# cosine_sim_num=cosine_similarity(scaled_mnd,scaled_mnd)
cosine_sim_num=cosine_similarity(numerical_data,numerical_data)
cosine_sim_num[10][200]
scaled_elev
# scaled_mnd

4559.0


MinMaxScaler(copy=True,
       feature_range=0        32.0
1        64.0
2       100.0
3       128.0
4       427.0
5       222.0
6        71.0
7       197.0
8       259.0
9       301.0
10      460.0
11       81.0
12       93.0
13      370.0
14      156.0
15      174.0
16      106.0
17       93.0
18      408.0
19       57.0
20    ...211     33.0
1212     53.0
1213     12.0
1214     47.0
Name: elevation, Length: 1215, dtype: float64)

###  Test  similarity based on numerical data

In [159]:
trail_ui='Manitou Mountaion Trail to Calabogie Lake Loop'

#Extract index for user input trail name
# idx=data.index[data['name']=='Point Pelee Tip and East Point Beach'].tolist()
idx=data.index[data['name']==trail_ui].tolist()
# print (idx[0])
print (idx)

#Compute pairwise similarity score
# score=list(enumerate(cosine_sim_num[idx[0]]))
# print (score[313])
# sorted_score=sorted(score,key=lambda x: x[1],reverse=True)
# sorted_score[1:40]
# print (data.iloc[1])
# print (data.iloc[292])
data.iloc[0]

[92]


difficulty                                                   MODERATE
distance                                                          5.6
elevation                                                          32
name                                               Taylor Creek Trail
nreviews                                                           23
review              Great for strollers, bikes etc. Opposite side ...
route_type                                                 Out & Back
stars                                                             3.7
trail_attributes    [dogs on leash, wheelchair friendly, kid frien...
tags                dogsonleash wheelchairfriendly kidfriendly str...
num_difficulty                                                 [1, 1]
Name: 0, dtype: object

In [167]:
trail_ui='Manitou Mountaion Trail to Calabogie Lake Loop'

#Extract index for user input trail name
# idx=data.index[data['name']=='Point Pelee Tip and East Point Beach'].tolist()
idx=data.index[data['name']==trail_ui].tolist()
# print (idx[0])
print (idx[0])

#Compute pairwise similarity score
score=list(enumerate(cosine_sim_num[idx[0]]))
# print (score[313])
sorted_score=sorted(score,key=lambda x: x[1],reverse=True)
# sorted_score[1:40]
score
cosine_sim_num
# print (data.iloc[1])
# print (data.iloc[292])
# data.iloc[0]

92


array([[1.        , 0.99397247, 0.98661128, ..., 0.99171952, 0.99938055,
        0.99194676],
       [0.99397247, 1.        , 0.99853987, ..., 0.99981534, 0.99640106,
        0.99954377],
       [0.98661128, 0.99853987, 1.        , ..., 0.99938559, 0.99045259,
        0.99895001],
       ...,
       [0.99171952, 0.99981534, 0.99938559, ..., 1.        , 0.99459211,
        0.99972562],
       [0.99938055, 0.99640106, 0.99045259, ..., 0.99459211, 1.        ,
        0.99419638],
       [0.99194676, 0.99954377, 0.99895001, ..., 0.99972562, 0.99419638,
        1.        ]])

In [161]:
#Not sure I cando this when it's a user input
tst=[1,0,0]

train=[[1,0,0], [0,0,1], [0,1,0],[1,0,0],[0,0,1]]

difficulty=data[['difficulty']]
difficulty=difficulty.astype('category')

lbt=LabelBinarizer()
diffic_cat=lbt.fit_transform(difficulty)
# scaler_diffic=StandardScaler().fit_transform(diffic_cat)
# train
scaler_diffic
cosine_sim_diffic=cosine_similarity(scaler_diffic,scaler_diffic)
cosine_sim_diffic
# scaler_diffic
# diffic_cat
cos_simd=cosine_similarity(diffic_cat, diffic_cat)
cos_simd[0][:15]
# difficulty
# scaler_diffic

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1.])

In [162]:
# Text processing
indx=0
data1.iloc[indx]['elevation']
# data1
# print (idx)

32.0

In [163]:
df=pd.read_pickle('../data/alltrails_ontario.pkl')
trail_data=pd.read_pickle('../data/alltrails_ontario.pkl')
trail_data.elevation[np.isnan(trail_data.elevation)]=0.
df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,difficulty,distance,elevation,name,nreviews,review,route_type,stars,trail_attributes
0,MODERATE,5.6,32.0,Taylor Creek Trail,23,"Great for strollers, bikes etc. Opposite side ...",Out & Back,3.7,"[dogs on leash, wheelchair friendly, kid frien..."
1,MODERATE,4.7,64.0,Hilton Falls Trail,238,What a gem! I was so pleasantly surprised by ...,Loop,4.3,"[dogs on leash, cross country skiing, fishing,..."
2,MODERATE,2.3,100.0,Niagara Glen Trail,135,Beautiful area with several trails. Loved exp...,Loop,4.7,"[dogs on leash, kid friendly, hiking, nature t..."
3,MODERATE,7.2,128.0,Nassagaweya and Bruce Trail Loop from Rattlesn...,170,Great views! We went in January so there weren...,Loop,4.2,"[dogs on leash, kid friendly, hiking, nature t..."
4,MODERATE,15.3,427.0,Lion's Head Loop Via Bruce Trail,117,Amazing trail with stunning lookouts. Hiked it...,Loop,4.8,"[dogs on leash, hiking, nature trips, walking,..."


In [164]:
# vectorize and scale difficulty (categorical)
difficulty = trail_data[['difficulty']]
difficulty = difficulty.apply(lambda x: x.str.replace(' ','')).astype('category')    
enc = LabelBinarizer().fit(difficulty)
df_cat = enc.transform(difficulty)
scaler_cat = StandardScaler().fit(df_cat)
df_cat_scaled = scaler_cat.transform(df_cat)

# scale numerical data (continuous)
trail_data_numerical = trail_data[['elevation','distance',
                                   'stars']]
scaler_num = StandardScaler().fit(trail_data_numerical)
df_num_scaled = scaler_num.transform(trail_data_numerical)

# vectorize text data
trail_attributes=data.trail_attributes
hattrib=[]
for l1 in trail_attributes:
    hattrib.append([l2.replace(' ', '') for l2 in l1])
print (hattrib[0])

# Add the curated attibutes as a new column ('tags') in data
trail_data['tags']=pd.Series(hattrib)
trail_data['tags']=[' '.join(tag) for tag in trail_data.tags.values]
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(trail_data['tags']).toarray()
tfidf_matrix[0]
# concatenate and get similarity
all_features = np.concatenate([tfidf_matrix,df_cat_scaled,df_num_scaled],axis=1)
cosine_sim = cosine_similarity(all_features, all_features)
cosine_sim.shape
x=sorted(cosine_sim[0], reverse=True)
print (x)
# x=np.sort(cosine_sim[0])
# print (x)

['dogsonleash', 'wheelchairfriendly', 'kidfriendly', 'strollerfriendly', 'hiking', 'mountainbiking', 'naturetrips', 'snowshoeing', 'trailrunning', 'walking', 'forest', 'paved', 'views', 'snow']
[0.9999999999999999, 0.9999999999999999, 0.9430374843304704, 0.9268698653726071, 0.9073848981338718, 0.906103198540165, 0.9059658070189537, 0.9035034205188557, 0.8949284631218942, 0.8887099584520426, 0.8884473553092931, 0.8877377564675576, 0.8809374804669632, 0.8767519382789146, 0.8719550765293337, 0.8667844871364166, 0.866328467565922, 0.8662869636585937, 0.861912054669539, 0.860914578777538, 0.8569948278822563, 0.8557445705008204, 0.8556892744051189, 0.8508847690818292, 0.8492018774873149, 0.8476235923364059, 0.8457995979952065, 0.8456157797565471, 0.8444174664264584, 0.8441776328493572, 0.8437738650678872, 0.8431827471832131, 0.8422222106370283, 0.8414721107403715, 0.8408870120567015, 0.840668551027841, 0.8404556646150045, 0.8402353422946724, 0.83957807760536, 0.8390417773736983, 0.8390384624

