## GOAL

To create a similarity metric based on trail features

Input dataset:
Hiking trails from Ontario scraped from alltrails.com

#### Data cleaning:
Trails removed in the following cases:
1. when there is no user rating, reviews
2. Removed unwanted tags
3. Private and closed trails (info from trail name)
Other:
data['name'].str.lower()

data=data[data['review']!='']

data=data[data['trail_attributes']!='']

##### Cosine similarity metric based on numerical, categorical and text data
##### Extract keywords from user reviews for each trail (check for details in the appropriate section)



In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pickle
import os
import string
import re

# Similarity measures
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, linear_kernel
from scipy.spatial.distance import cdist

# NLP
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, RobustScaler, OneHotEncoder ,LabelBinarizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.summarization import keywords

In [2]:
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

### READ IN PICKLE DATA FILE

In [3]:
#Read in pickle data file
data=pd.read_pickle('../data/alltrails_ontario_curated_0215.pkl')

###  Data curation

In [4]:
# Remove private and closed trails
data = data[~data.trailName.str.contains('closed|private')]

#Remove unused columns
data.drop(['review_keywords', 'review_keywords1','urlname', 'log_elevation', 'log_distance'], axis=1)
data=data[data['distance']>1]
data=data[data['distance']<50]
data=data[data['elevation']>1]
data=data[data['elevation']<1000]
data=data[data['stars']>1]

# Reset trail index
data=data.reset_index(drop=True)
data.shape
data1 = data.copy()

In [5]:
data.head(5)

Unnamed: 0,difficulty,distance,elevation,name,nreviews,review,route_type,stars,trail_attributes,trailName,tagstr,urlname,tags_str,log_elevation,log_distance,review_keywords,review_keywords1,lname
0,MODERATE,5.6,32.0,Taylor Creek Trail,23,"Great for strollers, bikes etc. Opposite side ...",Out & Back,3.7,"[dogs on leash, wheelchair friendly, kid frien...",taylor creek trail,dogsonleash wheelchairfriendly kidfriendly str...,taylor-creek-trail,"dogs on leash, wheelchair friendly, kid friend...",3.465736,1.722767,"[bike, nice, dog]",[bike nice dog],taylor creek trail
1,MODERATE,4.7,64.0,Hilton Falls Trail,238,What a gem! I was so pleasantly surprised by ...,Loop,4.3,"[dogs on leash, cross country skiing, fishing,...",hilton falls trail,dogsonleash crosscountryskiing fishing hiking ...,hilton-falls-trail,"dogs on leash, cross country skiing, fishing, ...",4.158883,1.547563,"[nice, family, fall, waterfalls, entrances, wo...",[nice family fall waterfalls entrances wooded ...,hilton falls trail
2,MODERATE,2.3,100.0,Niagara Glen Trail,135,Beautiful area with several trails. Loved exp...,Loop,4.7,"[dogs on leash, kid friendly, hiking, nature t...",niagara glen trail,dogsonleash kidfriendly hiking naturetrips wal...,niagara-glen-trail,"dogs on leash, kid friendly, hiking, nature tr...",4.60517,0.832909,"[views, stairs, nice, fall, niagara, rock, sce...",[views stairs nice fall niagara rock scenery f...,niagara glen trail
3,MODERATE,7.2,128.0,Nassagaweya and Bruce Trail Loop from Rattlesn...,170,Great views! We went in January so there weren...,Loop,4.2,"[dogs on leash, kid friendly, hiking, nature t...",nassagaweya and bruce trail loop from rattlesn...,dogsonleash kidfriendly hiking naturetrips sno...,nassagaweya-and-bruce-trail-loop-from-rattlesn...,"dogs on leash, kid friendly, hiking, nature tr...",4.85203,1.974081,"[beautiful, view, nice, day, bruce, moderately...",[beautiful view nice day bruce moderately leve...,nassagaweya and bruce trail loop from rattlesn...
4,MODERATE,15.3,427.0,Lion's Head Loop Via Bruce Trail,117,Amazing trail with stunning lookouts. Hiked it...,Loop,4.8,"[dogs on leash, hiking, nature trips, walking,...",lion's head loop via bruce trail,dogsonleash hiking naturetrips walking birdwat...,lion's-head-loop-via-bruce-trail-,"dogs on leash, hiking, nature trips, walking, ...",6.056784,2.727853,"[view, long, day, beautiful, rocks, water, you...",[view long day beautiful rocks water youre loo...,lion's head loop via bruce trail


In [6]:
# data[data.trail_attributes.map(lambda x: len(x)==0)]
# data.head(2)
# data['tagstr']=data.trail_attributes[]
data.trail_attributes.iloc[754]

['cross country skiing', 'nature trips', 'snowshoeing', 'views', 'snow']

####  Curate tags to retain subset of tags that are most useful for the user

In [7]:
tags = data.trail_attributes

In [8]:
# Create a curated list of tags - remove unwanted tags
wanted_tags = ['dogs on leash', 'wheelchair friendly', 'kid friendly',
 'hiking', 'mountain biking', 'trail running', 'forest', 
 'fishing', 'horseback riding', 'bird watching', 'lake', 'river', 'waterfall',
 'wild flowers', 'wildlife', 'rocky', 'beach',
 'dog friendly', 'scramble', 'camping', 'rock climbing', 'cave', 
 'paddle sports', 'backpacking']

curated_tags = tags.apply(lambda tag: [el for el in tag if el in wanted_tags])

In [9]:
# curated_tags
data['tagstr'] = curated_tags.apply(lambda tag: [el.replace(' ', '') for el in tag])
# Remove rows with no trail attributes after curation above
data=data[data.tagstr.map(lambda x: len(x)>0)]
# Sort tags
data['tagstr']=data['tagstr'].apply(sorted)

In [10]:
# Create sentences from trail feature lists
data['tags_sent']=[' '.join(tag) for tag in data['tagstr'].values]
# data.head()

####  Transform text data from trail attributes

In [11]:
# Create cosine similarity metric based on tag strings
tcv=CountVectorizer()
tcv_tags_mat=tcv.fit_transform(data['tags_sent'])

## Save the vectorizer vocabulary using pickle
# pickle.dump(tcv.vocabulary_,open("tags_vocab_cv.pkl","wb"))
print (tcv.vocabulary_)

# Create cosine similarity matrix
cosine_sim_tagstrcv = cosine_similarity(tcv_tags_mat,tcv_tags_mat)
# tcv_tags_mat

{'dogsonleash': 6, 'forest': 8, 'hiking': 9, 'kidfriendly': 11, 'mountainbiking': 13, 'trailrunning': 19, 'wheelchairfriendly': 21, 'birdwatching': 2, 'fishing': 7, 'horsebackriding': 10, 'lake': 12, 'river': 15, 'rocky': 17, 'waterfall': 20, 'wildflowers': 22, 'wildlife': 23, 'beach': 1, 'dogfriendly': 5, 'scramble': 18, 'camping': 3, 'cave': 4, 'rockclimbing': 16, 'paddlesports': 14, 'backpacking': 0}


### Reverse mapping of indices with corresponding trail names

In [12]:
# Reverse map ofindices with corresponding trail names
trail_indices=pd.Series(data.index, index=data['name']).drop_duplicates()

###  Transform trail difficulty data

In [13]:
# Binarize labels using LabelBinarizer
data['difficulty'].astype('category')
# data
lb=LabelBinarizer()
lb_diffic_mat=lb.fit_transform(data['difficulty'])

# Create cosine similarity matrix
cosine_sim_diffic = cosine_similarity(lb_diffic_mat,lb_diffic_mat)
lb_diffic_mat.shape

(824, 3)

###  Cosine similarity metric for numerical data

In [14]:
# Standardscaling for numerical data
scaler=StandardScaler()
nd=data[['distance', 'elevation', 'stars']]
scaled_nd=scaler.fit_transform(nd)

cosine_sim_num=cosine_similarity(scaled_nd, scaled_nd)

###  Create combined cosine similarity based on all features

In [15]:
# Combined features where stars, distance and elevation are used for the numerical data
combined_features_sim3 = np.concatenate([tcv_tags_mat.toarray(), lb_diffic_mat, scaled_nd], axis=1)
combined_features_sim3

#Calculate similarity matrix for combined features
cosine_sim_allfeat3=cosine_similarity(combined_features_sim3, combined_features_sim3)

In [16]:
df1=pd.DataFrame([cosine_sim_allfeat3[2], data['distance'], data['elevation'], data['stars'], data['tagstr'], data['difficulty']]).T
df1.columns=['cs', 'dist','elev','ratng', 'feat','diff']
df2=df1.sort_values(by=['cs'], ascending=False)
# df1.dtypes
# df2.iloc[:10]

In [17]:
# Combined features where stars, distance and elevation are used for the numerical data
combined_features_csim3 = np.concatenate([cosine_sim_diffic, cosine_sim_tagstrcv, cosine_sim_num], axis=1)
combined_features_csim3

#Calculate similarity matrix for combined features
ccosine_sim_allfeat3=cosine_similarity(combined_features_csim3, combined_features_csim3)

In [18]:
df1=pd.DataFrame([ccosine_sim_allfeat3[2], data['distance'], data['elevation'], data['stars'], data['tagstr'], data['difficulty']]).T
df1.columns=['cs', 'dist','elev','ratng', 'feat','diff']
df2=df1.sort_values(by=['cs'], ascending=False)
df1.dtypes
df2.iloc[:10]

Unnamed: 0,cs,dist,elev,ratng,feat,diff
2,1.0,2.3,100,4.7,"[dogsonleash, forest, hiking, kidfriendly, river]",MODERATE
90,0.991821,4.5,100,4.5,"[birdwatching, dogsonleash, forest, hiking, ki...",MODERATE
232,0.987458,3.5,101,4.5,"[birdwatching, dogsonleash, forest, hiking, ki...",MODERATE
12,0.987351,2.3,93,4.8,"[birdwatching, cave, dogsonleash, forest, hiki...",MODERATE
740,0.986003,3.5,71,5.0,"[birdwatching, dogsonleash, forest, hiking, ki...",MODERATE
795,0.98425,1.8,15,5.0,"[dogsonleash, hiking, kidfriendly, mountainbik...",MODERATE
17,0.983774,4.8,93,4.5,"[birdwatching, dogsonleash, forest, hiking, ro...",MODERATE
531,0.983115,2.3,93,5.0,"[birdwatching, dogsonleash, forest, hiking]",MODERATE
59,0.979372,5.0,48,4.8,"[birdwatching, dogsonleash, forest, hiking, ki...",MODERATE
22,0.979336,1.9,71,4.6,"[dogsonleash, forest, hiking, trailrunning, wi...",MODERATE


###  Write cosine similarity based on all features into a pickle file

In [19]:
# Save cosine similarity matrix to be called in the web app
# np.savetxt('../data/cosine_sim_allfeat3.dat', cosine_sim_allfeat3)

###  Testing review text vectorization

In [20]:
def preprocess(text):
    words=[word for sentence in sent_tokenize(text) for word in word_tokenize(sentence)]
    words=[word for word in words if len(word)>3]
    words=[word for word in words if word not in gensim.parsing.preprocessing.STOPWORDS]
    words=[WordNetLemmatizer().lemmatize(word) for word in words]
    return words

In [21]:
reviews=data.review
#tokenize text
# reviews.apply(lambda x: x.lower())
reviews=reviews.str.lower()
reviews=reviews.str.replace('\"', '')
reviews=reviews.str.replace("\'", '')
reviews=reviews.str.replace('\"', '')
reviews=reviews.str.replace('\!', '')
reviews=reviews.str.replace('\/', ' ')
reviews=reviews.str.replace(',', '')
reviews=reviews.str.replace('(', '')
reviews=reviews.str.replace(')', '')
reviews=reviews.str.replace('.', '')
reviews=reviews.str.replace('\d+', '')
# Remove additional frequent words
reviews=reviews.replace(to_replace={'hik', 'walk','run','trail', 'interesting',\
                                    'good','great','lot','recommend','area','park',\
                                    'love','like','way','easy', 'try', 'path', 'little', \
                                    'complete', 'definite', 'clear', 'close' \
                                    'fortunately', 'highly', 'place', 'start' \
                                    'hour', 'closed', 'condition', 'ers', 'overall', 'anything' \
                                    'local', 'nice', 'gett', 'local', 'line', 'anything', 'think' \
                                    'link', 'life', 'link', 'hour', 'day', 'better', 'you', 'surprising',
                                    'afternoon', 'night'
                                   }, value='', regex=True)

# Extract words longer than 3 characters
rv=[word for word in reviews] # if len(word)>3]

# Pre-process text using function above
processed_text=[preprocess(text) for text in rv]

In [22]:
# Create sentences of the cleaned version of the trail reviews
processed_sent=[]
for wordlist in processed_text:
    sent=' '.join(wordlist)
#     print (len(wordlist))
    if len(wordlist) <10:
        processed_sent.append('No reviews')
    else:
        stemmed_sent = PorterStemmer().stem(sent)
        lemmatized_sent = WordNetLemmatizer().lemmatize(stemmed_sent)
        processed_sent.append(''.join(lemmatized_sent))

In [23]:
# Extract keywords using the processed reviews
review_keywords=[]
for i,n in enumerate(processed_sent):
    if (n.lower() != 'no reviews' and len(n.split()) >10):
        keywrds = (keywords(n, ratio=0.10, lemmatize=True)) #.replace('\n', ' ')
        tx = [word for word in keywrds.split()]
        review_keywords.append(tx)    
    else:
        review_keywords.append(['No reviews'])

In [24]:
data['Keywords']=pd.Series((v for v in review_keywords))
data['KeyFeatures']=pd.Series(([' '.join(v)] for v in review_keywords))

In [37]:
## Write dataframe as a pickle file
# data=data.drop(['log_elevation', 'log_distance', 'urlname', 'review_keywords', 'review_keywords1', 'urlname', 'nreviews', 'review'], axis=1)
# data.to_pickle('../data/alltrails_ontario_curated_0219.pkl')

In [36]:
data.head(3)

Unnamed: 0,difficulty,distance,elevation,name,route_type,stars,trail_attributes,trailName,tagstr,tags_str,lname,tags_sent,Keywords,KeyFeatures
0,MODERATE,5.6,32.0,Taylor Creek Trail,Out & Back,3.7,"[dogs on leash, wheelchair friendly, kid frien...",taylor creek trail,"[dogsonleash, forest, hiking, kidfriendly, mou...","dogs on leash, wheelchair friendly, kid friend...",taylor creek trail,dogsonleash forest hiking kidfriendly mountain...,"[bike, river, dog, mountain, multi]",[bike river dog mountain multi]
1,MODERATE,4.7,64.0,Hilton Falls Trail,Loop,4.3,"[dogs on leash, cross country skiing, fishing,...",hilton falls trail,"[birdwatching, dogsonleash, fishing, forest, h...","dogs on leash, cross country skiing, fishing, ...",hilton falls trail,birdwatching dogsonleash fishing forest hiking...,"[fall, waterfall, family, beautiful, entrance,...",[fall waterfall family beautiful entrance bruc...
2,MODERATE,2.3,100.0,Niagara Glen Trail,Loop,4.7,"[dogs on leash, kid friendly, hiking, nature t...",niagara glen trail,"[dogsonleash, forest, hiking, kidfriendly, river]","dogs on leash, kid friendly, hiking, nature tr...",niagara glen trail,dogsonleash forest hiking kidfriendly river,"[rock, water, amazing, view, river, niagara, f...",[rock water amazing view river niagara fall ti...
