# Entity Linking for All Trails

In [1]:
import pandas as pd
from similarity.jarowinkler import JaroWinkler

In [2]:
# Load the trail table and collect each distinct park name exactly once.
df = pd.read_csv('alltrails_final.csv')
temp_df = pd.DataFrame({'national_park': df['national_park'].unique()})

In [3]:
# Give every park name an id: the categorical code of the name, prefixed
# with 'a_' to mark this (first) source.
park_codes = temp_df['national_park'].astype('category').cat.codes
temp_df = temp_df.assign(park_id='a_' + park_codes.astype(str))
temp_df.head(10)

Unnamed: 0,national_park,park_id
0,Zion National Park,a_11778
1,Twin Peaks Wilderness,a_10786
2,Arches National Park,a_279
3,Uinta-Wasatch-Cache National Forest,a_10812
4,Bryce Canyon National Park,a_1296
5,"Moab, Utah",a_6935
6,Mount Timpanogos Wilderness,a_7181
7,Mount Olympus Wilderness,a_7153
8,Grand Staircase - Escalante National Monument,a_4032
9,Red Butte Canyon Research Natural Area,a_8732


In [49]:
# Load the second source, drop the columns that are not needed here and
# rename "name" so both tables share the 'national_park' key.
df2 = (
    pd.read_csv('er_part1.csv')
    .drop(columns=['id', 'state'])
    .rename(columns={"name": "national_park"})
)
temp_df2 = pd.DataFrame({'national_park': df2['national_park'].unique()})

# Row count vs. distinct park names: equal numbers mean no repeated name.
print(df2.shape[0])
print(temp_df2.shape[0])

76
76


In [50]:
# Attach the park-level columns of df2 to every trail row of df.
# how='left' keeps every trail, including those whose park has no match.
# validate='many_to_one' makes pandas raise a MergeError if a park name is
# repeated in df2, which would otherwise silently duplicate trail rows.
merged = pd.merge(
    df,
    df2,
    on='national_park',
    how='left',
    validate='many_to_one',
)
print(len(merged))

33602


In [51]:
# Preview the first five merged rows (head() defaults to n=5).
merged.head()

Unnamed: 0,id,url,name,national_park,rating,difficulty,length,elevation_gain,route_type,no_shade,...,time,Favorite Trails,Scenic Drive,Best Time to Go,Where to Stay,Admission Fee,Must-Have Experience,hotels,inception,highest_peak
0,1,https://www.alltrails.com/trail/us/utah/the-zi...,The Zion Narrows Riverside Walk,Zion National Park,4.5,easy,1.9,193.0,Out & back,0,...,45.0,"Angel’s Landing, The Narrows, Canyon Overlook",Zion Canyon Scenic Drive (only accessible by s...,"Visit in winter for fewer crowds, but be prepa...","Zion Lodge, Cable Mountain, Best Western Plus","$35 per vehicle or $20 per individual, valid f...",Check out the Kolob Canyons section for fewer ...,"[('Zion Mountain Ranch', 229.0, 'https://www.n...",1919-11-19T00:00:00Z,Horse Ranch Mountain
1,2,https://www.alltrails.com/trail/us/utah/lake-b...,Lake Blanche Trail,Twin Peaks Wilderness,5.0,hard,6.9,2706.0,Out & back,0,...,4.37,,,,,,,,,
2,3,https://www.alltrails.com/trail/us/utah/devils...,Devils Garden Loop Trail with 7 Arches,Arches National Park,5.0,hard,7.8,1131.0,Loop,1,...,3.7,"Delicate Arch, Balanced Rock, The Windows, Sky...",Main Park Road,"Spring and fall have milder temperatures, but ...","Comfort Inn, Moab Springs Ranch, Airbnb","$30 per vehicle or $15 per individual, valid f...",Join a Park Ranger for a tour through the Fier...,"[('Moab Springs Ranch', 119.0, 'https://www.na...",1971-11-12T00:00:00Z,Elephant Butte
3,4,https://www.alltrails.com/trail/us/utah/delica...,Delicate Arch Trail,Arches National Park,5.0,moderate,3.4,626.0,Out & back,1,...,1.21,"Delicate Arch, Balanced Rock, The Windows, Sky...",Main Park Road,"Spring and fall have milder temperatures, but ...","Comfort Inn, Moab Springs Ranch, Airbnb","$30 per vehicle or $15 per individual, valid f...",Join a Park Ranger for a tour through the Fier...,"[('Moab Springs Ranch', 119.0, 'https://www.na...",1971-11-12T00:00:00Z,Elephant Butte
4,5,https://www.alltrails.com/trail/us/utah/bells-...,Bells Canyon Trail to Lower Falls,Uinta-Wasatch-Cache National Forest,4.5,moderate,4.6,1453.0,Out & back,0,...,2.5,,,,,,,,,


In [56]:
# Write the merged table to disk without the DataFrame index column.
merged.to_csv(path_or_buf='er_final_part.csv', index=False)

In [53]:
# Keep only the rows that have a 'Scenic Drive' value and report how many
# there are. Despite its name, merged_nan holds the rows WITHOUT a missing value.
has_scenic_drive = merged['Scenic Drive'].notna()
merged_nan = merged[has_scenic_drive]
print(merged_nan.shape[0])

1686


In [5]:
# Give every park name an id: the categorical code of the name, prefixed
# with 'b_' to mark this (second) source.
park_codes_b = temp_df2['national_park'].astype('category').cat.codes
temp_df2 = temp_df2.assign(park_id='b_' + park_codes_b.astype(str))
temp_df2.head(10)

Unnamed: 0,national_park,park_id
0,Yellowstone National Park,b_74
1,Sequoia National Park,b_63
2,Isle Royale National Park,b_36
3,Yosemite National Park,b_76
4,Zion National Park,b_77
5,Hawaii Volcanoes National Park,b_33
6,Denali National Park,b_18
7,Bryce Canyon National Park,b_8
8,Grand Teton National Park,b_27
9,Arches National Park,b_1


In [8]:
# Stack the two park-id tables vertically. Each input keeps its own index
# labels, so labels repeat across the two halves of both_df.
both_df = pd.concat(objs=[temp_df, temp_df2], axis=0)
both_df.head(10)

Unnamed: 0,national_park,park_id
0,Zion National Park,a_11778
1,Twin Peaks Wilderness,a_10786
2,Arches National Park,a_279
3,Uinta-Wasatch-Cache National Forest,a_10812
4,Bryce Canyon National Park,a_1296
5,"Moab, Utah",a_6935
6,Mount Timpanogos Wilderness,a_7181
7,Mount Olympus Wilderness,a_7153
8,Grand Staircase - Escalante National Monument,a_4032
9,Red Butte Canyon Research Natural Area,a_8732


## RDF Creation

In [68]:
# Reload the merged table with positional column labels: header=None plus
# skiprows=1 discards the header line, so columns are addressed as 0, 1, 2, ...
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the
# DtypeWarning this read otherwise emits.
joined_data = pd.read_csv(
    'er_final_part.csv',
    header=None,
    skiprows=1,
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [69]:
from rdflib import Graph, URIRef, Literal, XSD, Namespace, RDF
import csv
import json

In [70]:
# Vocabularies used when building the triples.
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
MYNS = Namespace('http://inf558.org/myfakenamespace#')
SCHEMA = Namespace('http://schema.org/')

# Fresh graph with a prefix registered for each vocabulary.
my_kg = Graph()
for prefix, vocabulary in (('myns', MYNS), ('foaf', FOAF), ('schema', SCHEMA)):
    my_kg.bind(prefix, vocabulary)

In [71]:
# Predicate local names (in the SCHEMA namespace) in column order: the i-th
# entry, counting from 1, is emitted from positional column i of joined_data.
# Column 0 is not emitted.
TRAIL_PREDICATES = (
    'url', 'name', 'park', 'rating', 'difficulty',
    'length', 'elevation_gain', 'route_type', 'no_shade', 'off_trail',
    'scramble', 'over_grown', 'snow', 'bugs', 'rocky',
    'fee', 'backpacking', 'bike_touring', 'bird_watching', 'camping',
    'cross_country_skiing', 'fishing', 'hiking', 'horseback_riding', 'mountain_biking',
    'nature_trips', 'ohv_offroad_driving', 'paddle_sports', 'road_biking', 'rock_climbing',
    'scenic_driving', 'skiing', 'snowshoeing', 'running', 'via_ferrata',
    'walking', 'beach', 'cave', 'city_walk', 'event',
    'forest', 'historic_site', 'hot_springs', 'lake', 'pub_walk',
    'rails_trails', 'river', 'views', 'waterfall', 'wildflowers',
    'wildlife', 'dog_friendly', 'kid_friendly', 'paved', 'partially_paved',
    'wheelchair_friendly', 'stroller_friendly', 'light', 'moderate', 'heavy',
    'state', 'time',
)

# Columns whose cell is coerced before it becomes a literal.
COLUMN_CASTS = {
    # Ratings are fractional (e.g. 4.5); the previous int() cast truncated
    # them to whole numbers.
    4: float,
    # over_grown keeps its int() coercion.
    # NOTE(review): presumably a 0/1 flag - confirm.
    12: int,
}

for _, value in joined_data.iterrows():
    # The trail URL (column 1) is the node identifier; a row without one
    # cannot be turned into a node, so it is skipped.
    if pd.isna(value[1]):
        continue

    node_uri = URIRef(value[1])
    # NOTE(review): 'ToursitAttraction' looks like a misspelling of
    # 'TouristAttraction'. It is kept byte-for-byte because renaming the class
    # IRI would break anything that already queries it - confirm and rename.
    my_kg.add((node_uri, RDF.type, MYNS['ToursitAttraction']))

    for column, predicate in enumerate(TRAIL_PREDICATES, start=1):
        cell = value[column]
        # Skip missing cells instead of emitting NaN literals.
        if pd.isna(cell):
            continue
        cast = COLUMN_CASTS.get(column)
        literal = Literal(cast(cell) if cast else cell)
        my_kg.add((node_uri, SCHEMA[predicate], literal))

my_kg.serialize('trial_triples.ttl', format="turtle")