## 01. Import Libraries

In [10]:
# import libraries

import pandas as pd
import time
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging
import requests
import bs4
from bs4 import BeautifulSoup
import requests


## 02. Set Up ChromeDriver

In [27]:
# Build the Chrome launch options.

chrome_options = Options()
# --headless   : ensure the GUI is off (no visible browser window)
# --no-sandbox : run Chrome with its sandbox disabled
for chrome_flag in ("--headless", "--no-sandbox"):
    chrome_options.add_argument(chrome_flag)

In [29]:
# set up driver:
# ChromeDriverManager().install() returns the path of a chromedriver binary,
# which is wrapped in a Service for Selenium to launch.

service = Service(ChromeDriverManager().install())
# Pass the options built in the previous cell. Without `options=` the
# --headless / --no-sandbox flags are never applied to the browser session.
driver = webdriver.Chrome(service=service, options=chrome_options)

## 03. Scrape Article for Top NYC Landmarks / Tourist Attractions

In [32]:
# project URL: Places to Visit in New York City

# (https://www.thrillophilia.com/destinations/new-york-city/places-to-visit)

In [34]:
# we only need a simple list of names from this page, so selenium alone will suffice
# load the article in the browser session:

list_url = "https://www.thrillophilia.com/destinations/new-york-city/places-to-visit"
driver.get(url=list_url)

In [38]:
# elements of interest will be the attraction names themselves.
# after inspecting the page, the div class to target is "left-side":

#div class = "left-side"

#h3 class = "h3 title"

In [49]:
# collect every element on the page whose class is 'left-side'
# (find_elements returns a list of matches)

attractions_elem = driver.find_elements(By.CLASS_NAME, 'left-side')

In [57]:
# checking to see if it worked:
# the text of the first matched element should be the rank and name of the first attraction

attractions_elem[0].text

'01\nStatue Of Liberty'

In [66]:
# spot-check the second matched element as well
attractions_elem[1].text

'02\nTimes Square'

In [68]:
# so it appears this actually worked!
# index 24 is the 25th matched element, i.e. the last attraction in the article's list

attractions_elem[24].text

'25\nNew York Harbor'

In [72]:
# as our list is only 25 attractions long, we should expect that there will be no item in the 25 index!
# if we receive an IndexError, we know that we've only scraped the landmarks/attractions!
# the error is caught and printed (rather than left to propagate) so this
# demonstration does not halt a Restart Kernel -> Run All.

try:
    attractions_elem[25].text
except IndexError as err:
    print(f"{type(err).__name__}: {err}")
else:
    # reaching this branch means more than 25 elements matched the class name
    print(f"unexpected extra element(s): {len(attractions_elem)} matched")

IndexError: list index out of range

In [88]:
# now that we know that worked, pull the text out of every matched element
# so we have a plain list of strings to work with:

attractions_list = []
for element in attractions_elem:
    attractions_list.append(element.text)

attractions_list

['01\nStatue Of Liberty',
 '02\nTimes Square',
 '03\nCentral Park',
 '04\nMetropolitan Museum Of Art',
 '05\nBroadway And The Theater District',
 '06\nThe Museum Of Modern Art',
 '07\nWorld Trade Center',
 '08\nRockefeller Center',
 '09\nEmpire State Building',
 '10\n9/11 Memorial And Museum',
 '11\nHigh Line',
 '12\nBrooklyn Bridge',
 '13\nFifth Avenue',
 '14\nOne World Observatory',
 '15\nWall Street',
 '16\nNew York Public Library',
 '17\nSt. Patricks Cathedral',
 '18\nSolomon R. Guggenheim Museum',
 '19\nBryant Park',
 '20\nCarnegie Hall',
 '21\nAmerican Museum Of Natural History',
 '22\nWashington Square Park',
 '23\nRadio City Music Hall',
 '24\nLiberty Island',
 '25\nNew York Harbor']

In [100]:
# perfect, now let's clean this up a bit!
# each entry looks like '<rank>\n<name>', so we split on the newline ('\n')
# and keep index [1]: the piece AFTER the split, in this case the name

attractions = []
for item in attractions_list:
    rank_and_name = item.split('\n')
    attractions.append(rank_and_name[1])
attractions

['Statue Of Liberty',
 'Times Square',
 'Central Park',
 'Metropolitan Museum Of Art',
 'Broadway And The Theater District',
 'The Museum Of Modern Art',
 'World Trade Center',
 'Rockefeller Center',
 'Empire State Building',
 '9/11 Memorial And Museum',
 'High Line',
 'Brooklyn Bridge',
 'Fifth Avenue',
 'One World Observatory',
 'Wall Street',
 'New York Public Library',
 'St. Patricks Cathedral',
 'Solomon R. Guggenheim Museum',
 'Bryant Park',
 'Carnegie Hall',
 'American Museum Of Natural History',
 'Washington Square Park',
 'Radio City Music Hall',
 'Liberty Island',
 'New York Harbor']

In [106]:
# now we have a list of attractions. 
# I am going to add a lat/lng column here then manually populate those values
# this will possibly be used as an additional filter for Kepler
# will likely use this to demonstrate rider activity in relation to landmarks!

In [119]:
# one-column DataFrame holding the cleaned attraction names
attractions_df = pd.DataFrame({'attraction': attractions})
attractions_df

Unnamed: 0,attraction
0,Statue Of Liberty
1,Times Square
2,Central Park
3,Metropolitan Museum Of Art
4,Broadway And The Theater District
5,The Museum Of Modern Art
6,World Trade Center
7,Rockefeller Center
8,Empire State Building
9,9/11 Memorial And Museum


In [121]:
# add empty-string placeholder columns for the coordinates;
# the latitude and longitude are hand populated after exporting to .csv

for coord_col in ('lat', 'lng'):
    attractions_df[coord_col] = ''

attractions_df

Unnamed: 0,attraction,lat,lng
0,Statue Of Liberty,,
1,Times Square,,
2,Central Park,,
3,Metropolitan Museum Of Art,,
4,Broadway And The Theater District,,
5,The Museum Of Modern Art,,
6,World Trade Center,,
7,Rockefeller Center,,
8,Empire State Building,,
9,9/11 Memorial And Museum,,


In [124]:
# cool! write the frame to a .csv in the working directory;
# the lat/lng values get added to that file by hand afterwards
# NOTE(review): the row index is written as the first (unnamed) column - confirm
# that whatever reads this file back expects it.

attractions_df.to_csv(path_or_buf='attractions_scraped.csv')