In [18]:
#Import necessary libraries

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import requests

In [19]:
#This is the webscraper that goes through every page on the MoCO Parks website

extensions = []
names = []

for i in np.arange(1, 18, 1):
    #17 pages on MoCo website, so this will go through every page
    url = "https://montgomeryparks.org/parks-trails/parks/page/" + str(i) + "/"
    response = urlopen(Request(url=url, headers={"user-agent": "my-app"}))
    doc = BeautifulSoup(response, "html.parser")

    #Getting headers of page name, so park name
    tags = "h3"
    for i in doc.find_all(tags):
        j = i.string
        names.append(j)
        j1 = j.lower()
        j2 = j1.replace(" ", "-") + "/"
        j3 = j2.replace(".", "").replace("'", "")
        extensions.append(j3)

In [20]:
#Extensions for every park --> attach to https://montgomeryparks.org/parks-and-trails/

extensions_new = []

for i in extensions:
    k = i.split(extensions[-8][6])
    extensions_new.append("".join(k))



extensions = extensions_new

In [21]:
#This code cell is meant to save the list *extensions* as a dataframe so it is not lost

extensions_df = pd.DataFrame(data={"Extensions": extensions})
extensions_df.to_csv("extensions.csv")

In [22]:
#Attach url extensions to main url
urls = []

for i in extensions:
    urls.append("https://montgomeryparks.org/parks-and-trails/" + i)

def isascii(s):
    """Check if the characters in string s are in ASCII, U+0-U+7F."""
    return len(s) == len(s.encode())


#Trying to find if there are any urls with special/non-ascii characters that can't be processed
#In this case, it is alright to remove them
for i in np.arange(0, len(urls)):
    if isascii(urls[i]) == False:
        print(i)
        print(urls[i])
        #urls.remove(i)

len(urls)

63
https://montgomeryparks.org/parks-and-trails/caroline-freeland-urban park/
68
https://montgomeryparks.org/parks-and-trails/cedar-island-conservation park/


410

In [27]:
names.remove(names[63])
urls.remove(urls[63])
#names.remove(names[69])

In [30]:
next_remove = names[67]
names.remove(next_remove)
urls.remove(urls[67])

In [31]:
#Removing parks with names that have special characters, now will check to see if *names* and *urls* list match up
len(names) == len(urls)

True

In [32]:
#Get park descriptions for each park

texts = []

for i in urls:
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15'}
    r = requests.get(i, headers=headers, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        tx=soup.find("div", {"class": "p-y"}).get_text()
        texts.append(tx)
    except:
        texts.append("N/A")

In [37]:
texts_df = pd.DataFrame(data={"Name": names, "URL":urls, "Text":texts})
#texts_df.to_csv("texts.csv")

In [38]:
texts_df = texts_df[(texts_df["Text"] != "N/A") & (texts_df["Text"] != "\n\n ")].reset_index().drop("index", axis=1)
texts_df

Unnamed: 0,Name,URL,Text
0,Aberdeen Local Park,https://montgomeryparks.org/parks-and-trails/a...,"\nThis 14.5-acre park has a lot to offer, incl..."
1,Acorn Urban Park,https://montgomeryparks.org/parks-and-trails/a...,\nVisit this small park in downtown Silver Spr...
2,Adventure Conservation Park,https://montgomeryparks.org/parks-and-trails/a...,\nAdventure Conservation Park was established ...
3,Agricultural History Farm Park,https://montgomeryparks.org/parks-and-trails/a...,"\nSee the past, present, and future of Montgom..."
4,Amity Drive Neighborhood Park,https://montgomeryparks.org/parks-and-trails/a...,\nAmity Drive Neighborhood Park features an ac...
...,...,...,...
375,Woodfield Local Park,https://montgomeryparks.org/parks-and-trails/w...,\nThis park sits adjacent to the Great Seneca ...
376,Woodlawn Manor Cultural Park,https://montgomeryparks.org/parks-and-trails/w...,\nWelcome to Woodlawn Manor Cultural Park!\n\n...
377,Woodside Urban Park,https://montgomeryparks.org/parks-and-trails/w...,\nWoodside Urban Park features several recreat...
378,Woodstock Equestrian Special Park,https://montgomeryparks.org/parks-and-trails/w...,\nWoodstock Equestrian Special Park sits in th...


In [39]:
park_info_list = []
amenities=['tennis', 'basketball', 'playground', 'gazebo', 'soccer','volleyball','grills','shelter','trails', 'picnic']

for i in texts_df["Text"]:
    park_info = dict()
    ac = re.search("(\d+\.?\d+)\sacres? | (\d+\.?\d+)\-acre", i)
    if ac == None:
        park_info["size"] = 0
    elif "-" in ac.group(0):
        j = ac.group(0).split("-")
        park_info["size"] = j[0]
    else:
        j = ac.group(0).split(" ")
        park_info["size"] = j[0]

    year = re.search("\s(\d\d\d\d)", i)
    if year == None:
        park_info["year"] = 0
    else:
        park_info["year"] = year.group(0)

    words = i.split()
    for a in amenities:
                # Check if the word is already in dictionary
        if a in words:
                    # Increment count of word by 1
            park_info[a] =  1
        else:
                    # Add the word to dictionary with count 1
            park_info[a] = 0
    park_info_list.append(park_info)

In [40]:
#Creating a large dataframe with park details and amenities
park_info_list_df = pd.DataFrame(park_info_list)
df = pd.concat([texts_df, park_info_list_df], axis=1)
df

Unnamed: 0,Name,URL,Text,size,year,tennis,basketball,playground,gazebo,soccer,volleyball,grills,shelter,trails,picnic
0,Aberdeen Local Park,https://montgomeryparks.org/parks-and-trails/a...,"\nThis 14.5-acre park has a lot to offer, incl...",14.5,1983,1,1,0,0,1,1,0,0,0,1
1,Acorn Urban Park,https://montgomeryparks.org/parks-and-trails/a...,\nVisit this small park in downtown Silver Spr...,300,1850,0,0,0,0,0,0,0,0,0,0
2,Adventure Conservation Park,https://montgomeryparks.org/parks-and-trails/a...,\nAdventure Conservation Park was established ...,14,1969,0,0,0,0,0,0,0,0,0,0
3,Agricultural History Farm Park,https://montgomeryparks.org/parks-and-trails/a...,"\nSee the past, present, and future of Montgom...",455,0,0,0,0,0,0,0,0,0,0,0
4,Amity Drive Neighborhood Park,https://montgomeryparks.org/parks-and-trails/a...,\nAmity Drive Neighborhood Park features an ac...,0,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,Woodfield Local Park,https://montgomeryparks.org/parks-and-trails/w...,\nThis park sits adjacent to the Great Seneca ...,0,1994,0,0,0,0,0,0,0,0,0,0
376,Woodlawn Manor Cultural Park,https://montgomeryparks.org/parks-and-trails/w...,\nWelcome to Woodlawn Manor Cultural Park!\n\n...,0,2017,0,0,0,0,0,0,0,0,0,0
377,Woodside Urban Park,https://montgomeryparks.org/parks-and-trails/w...,\nWoodside Urban Park features several recreat...,2.3,1973,1,1,0,0,0,1,0,0,0,0
378,Woodstock Equestrian Special Park,https://montgomeryparks.org/parks-and-trails/w...,\nWoodstock Equestrian Special Park sits in th...,872,2009,0,0,0,0,0,0,0,0,1,0


In [41]:
#df.to_csv("Park Info Dataframe.csv")

In [44]:
#Time to get addresses for all the parks
#Addresses are enetered in numerous different formats, so we have to use code that is flexible for many cases
park_df = pd.read_csv("/work/Park Info Dataframe.csv")

addresses = []

for i in park_df["URL"]:
    response = urlopen(Request(url=i, headers={"user-agent": "my-app"}))
    doc = BeautifulSoup(response, "html.parser")

    tags1 = "dd"
    try:
        g = str(doc.find_all(tags)[1])
        g = g.replace("\n", "")
        g2 = re.findall("\<dd\>\<p\>([\d\s\w]+)\.\<br\>([\d\,\s\w]+)\<\/br\>", g)
        if len(g2) > 0:
            string1 = g2[0][0] + ", " + g2[0][1]
            addresses.append(string1)
        else:
            g3 = re.findall("(\d+\s\w+\s\w+)\.\s(\w+\,\sMD\d{0, 6})", g)
            if len(g3) > 0:
                string2 = g3[0][0] + ", " + g3[0][1]
                addresses.append(string2)
            else:
                g4 = re.findall("\<p\>([\d\s\w]+)\<br\>([\d\w\s\,]+)\<", g)
                if len(g4) > 0:
                    string4 = g4[0][0] + ", " + g4[0][1]
                    addresses.append(string4)
                else:
                    addresses.append(0)
    except:
        addresses.append(0)


In [45]:
#Getting park identifier codes
#Have to use try/except here because one park has a code that can't be read --> because it's one case, we can
#manually enter it
codes = []

for i in park_df["URL"]:
    response = urlopen(Request(url=i, headers={"user-agent": "my-app"}))
    doc = BeautifulSoup(response, "html.parser")

    try:
        g = doc.find_all("a", {"class": "modal-popup btn"})[0]
        codes.append(re.findall("\?find\=(\w\d\d)\"\s", str(g))[0])
    except:
        codes.append(0)

In [46]:
#Updating park_df dataframe to include addresses and codes
park_df["Address"] = addresses
park_df["Code"] = codes

In [47]:
#Download dataframe
#park_df.to_csv("Park Dataframe.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0e770588-8ebc-4bcb-bac2-a1aa232843cc' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>