# PCNSW Church Directory Map
This notebook extracts church details from the PCNSW website, geolocates each church, and then builds an interactive Folium map that displays each church's location and other details.

## Import Modules

In [1]:
import requests
from bs4 import BeautifulSoup
import re


In [62]:
import pandas as pd
import numpy as np


In [308]:
import folium
from folium.plugins import MarkerCluster, Search
from branca.colormap import linear


In [134]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from time import sleep
import time

## Define web scraping parameters

In [25]:
# Step 1: Build the list of directory listing pages to scan for church links.
# The first page has no numeric suffix; pages 2 to 7 append "/<page number>".
base_url = 'https://preskey.org.au'
listing_path = base_url + '/index.php/new-south-wales-act'
urls = [listing_path] + [f"{listing_path}/{page}" for page in range(2, 8)]

In [26]:
# Step 2: Collect all church links across the listing pages.
# A link counts as a church page when its href carries a
# 'category_id=<digits>' query parameter.
church_link_pattern = re.compile(r'category_id=\d+')

church_links = []
for url in urls:
    # The timeout stops the run from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a', href=True):
        if church_link_pattern.search(link['href']):
            # Assumes the hrefs are site-relative ("/index.php/...") - confirm
            # against the page markup if absolute links ever appear.
            church_links.append(base_url + link['href'])

In [152]:
def _split_label_value(text):
    """Split ``'Label: value'`` at the first colon.

    Returns a ``(label, value)`` tuple of stripped strings, or ``None`` when
    *text* contains no colon.
    """
    label, sep, value = text.partition(":")
    if not sep:
        return None
    return label.strip(), value.strip()


def _text_after_heading(soup, heading):
    """Return the stripped text node that follows the ``<h3>`` titled *heading*.

    Returns "Unknown" when the heading or the following text is missing.
    """
    heading_tag = soup.find('h3', string=heading)
    if heading_tag:
        text_node = heading_tag.find_next_sibling(string=True)
        if text_node:
            return text_node.strip()
    return "Unknown"


def extract_church_data(url):
    """Scrape one church detail page and return its fields as a dict.

    Args:
        url: Address of the church detail page.

    Returns:
        A dict with the keys 'Name', 'Subtitle', 'AM Service Time',
        'PM Service Time', 'Street', 'City', 'State', 'Postcode', 'Phone',
        'Email', 'Website', 'Minister/Pastor', 'URL' and 'Presbytery'.
        Fields that cannot be found hold the placeholder "Unknown", except
        'Presbytery', which uses lowercase 'unknown'.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the request times out.
    """
    # The timeout stops one unresponsive page from hanging the whole scrape.
    church_page = requests.get(url, timeout=30)
    church_soup = BeautifulSoup(church_page.text, 'html.parser')

    # Church name from the <h1> tag (looked up once, then reused)
    title_tag = church_soup.find('h1', class_='pos-title')
    church_name = title_tag.string.strip() if title_tag and title_tag.string else "Unknown"

    # Subtitle, if available
    subtitle_tag = church_soup.find('p', class_='pos-subtitle')
    subtitle = subtitle_tag.string.strip() if subtitle_tag and subtitle_tag.string else "Unknown"

    # AM and PM service times (optional fields)
    am_service_time = _text_after_heading(church_soup, "AM Service Time")
    pm_service_time = _text_after_heading(church_soup, "PM Service Time")

    # Address details: each <li> reads "Label: value". The label is matched
    # against the text before the first colon only, so a value that happens
    # to contain another label's name cannot be filed under the wrong field,
    # and values that themselves contain a colon are kept whole.
    address = dict.fromkeys(('Street', 'City', 'State', 'Postcode'), "Unknown")
    address_section = church_soup.find('div', class_='pos-address')
    if address_section:
        for item in address_section.find_all('li'):
            parsed = _split_label_value(item.get_text())
            if parsed is None:
                # No colon: nothing to extract (previously an IndexError).
                continue
            label_text, value = parsed
            for field in address:
                if field in label_text:
                    address[field] = value
                    break

    # Contact details
    phone = website = email = "Unknown"

    contact_section = church_soup.find('div', class_='pos-contact')
    if contact_section:
        # NOTE(review): string= only matches an <li> whose sole child is a
        # text node. The sample output shows Phone and Email as "Unknown" in
        # every displayed row, so these two filters may never match - confirm
        # against the page markup.
        phone_item = contact_section.find('li', string=re.compile('Phone:'))
        if phone_item:
            phone = phone_item.get_text().partition(":")[2].strip()

        # The first link in the contact section is taken as the website.
        website_link = contact_section.find('a', href=True)
        if website_link:
            website = website_link['href'].strip()

        email_item = contact_section.find('li', string=re.compile('E-mail:'))
        if email_item:
            email = email_item.get_text().partition(":")[2].strip()

    # Minister/Pastor name: text of the first link after the heading
    minister = "Unknown"
    minister_section = church_soup.find('h3', string="Minister/Pastor")
    if minister_section:
        minister_link = minister_section.find_next('a', href=True)
        if minister_link:
            minister = minister_link.get_text().strip()

    # Presbytery: the lowercase 'unknown' placeholder is kept because the
    # map colouring code keys on exactly that spelling.
    presbytery_tag = church_soup.find('div', class_='element element-relateditems')
    presbytery = presbytery_tag.get_text().strip() if presbytery_tag else 'unknown'

    return {
        'Name': church_name,
        'Subtitle': subtitle,
        'AM Service Time': am_service_time,
        'PM Service Time': pm_service_time,
        'Street': address['Street'],
        'City': address['City'],
        'State': address['State'],
        'Postcode': address['Postcode'],
        'Phone': phone,
        'Email': email,
        'Website': website,
        'Minister/Pastor': minister,
        'URL': url,  # Just for reference
        'Presbytery': presbytery
    }

In [149]:
def _text_after_colon(tag, default='unknown'):
    """Return the stripped text after the first colon in *tag*'s text.

    Returns *default* when *tag* is None or its text contains no colon.
    """
    if tag is None:
        return default
    _label, sep, value = tag.get_text().partition(":")
    return value.strip() if sep else default


def modified_extract_church_data(url):
    """Scrape one church detail page, tolerating failures.

    Any exception raised while fetching or parsing is printed and the fields
    gathered so far are returned, so one bad page does not stop a batch run.

    Args:
        url: Address of the church detail page.

    Returns:
        A dict with the keys 'Name', 'Subtitle', 'Minister', 'AM Service',
        'PM Service', 'Street', 'City', 'State', 'Postcode', 'Phone',
        'Website', 'Email' and 'Presbytery'. 'Name' defaults to None and
        every other field to 'unknown'. 'Email' is never filled in by this
        function.
    """
    result = {
        'Name': None,
        'Subtitle': 'unknown',
        'Minister': 'unknown',
        'AM Service': 'unknown',
        'PM Service': 'unknown',
        'Street': 'unknown',
        'City': 'unknown',
        'State': 'unknown',
        'Postcode': 'unknown',
        'Phone': 'unknown',
        'Website': 'unknown',
        'Email': 'unknown',
        'Presbytery': 'unknown'
    }

    try:
        # Fetch the church page. The parameter is `url`; the previous code
        # referenced an undefined `church_url` here and in the error handler.
        church_page = requests.get(url, timeout=30)
        soup = BeautifulSoup(church_page.content, 'html.parser')

        # Extract church name
        title_tag = soup.find('h1', class_='pos-title')
        if title_tag:
            result['Name'] = title_tag.get_text().strip()

        # Extract subtitle
        subtitle_tag = soup.find('p', class_='pos-subtitle')
        if subtitle_tag:
            result['Subtitle'] = subtitle_tag.get_text().strip()

        # Extract service times: the value is the text node right after the
        # heading. `string=` replaces the deprecated `text=` spelling.
        for key, heading in (('AM Service', "AM Service Time"),
                             ('PM Service', "PM Service Time")):
            heading_tag = soup.find('h3', string=heading)
            if heading_tag:
                service_time = heading_tag.find_next_sibling(string=True)
                result[key] = service_time.strip() if service_time else 'unknown'

        # Extract address details; each item reads "Label: value"
        address_block = soup.find('div', class_='pos-address')
        if address_block:
            result['Street'] = _text_after_colon(
                address_block.find('li', class_='element element-text first'))
            result['City'] = _text_after_colon(
                address_block.find('li', class_='element element-text'))
            result['State'] = _text_after_colon(
                address_block.find('li', class_='element element-select'))
            result['Postcode'] = _text_after_colon(
                address_block.find('li', class_='element element-text last'))

        # Extract phone and website
        contact_block = soup.find('div', class_='pos-contact')
        if contact_block:
            website = contact_block.find('a', href=True)
            if website:
                result['Website'] = website['href'].strip()
            result['Phone'] = _text_after_colon(
                contact_block.find('li', class_='element element-text first'))

        # Extract minister/pastor name: text of the first link after the heading
        minister_tag = soup.find('h3', string="Minister/Pastor")
        if minister_tag:
            minister_name = minister_tag.find_next('a')
            if minister_name:
                result['Minister'] = minister_name.get_text().strip()

        # Extract Presbytery
        presbytery_tag = soup.find('div', class_='element element-relateditems')
        if presbytery_tag:
            result['Presbytery'] = presbytery_tag.get_text().strip()

    except Exception as e:
        # Best-effort scrape: report the failure and return what was gathered.
        print(f"Error extracting data for {url}: {e}")

    return result

## Collect church data

In [153]:
# Step 4: Scrape every church detail page, keeping one record per link
church_data = [extract_church_data(church_link) for church_link in church_links]


In [154]:
# Step 5: Convert the collected data into a pandas DataFrame
# (one row per scraped church record, one column per record key)
df = pd.DataFrame(church_data)


In [155]:
# Preview the first few rows to sanity-check the scrape
df.head()

Unnamed: 0,Name,Subtitle,AM Service Time,PM Service Time,Street,City,State,Postcode,Phone,Email,Website,Minister/Pastor,URL,Presbytery
0,Abbotsford,Unknown,9.30,6.00,443 Great North Rd,Abbotsford,New South Wales,2046,Unknown,Unknown,http://www.abbotsfordpc.org,Peter Morphew,https://preskey.org.au/index.php/new-south-wal...,Presbytery of Sydney Inner West
1,Albion Park,Unknown,9.45,6.00,250 Tongarra Rd,Albion Park,New South Wales,2527,Unknown,Unknown,http://albionparkpc.org.au,Gordon Coleman,https://preskey.org.au/index.php/new-south-wal...,Presbytery of the Illawarra
2,Albury,Unknown,See sidebar for Service times,Unknown,402 Wagga Rd,Lavington,New South Wales,2541,Unknown,Unknown,http://www.alburychurch.org.au,David Hayes,https://preskey.org.au/index.php/new-south-wal...,Presbytery of Southern Rivers
3,Annandale-Leichhardt,Unknown,10.00 (Sunday School each Sunday at 10),Unknown,Cnr Johnston & Collins Sts,Annandale,New South Wales,2038,Unknown,Unknown,http://hunterbaillie.com,Thomas Wall,https://preskey.org.au/index.php/new-south-wal...,Presbytery of Sydney Inner West
4,"Arabic, Evangelical",Unknown,Unknown,Unknown,236 Targo Rd,Toongabbie,New South Wales,2146,Unknown,Unknown,Unknown,Nabil Zakhary,https://preskey.org.au/index.php/new-south-wal...,unknown


In [156]:
# Summarise the columns: dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Name             252 non-null    object
 1   Subtitle         252 non-null    object
 2   AM Service Time  252 non-null    object
 3   PM Service Time  252 non-null    object
 4   Street           252 non-null    object
 5   City             252 non-null    object
 6   State            252 non-null    object
 7   Postcode         252 non-null    object
 8   Phone            252 non-null    object
 9   Email            252 non-null    object
 10  Website          252 non-null    object
 11  Minister/Pastor  252 non-null    object
 12  URL              252 non-null    object
 13  Presbytery       252 non-null    object
dtypes: object(14)
memory usage: 27.7+ KB


In [157]:
# Step 6: Save the scraped data to a CSV file (optional).
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('church_data_extended.csv', index=False)

## Geocode the location of each church

In [158]:
# Initialize the Nominatim geocoder shared by the geocoding helpers below;
# user_agent is the application name sent with each request.
geolocator = Nominatim(user_agent="church_locator")

In [159]:
def get_lat_lon(address):
    """
    Look up the latitude and longitude of an address using Geopy.

    Returns a (latitude, longitude) tuple, or (None, None) when the geocoder
    finds no match or the lookup raises an error.
    """
    try:
        location = geolocator.geocode(address)
    except Exception:
        # Best-effort lookup: any geocoding failure counts as "not found".
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate, so a long geocoding
        # run can still be interrupted.
        return None, None

    if location:
        return location.latitude, location.longitude
    return None, None

In [160]:
def get_coords_sub(address):
    """Geocode *address* and return (latitude, longitude).

    Returns (nan, nan) when the geocoder finds no match. Errors raised by
    the geocoder are not caught here.
    """
    match = geolocator.geocode(address)
    if not match:
        return np.nan, np.nan
    return match.latitude, match.longitude

In [161]:
def get_coords_with_fallback(street, city, state, postcode, retries=3, delay=1):
    """Geocode an address, falling back to progressively less specific forms.

    The address forms are tried in this order, one per attempt:
        0 - street, city, state, postcode
        1 - street, city, state
        2 - city, state

    Args:
        street, city, state, postcode: Address components.
        retries: Maximum number of attempts, i.e. how many of the address
            forms above are tried.
        delay: Seconds to wait after each unsuccessful attempt.

    Returns:
        (latitude, longitude, attempt), where attempt is the index of the
        address form that succeeded. When nothing succeeds the result is
        (nan, nan, number of attempts made).
    """
    candidates = [
        f"{street}, {city}, {state}, {postcode}, Australia",
        f"{street}, {city}, {state}, Australia",
        f"{city}, {state}, Australia",
    ]

    attempt = 0
    while attempt < min(retries, len(candidates)):
        address = candidates[attempt]
        try:
            lat, lon = get_coords_sub(address)
            # If coordinates are found, return them
            if not np.isnan(lat) and not np.isnan(lon):
                return lat, lon, attempt
        except Exception as e:
            print(f"Error for address {address}")
            print(f"Attempt #{attempt}")
            print(f"Error type: {type(e).__name__}")

        # One increment and one pause per attempt, whether it failed with an
        # error or with "no match". The counter used to advance twice after
        # an error, which silently skipped the next address form.
        attempt += 1
        time.sleep(delay)

    # Return NaN if every attempt fails
    return np.nan, np.nan, attempt


In [162]:
# Pre-create the geocoding result columns; the geocoding loop fills them in
for new_column in ('Latitude', 'Longitude', 'Attempt'):
    df[new_column] = None

## Collect geocode data

In [163]:
# Geocode every church, trying progressively less specific address forms
for idx, church in df.iterrows():
    coords = get_coords_with_fallback(
        church['Street'], church['City'], church['State'], church['Postcode']
    )

    # Store the coordinates together with the attempt number that produced them
    df.at[idx, 'Latitude'], df.at[idx, 'Longitude'], df.at[idx, 'Attempt'] = coords
    sleep(1)  # Pause for 1 second per row to respect the geocoding service's rate limit


Error for address 443 Great North Rd, Abbotsford, New South Wales, 2046, Australia
Attempt #0
Error type: GeocoderUnavailable
Error for address 29 Queen St, Auburn, New South Wales, 2144, Australia
Attempt #0
Error type: GeocoderUnavailable
Error for address 7 Campbell St, Balmain, New South Wales, 2041, Australia
Attempt #0
Error type: GeocoderUnavailable
Error for address 269 Forest Rd, Bexley, New South Wales, 2207, Australia
Attempt #0
Error type: GeocoderUnavailable
Error for address 42 John St, Camden, New South Wales, 2570, Australia
Attempt #0
Error type: GeocoderUnavailable
Error for address 182 Concord Rd, North Stratfield, New South Wales, 2137, Australia
Attempt #0
Error type: GeocoderUnavailable
Error for address 64 George St, East Maitland, New South Wales, 2323, Australia
Attempt #0
Error type: GeocoderUnavailable
Error for address 14-16 Young St, West Gosford, New South Wales, 2250, Australia
Attempt #0
Error type: GeocoderUnavailable
Error for address 116 Oliver St, Gr

In [164]:
# Test to see how many churches did not get location data assigned
# (rows whose Latitude is still missing after geocoding)
df[df.Latitude.isna()]


Unnamed: 0,Name,Subtitle,AM Service Time,PM Service Time,Street,City,State,Postcode,Phone,Email,Website,Minister/Pastor,URL,Presbytery,Latitude,Longitude,Attempt
55,Concord (North Strathfield),Unknown,9.00 & 10.30,Unknown,182 Concord Rd,North Stratfield,New South Wales,2137,Unknown,Unknown,Unknown,Peter Crickitt,https://preskey.org.au/index.php/new-south-wal...,Presbytery of Sydney Inner West,,,3


In [165]:
# Save again to the same CSV file (optional), now including the
# Latitude, Longitude and Attempt columns
df.to_csv('church_data_extended.csv', index=False)

## Adjust coordinates when two pins share the same location

In [201]:
import random

In [202]:
def adjust_coordinates(df):
    """Add 'Lat_adj' / 'Lon_adj' columns that separate overlapping markers.

    The first row at any (Latitude, Longitude) pair keeps its coordinates.
    Every later row at the same pair is nudged by a random offset of at most
    0.0001 degrees on each axis so its marker does not sit exactly on top of
    the earlier one. Rows with a missing latitude or longitude are copied
    through unchanged and never nudged.

    Args:
        df: DataFrame with 'Latitude' and 'Longitude' columns. It is modified
            in place.

    Returns:
        The same DataFrame, with the two adjusted columns added.
    """
    df['Lat_adj'] = None
    df['Lon_adj'] = None

    seen_coords = set()
    offset = 0.0001  # Small value to nudge markers

    for i, row in df.iterrows():
        lat, lon = row['Latitude'], row['Longitude']

        if pd.isna(lat) or pd.isna(lon):
            # No usable location: there is nothing to nudge, and adding an
            # offset to None would raise a TypeError.
            df.at[i, 'Lat_adj'] = lat
            df.at[i, 'Lon_adj'] = lon
            continue

        if (lat, lon) in seen_coords:
            # Adjust the coordinates slightly to avoid overlap
            lat = lat + random.uniform(-offset, offset)
            lon = lon + random.uniform(-offset, offset)
        else:
            seen_coords.add((lat, lon))  # Mark coordinates as seen

        df.at[i, 'Lat_adj'] = lat
        df.at[i, 'Lon_adj'] = lon

    return df

In [203]:
# Apply the function to adjust overlapping coordinates
# (adds the Lat_adj and Lon_adj columns used when placing markers)
df = adjust_coordinates(df)

In [206]:
# Spot-check the adjustment on the rows whose name contains 'Ashfield'
df[df.Name.str.contains('Ashfield')]

Unnamed: 0,Name,Subtitle,AM Service Time,PM Service Time,Street,City,State,Postcode,Phone,Email,Website,Minister/Pastor,URL,Presbytery,Latitude,Longitude,Attempt,Lat_adj,Lon_adj
6,Ashfield-Petersham,Unknown,"Ashfield: 9:00 am, 10:15am, 6:00 pm & Mandarin...",Unknown,Cnr Liverpool Rd & Knox St,Ashfield,New South Wales,2131,Unknown,Unknown,http://www.ashfieldpresbyterian.org.au,David Balzer,https://preskey.org.au/index.php/new-south-wal...,Presbytery of Sydney Inner West,-33.889478,151.127412,2,-33.889478,151.127412
111,Korean (Ashfield),Unknown,8.30,12.00,Cnr Knox St & Liverpool Rd,Ashfield,New South Wales,2131,Unknown,Unknown,Unknown,Joung-Yeol Kim,https://preskey.org.au/index.php/new-south-wal...,Presbytery of Sydney Inner West,-33.889478,151.127412,2,-33.889558,151.127403


## Create Folium Map

In [299]:
# Marker colours keyed by geocoding attempt number, i.e. by how specific the
# matched address was (change colors to what you like)
specificity_colors = {0: 'darkblue', 1: 'blue', 2: 'gray'}

# Palette of folium icon colour names used to tell the presbyteries apart
folium_colour_options = ['pink', 'black', 'darkpurple', 'lightblue', 'red', 'beige',
                         'darkgreen', 'orange', 'lightgreen', 'lightred', 'darkblue', 'darkred',
                         'cadetblue', 'green', 'lightgray', 'blue', 'purple']
presbyteries = df['Presbytery'].unique()

# Assign colours in order of first appearance, cycling through the palette
# with modulo so any number of presbyteries gets a colour. The previous
# single wrap-around raised IndexError beyond twice the palette length.
presbytery_colors = {
    presbytery: folium_colour_options[i % len(folium_colour_options)]
    for i, presbytery in enumerate(presbyteries)
}

# Churches without a presbytery always get gray
presbytery_colors['unknown'] = 'gray'

## Create as two separate maps

In [275]:
# Step 2: Create two folium maps, one per colouring scheme, both starting
# (approximately) over Sydney at zoom level 10
m_spec = folium.Map(location=[-33.8688, 151.2093], zoom_start=10)  # Centered around Sydney, adjust as needed
m_pres = folium.Map(location=[-33.8689, 151.2094], zoom_start=10)  # Markers on this map are coloured by presbytery

Create the layers to display colour either by address specificity or by Presbytery

Loop through the dataframe and add all the markers

In [277]:
import html


def _is_known(value):
    """Return True when *value* is real data rather than a missing or 'Unknown' placeholder."""
    return not pd.isna(value) and str(value).strip().lower() not in ('', 'unknown')


def _popup_html(row):
    """Build the popup HTML for one church row of the DataFrame.

    All scraped text is HTML-escaped, because folium places popup strings
    into the generated page as HTML. Fields holding a placeholder
    ('Unknown' in any letter case) are left out, except the minister line,
    which is always shown.
    """
    def esc(value):
        return html.escape(str(value))

    minister = row['Minister/Pastor'] if _is_known(row['Minister/Pastor']) else "Unknown"

    content = f"<b>{esc(row['Name'])}</b><br>"
    if _is_known(row['Subtitle']):
        content += f"<i>{esc(row['Subtitle'])}</i><br>"
    content += f"Minister: {esc(minister)}<br>"
    if _is_known(row['AM Service Time']):
        content += f"AM Service: {esc(row['AM Service Time'])}<br>"
    if _is_known(row['PM Service Time']):
        content += f"PM Service: {esc(row['PM Service Time'])}<br>"
    if _is_known(row['Website']):
        content += f"<a href='{esc(row['Website'])}' target='_blank'>Website</a><br><br>"

    # Address details
    if _is_known(row['Street']):
        content += f"{esc(row['Street'])}<br>"

    # Join only the parts that exist, so a missing city or postcode does not
    # leave a dangling ", " in the popup.
    locality = ", ".join(esc(row[col]) for col in ('City', 'Postcode') if _is_known(row[col]))
    if locality:
        content += f"{locality}<br>"

    return content


# Loop through the DataFrame and add one marker per church to each map
for i, row in df.iterrows():
    lat = row['Lat_adj']
    lon = row['Lon_adj']
    if pd.isna(lat) or pd.isna(lon):
        continue  # Not geocoded: nothing to place on the map

    church_name = row['Name']
    popup_content = _popup_html(row)

    # Marker colour by address specificity (attempt number) and by
    # presbytery, falling back to gray for unknown values
    spec_color = specificity_colors.get(row['Attempt'], 'gray')
    pres_color = presbytery_colors.get(row['Presbytery'], 'gray')

    # Each marker gets its own Popup object
    for target_map, marker_color in ((m_spec, spec_color), (m_pres, pres_color)):
        folium.Marker(
            location=[lat, lon],
            popup=folium.Popup(popup_content, max_width=300),
            tooltip=church_name,
            icon=folium.Icon(color=marker_color)
        ).add_to(target_map)
    

Add all the layers to the map with controller

In [278]:
# Step 4: Save each map to its own HTML file
m_spec.save('pressy_churches_by_address_map.html')
m_pres.save('pressy_churches_by_pres_map.html')

In [274]:
# Step 5: Display the address-specificity map (optional in a Jupyter Notebook)
m_spec

In [279]:
# Display the presbytery-coloured map
m_pres

## Create as a combined map

This allows the user to switch the colours between address specificity and presbytery

In [429]:
# Step 2: Create the single folium map that will hold both marker layers
m = folium.Map(location=[-33.8688, 151.2093], zoom_start=10)  # Centered around Sydney, adjust as needed

In [430]:
# Create FeatureGroups for the two layers (by specificity and by presbytery);
# each group's name is the label given to that layer
specificity_layer = folium.FeatureGroup(name='By Address')
presbytery_layer = folium.FeatureGroup(name='By Presbytery')

In [431]:
import html


def _is_known(value):
    """Return True when *value* is real data rather than a missing or 'Unknown' placeholder."""
    return not pd.isna(value) and str(value).strip().lower() not in ('', 'unknown')


def _popup_html(row):
    """Build the popup HTML for one church row of the DataFrame.

    All scraped text is HTML-escaped, because folium places popup strings
    into the generated page as HTML. Fields holding a placeholder
    ('Unknown' in any letter case) are left out, except the minister line,
    which is always shown.
    """
    def esc(value):
        return html.escape(str(value))

    minister = row['Minister/Pastor'] if _is_known(row['Minister/Pastor']) else "Unknown"

    content = f"<b>{esc(row['Name'])}</b><br>"
    if _is_known(row['Subtitle']):
        content += f"<i>{esc(row['Subtitle'])}</i><br>"
    content += f"Minister: {esc(minister)}<br>"
    if _is_known(row['AM Service Time']):
        content += f"AM Service: {esc(row['AM Service Time'])}<br>"
    if _is_known(row['PM Service Time']):
        content += f"PM Service: {esc(row['PM Service Time'])}<br>"
    if _is_known(row['Website']):
        content += f"<a href='{esc(row['Website'])}' target='_blank'>Website</a><br><br>"

    # Address details
    if _is_known(row['Street']):
        content += f"{esc(row['Street'])}<br>"

    # Join only the parts that exist, so a missing city or postcode does not
    # leave a dangling ", " in the popup.
    locality = ", ".join(esc(row[col]) for col in ('City', 'Postcode') if _is_known(row[col]))
    if locality:
        content += f"{locality}<br>"

    return content


def _add_church_marker(layer, location, popup_content, name, icon_color, circle_color, approximate):
    """Add one church marker to *layer*.

    A church located only by its suburb (*approximate* is true) is drawn as
    a circle in *circle_color*; otherwise a pin with an *icon_color* icon is
    used. Every marker gets its own Popup object.
    """
    popup = folium.Popup(popup_content, max_width=300)
    if approximate:
        marker = folium.CircleMarker(
            location=location,
            popup=popup,
            tooltip=name,
            fill_color=circle_color,
            color=circle_color,
            fill_opacity=0.8,
            radius=11,
        )
    else:
        marker = folium.Marker(
            location=location,
            popup=popup,
            tooltip=name,
            icon=folium.Icon(color=icon_color, icon=""),
            radius=9,
        )
    marker.add_to(layer)


# Loop through the DataFrame and add one marker per church to each layer
for i, row in df.iterrows():
    lat = row['Lat_adj']
    lon = row['Lon_adj']
    if pd.isna(lat) or pd.isna(lon):
        continue  # Not geocoded: nothing to place on the map

    church_name = row['Name']
    popup_content = _popup_html(row)

    # Marker shape is based on specificity: attempt 2 means only the suburb
    # (city and state) was matched, so a circle is used.
    suburb_only = row['Attempt'] == 2

    # Specificity layer: fixed colours, shape carries the information
    _add_church_marker(
        specificity_layer, [lat, lon], popup_content, church_name,
        icon_color="darkblue", circle_color='steelblue', approximate=suburb_only,
    )

    # Presbytery layer: colour by presbytery, gray when it is not in the table.
    # NOTE(review): some folium icon colour names (e.g. 'lightred',
    # 'darkpurple') do not appear to be CSS colour names, so CircleMarker may
    # not render them as intended - confirm in the generated map.
    pres_color = presbytery_colors.get(row['Presbytery'], 'gray')
    _add_church_marker(
        presbytery_layer, [lat, lon], popup_content, church_name,
        icon_color=pres_color, circle_color=pres_color, approximate=suburb_only,
    )
    

In [432]:
# Add both layers to the map
presbytery_layer.add_to(m)
specificity_layer.add_to(m)

# Sanity check: report how many child elements each layer holds
# (`_children` is a private folium attribute, read here only for this count)
print(f"{len(specificity_layer._children)} markers added to specificity layer")
print(f"{len(presbytery_layer._children)} markers added to presbytery layer: ")

251 markers added to specificity layer
251 markers added to presbytery layer: 


In [433]:
# Add a layer control panel to switch between them;
# collapsed=False is passed in place of the default call kept below
#folium.LayerControl().add_to(m)
folium.LayerControl(collapsed=False).add_to(m)

<folium.map.LayerControl at 0x143db1ab0>

In [434]:
# Save the combined map to an HTML file
m.save('pressy_churches_toggle.html')

In [435]:
# Display the combined map in the notebook
m