In [190]:
# This project retrieves data on Crossboundary Energy,
# specifically data on projects completed for different clients in various countries

# Author: Tobi Williams Babatunde

# Import libraries
import requests 
import lxml
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import time

base_url = "https://crossboundaryenergy.com/projects"

# Setup crawling parameters
# Browser-like headers for the plain `requests` calls made in the per-project
# cell below; the Selenium session in this cell does not use them.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
}

print("Libraries imported successfully")

# Define the amount of scrolling (in pixels) and the delay between scrolls
scroll_increment = 500  # Scroll down by 500 pixels each time
scroll_delay = 1  # Delay in seconds between each scroll

# Start the browser only once everything else is configured, and guarantee it
# is shut down even if loading or scrolling fails (otherwise the Chrome
# process would be left running).
driver = webdriver.Chrome()
try:
    # Load the webpage
    driver.get(base_url)

    # Scroll down gradually until the bottom of the page is reached.
    # The page height is re-read after every step so that any content added
    # to the document while scrolling extends the loop.
    page_height = driver.execute_script("return document.body.scrollHeight")
    scroll_position = 0
    while scroll_position < page_height:
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(scroll_delay)
        scroll_position += scroll_increment
        page_height = driver.execute_script("return document.body.scrollHeight")

    # Once at the bottom of the page, retrieve the page source
    page_source = driver.page_source
finally:
    # Close the WebDriver session
    driver.quit()

# Parse the downloaded HTML with BeautifulSoup
try:
    soup_1 = BeautifulSoup(page_source, 'lxml')
    print('HTML page parsed successfully\n')
except Exception as e:
    print('Failed to retrieve the HTML content. Exception:{}'.format(e))
    # Re-raise: without `soup_1` the next statement would only fail with an
    # unrelated NameError that hides the real cause.
    raise

# Every project on the listing page is rendered as an <h2> with this class string
project_name = soup_1.find_all("h2","subtitle mb-sm w-full grow")

Libraries imported successfully
HTML page parsed successfully



In [245]:
# Retrieve data for each project
# Each successfully scraped project is appended to `project_list` as a dict;
# projects whose page cannot be fetched or parsed are reported and skipped.
project_list = []
project_data = {}

base_project_url = "https://crossboundaryenergy.com/project/"

# Seconds to wait for a project page before giving up, so that a single
# stalled connection cannot hang the whole run
request_timeout = 30

# Project names whose URL slug doesn't follow the general rule
slug_overrides = {
    'national-cement-nakuru': 'nc-nakuru',
    'brush-manufacturers': 'teepee-brush-manufacturers',
}

# Iterate through each project page
counter = 0  # number of projects scraped successfully
for item in project_name:

    project_title = item.get_text()

    # Format each project name for use in url request
    project = project_title.lower().replace(" – "," ").replace(" ","-")
    project = slug_overrides.get(project, project)

    try:
        # Request for project url
        response = requests.get(base_project_url+project, headers=headers, timeout=request_timeout)
        # Fail fast on 4xx/5xx instead of trying to parse an error page
        response.raise_for_status()
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'lxml')

        country = soup.find("div","font-subtitle").get_text()

        # Generation Size, Customer Segment, Mounting Type
        generation_customer_mounting = soup.find_all("div","py-sm first:pt-0 flex flex-row gap-lg justify-between items-center border-b border-textColor")
        try:
            generation_size = generation_customer_mounting[0].findChild("span","body text-right").get_text()
            customer_segment = generation_customer_mounting[1].find("span", "body scale-100").get_text()
            mounting_type = generation_customer_mounting[2].find("span", "body scale-100").get_text()
        except (AttributeError, IndexError):
            # The first lookup did not match (missing span or too few rows):
            # retry with every row shifted down by one position
            generation_size = generation_customer_mounting[1].findChild("span","body text-right").get_text()
            customer_segment = generation_customer_mounting[2].find("span", "body scale-100").get_text()
            mounting_type = generation_customer_mounting[3].find("span", "body scale-100").get_text()

        # Commission date, Estimated overall renewable energy contribution
        commission_date_re_contribution = soup.find_all("div","py-sm last:pb-0 flex flex-row gap-lg justify-between items-center border-b border-textColor last:border-b-0")
        commission_date = commission_date_re_contribution[0].find("span", "body text-right").get_text()
        re_contribution = commission_date_re_contribution[1].find("span", "body text-right").get_text()

        # Partners
        partner_list = soup.find_all("div","py-sm first:pt-0 last:pb-0 flex flex-row gap-lg justify-between items-center border-b border-textColor last:border-b-0")
        try:
            panel_count = partner_list[0].find("span", "body text-right").get_text()
            partners = partner_list[1].find("span", "body text-right").get_text()
        except (AttributeError, IndexError):
            # Only one matching row was found: treat it as the partners row
            # and record the panel count as missing
            panel_count = None
            partners = partner_list[0].find("span", "body text-right").get_text()

        # Store data in a fresh dictionary per project so the records
        # appended to `project_list` never share state
        project_data = {
            'project_name': project_title,
            'country': country,
            'generation_size': generation_size,
            'customer_segment': customer_segment,
            'mounting_type': mounting_type,
            'panel_count': panel_count,
            'commission_date': commission_date,
            're_contribution': re_contribution,
            'partners': partners,
        }

        # Compile project data
        project_list.append(project_data)

        counter+=1

        # progress bar
        print(f"\rProgress: {counter}/{len(project_name)}", end='', flush=True)

    except Exception as e:
        # Best effort: report which project failed (network, HTTP status or
        # unexpected page layout) and carry on with the next one
        print('Failed to retrieve data for project "{}". Exception:{}'.format(project_title, e))


    

Progress: 23/23

In [248]:
# Check output: display the first scraped record to sanity-check the field values
# (raises IndexError if no project was scraped successfully)
project_list[0]

{'project_name': 'Balama',
 'country': 'Mozambique',
 'generation_size': '11.2 MWp',
 'customer_segment': 'Mining',
 'mounting_type': 'Ground',
 'panel_count': None,
 'commission_date': 'Q2 2023',
 're_contribution': '35%',
 'partners': 'Solarcentury Africa'}

In [249]:
# Save project data list to file with Python's native csv module

# Without at least one record there are no column names to derive, so stop
# with a clear message instead of an opaque IndexError
if not project_list:
    raise ValueError("project_list is empty - no project data to save")

# Define fieldnames (column order follows the keys of the first record)
fieldnames = project_list[0].keys()

# Write the data to a csv file
# An explicit UTF-8 encoding keeps the output identical across platforms and
# avoids UnicodeEncodeError on non-ASCII project or partner names when the
# system default is a legacy code page.
with open('crossboundary_energy_projects.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write header and rows
    writer.writeheader()
    writer.writerows(project_list)

print("Successfully saved data to file")

Successfully saved data to file
