### To run the scraper every X minutes (depending on your needs), you will need a scheduler. Here are two of the many options available:
1. GitHub Actions
2. Google Cloud Scheduler

In [1]:
# Install PyVirtualDisplay with pip in quiet mode (IPython shell escape).
# NOTE(review): PyVirtualDisplay is not imported in the cells visible here —
# confirm it is still needed.
!pip -q install PyVirtualDisplay

In [5]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from datetime import datetime

from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

import requests 
import json 

In [11]:
import os

# SECURITY NOTE(review): a Google API key is committed in this source file.
# It should be rotated and supplied through the GOOGLE_GEOCODE_API_KEY
# environment variable instead. The literal is kept only as a fallback so
# that existing runs keep behaving exactly as before.
ggeocode = os.environ.get(
    "GOOGLE_GEOCODE_API_KEY",
    'AIzaSyACn8ZsmhM9DjpK6MYUApfscEnQypC6LjY',
)


def get_location_coordinates(location):
    """Geocode *location* with the Google Geocoding API.

    Args:
        location: Free-text place description (for example
            "Callao Anchorage"). ``None`` and blank strings are rejected
            without calling the API.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the first result when
        the API answers with status ``"OK"``; ``(None, None)`` in every other
        case (blank input, network failure, non-JSON body, non-OK status or
        an unexpected payload shape).
    """
    if location is None:
        return (None, None)
    address = str(location).strip()
    if not address:
        # A blank address can never resolve; skip the (billed) API call.
        return (None, None)

    try:
        # Passing the query through ``params`` lets requests URL-encode it,
        # so characters such as "&", "#" or "+" in the address cannot corrupt
        # the query string. The timeout keeps a stalled connection from
        # hanging the whole scrape.
        response = requests.get(
            "https://maps.googleapis.com/maps/api/geocode/json",
            params={"address": address, "key": ggeocode},
            timeout=10,
        )
        geo_js = response.json()
    except (requests.RequestException, ValueError):
        # Network error or a body that is not valid JSON: report "no
        # coordinates" the same way a non-OK status is reported, so one bad
        # lookup does not abort a whole ``Series.apply``.
        return (None, None)

    if not isinstance(geo_js, dict) or geo_js.get("status") != "OK":
        return (None, None)

    try:
        coordinates = geo_js["results"][0]["geometry"]["location"]
        return (coordinates["lat"], coordinates["lng"])
    except (KeyError, IndexError, TypeError):
        # Status was "OK" but the payload does not have the expected shape.
        return (None, None)


In [20]:
# Translation table that deletes newline, tab and carriage-return characters.
_CELL_NOISE = str.maketrans("", "", "\n\t\r")


def _clean_cell(text):
    """Return *text* with newline, tab and carriage-return characters removed."""
    return text.translate(_CELL_NOISE)


def _position_text(narration):
    """Return the narration segment that follows the first "Posn: " marker.

    If the marker occurs more than once, only the text between the first and
    second occurrence is returned. If it is missing, the narration is
    returned unchanged instead of raising.
    """
    segments = narration.split("Posn: ")
    return segments[1] if len(segments) > 1 else narration


def _location_from_position(text):
    """Extract the place name from a position text.

    When the first comma-separated segment contains ":" (treated as a
    coordinate pair), the place name is the second segment up to its first
    "."; otherwise it is the whole text up to its first ".".
    """
    segments = text.split(",")
    if ":" in segments[0] and len(segments) > 1:
        return segments[1].split(".")[0]
    return text.split(".")[0]


def scrapePirates(jsonify=False):
    """Scrape the ICC live piracy report table and geocode each incident.

    The first three cells of every table row are read as attack number,
    narration and incident date. The narration is reduced to its position
    text, a location name is derived from it, and the location is geocoded
    with ``get_location_coordinates``.

    Args:
        jsonify: When true, return the result as a JSON string; otherwise
            append the rows to ``scraped_pirates.csv`` (no header, no index).

    Returns:
        A JSON string in pandas' ``orient="split"`` layout when *jsonify* is
        true, otherwise ``None``.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an HTTP error status.
        RuntimeError: If the page contains no ``<tbody>`` element.
    """
    URL = "https://www.icc-ccs.org/index.php/piracy-reporting-centre/live-piracy-report"

    response = requests.get(URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table_body = soup.find("tbody")
    if table_body is None:
        raise RuntimeError(f"No <tbody> element found at {URL}; page layout may have changed")

    # One timestamp per run, so every row of a scrape carries the same value.
    scrape_date = datetime.today().strftime('%Y-%m-%d %H:%M')

    rest_info = []
    for table_row in table_body.find_all('tr'):
        cells = table_row.find_all('td')
        if len(cells) < 3:
            # Skip incomplete rows explicitly; otherwise values left over
            # from the previous row would be recorded for this one.
            continue
        attack_number, narration, date_of_incident = (
            _clean_cell(cell.text) for cell in cells[:3]
        )
        rest_info.append([attack_number, narration, date_of_incident, scrape_date])

    df_pirates = pd.DataFrame(
        rest_info,
        columns=['attack_nr', 'text', 'date_of_incident', 'scrape_date'],
    )
    df_pirates['text'] = df_pirates['text'].apply(_position_text)
    df_pirates['location'] = df_pirates['text'].apply(_location_from_position)

    # Geocode each distinct location once; repeated locations reuse the result.
    coordinates = {
        place: get_location_coordinates(place)
        for place in df_pirates['location'].unique()
    }
    df_pirates['geo_location'] = df_pirates['location'].apply(coordinates.get)

    if jsonify:
        # The loads/dumps round trip is kept so the returned string has the
        # same formatting as before.
        return json.dumps(json.loads(df_pirates.to_json(orient="split")))

    df_pirates.to_csv("scraped_pirates.csv", mode='a', index=False, header=False)
    return None

In [21]:
# Run one scrape. With the default jsonify=False the rows are appended to
# scraped_pirates.csv and nothing is returned.
scrapePirates()

In [28]:
# Labels for the first five fields of the header-less CSV written by
# scrapePirates.
columns = [
    "attack_nr",
    "text",
    "date_of_incident",
    "scrape_date",
    "location",
]

# Load only the first five fields of each record and name them with `columns`.
test_data = pd.read_csv(
    "scraped_pirates.csv",
    header=None,
    names=columns,
    usecols=list(range(5)),
)

In [29]:
# Preview the first rows of the loaded CSV data.
test_data.head()

Unnamed: 0,attack_nr,text,date_of_incident,scrape_date,location
0,001-23,"12:01.5S – 077:12.0W, Callao Anchorage, Peru.F...",2023-01-09,2023-01-11 22:27,Callao Anchorage
1,001-23,"12:01.5S – 077:12.0W, Callao Anchorage, Peru.F...",2023-01-09,2023-01-11 22:29,Callao Anchorage
2,001-23,"12:01.5S – 077:12.0W, Callao Anchorage, Peru.F...",2023-01-09,2023-01-11 22:30,Callao Anchorage
