In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def fetch_table(url):
    """
    Fetch the HTML content of the webpage.
    """
    try:
        # Send a GET request to fetch the page content
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors
        return response.content
    except requests.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return None

def parse_table(content):
    """
    Parse the HTML content and extract the table into a DataFrame.
    """
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(content, 'lxml')

    # Find the table with the desired data using specific attributes
    table = soup.find('table', {'align': 'center', 'border': '1'})

    if not table:
        print("Table not found. Check the HTML structure.")
        return None

    # Extract table headers
    headers = []
    for td in table.find_all('td', width='7%'):  # Find all <td> elements with width attribute for headers
        header_text = td.find('strong')
        if header_text:
            headers.append(header_text.text.strip())
    print("Headers found:", headers[:13])
    
    # Extract table rows
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip the first header row
        cells = tr.find_all('td', {'width': '7%'})  # Get cells with specific width
        row = [cell.text.strip() for cell in cells]
        rows.append(row)

    # Create a DataFrame
#     print(headers[:13])
    return pd.DataFrame(rows, columns=headers[:13])

# URL of the page
url = "https://origin.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/ONI_v5.php"

# Fetch the table content
content = fetch_table(url)

if content:
    # Parse the table and create DataFrame
    df = parse_table(content)

    if df is not None:
        # Display the DataFrame
        print(df)

        # Save DataFrame to a CSV file
        df.to_csv('ONI_data.csv', index=False)
        df.to_pickle('ONI_data.pkl')
    else:
        print("Failed to create DataFrame.")
else:
    print("Failed to retrieve page content.")


Headers found: ['Year', 'DJF', 'JFM', 'FMA', 'MAM', 'AMJ', 'MJJ', 'JJA', 'JAS', 'ASO', 'SON', 'OND', 'NDJ']
    Year   DJF   JFM   FMA   MAM   AMJ   MJJ   JJA   JAS   ASO   SON   OND  \
0   1950  -1.5  -1.3  -1.2  -1.2  -1.1  -0.9  -0.5  -0.4  -0.4  -0.4  -0.6   
1   1951  -0.8  -0.5  -0.2   0.2   0.4   0.6   0.7   0.9   1.0   1.2   1.0   
2   1952   0.5   0.4   0.3   0.3   0.2   0.0  -0.1   0.0   0.2   0.1   0.0   
3   1953   0.4   0.6   0.6   0.7   0.8   0.8   0.7   0.7   0.8   0.8   0.8   
4   1954   0.8   0.5   0.0  -0.4  -0.5  -0.5  -0.6  -0.8  -0.9  -0.8  -0.7   
..   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   
77  2020   0.5   0.5   0.4   0.2  -0.1  -0.3  -0.4  -0.6  -0.9  -1.2  -1.3   
78  2021  -1.0  -0.9  -0.8  -0.7  -0.5  -0.4  -0.4  -0.5  -0.7  -0.8  -1.0   
79  2022  -1.0  -0.9  -1.0  -1.1  -1.0  -0.9  -0.8  -0.9  -1.0  -1.0  -0.9   
80  2023  -0.7  -0.4  -0.1   0.2   0.5   0.8   1.1   1.3   1.6   1.8   1.9   
81  2024   1.8   1.5   1.1   0.7  

In [5]:
df

Unnamed: 0,Year,DJF,JFM,FMA,MAM,AMJ,MJJ,JJA,JAS,ASO,SON,OND,NDJ
0,1950,-1.5,-1.3,-1.2,-1.2,-1.1,-0.9,-0.5,-0.4,-0.4,-0.4,-0.6,-0.8
1,1951,-0.8,-0.5,-0.2,0.2,0.4,0.6,0.7,0.9,1.0,1.2,1.0,0.8
2,1952,0.5,0.4,0.3,0.3,0.2,0.0,-0.1,0.0,0.2,0.1,0.0,0.1
3,1953,0.4,0.6,0.6,0.7,0.8,0.8,0.7,0.7,0.8,0.8,0.8,0.8
4,1954,0.8,0.5,0.0,-0.4,-0.5,-0.5,-0.6,-0.8,-0.9,-0.8,-0.7,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,2020,0.5,0.5,0.4,0.2,-0.1,-0.3,-0.4,-0.6,-0.9,-1.2,-1.3,-1.2
78,2021,-1.0,-0.9,-0.8,-0.7,-0.5,-0.4,-0.4,-0.5,-0.7,-0.8,-1.0,-1.0
79,2022,-1.0,-0.9,-1.0,-1.1,-1.0,-0.9,-0.8,-0.9,-1.0,-1.0,-0.9,-0.8
80,2023,-0.7,-0.4,-0.1,0.2,0.5,0.8,1.1,1.3,1.6,1.8,1.9,2.0


In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--enable-javascript")
chrome_options.add_argument("window-size=1920x1080")
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)

# Initialize the WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

# URL of the ENSO Outlook page
url = "http://www.bom.gov.au/climate/enso/outlook/"

try:
    # Open the page
    driver.get(url)

    # Wait for the specific table element to be present
    table = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//table[.//th[@title='January']]"))
    )

    # Extract headers
    headers = [th.text.strip() for th in table.find_elements(By.XPATH, ".//thead/tr/th")]

    # Extract rows
    rows = []
    for tr in table.find_elements(By.XPATH, ".//tbody/tr"):
        cells = tr.find_elements(By.XPATH, ".//td")
        row = [cell.text.strip() for cell in cells]

        # If row length doesn't match headers, pad the row
        if len(row) < len(headers):
            row.extend([''] * (len(headers) - len(row)))
        
        rows.append(row)

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Add the year column
    start_year = 1980
    df.insert(0, 'Year', range(start_year, start_year + len(df)))
    df.to_csv('BOM_ENSO.csv', index=False)
    df.to_pickle('BOM_ENSO.pkl')
    # Display the DataFrame
    print(df)

except Exception as e:
    print(f"Error fetching the table: {e}")
finally:
    driver.quit()  # Close the WebDriver


    Year  Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec
0   1980    N    N  ENW  ENW  ENW  ENW  ENW    N    N    N    N    N
1   1981    N    N    N    N    N    N    N    N    N    N    N    N
2   1982    N    N    N    N  ENW  ENA   EN   EN   EN   EN   EN   EN
3   1983   EN   EN  LNW  LNW  LNW  LNW  LNW  LNW  LNA  LNA   LN   LN
4   1984   LN  LNW  LNW  LNW  LNW   LN   LN    N    N    N    N    N
5   1985    N    N  LNA  LNA    N   LN    N    N    N    N    N    N
6   1986    N    N    N    N    N    N  ENA  ENA  ENA  ENA   EN   EN
7   1987   EN   EN   EN   EN   EN   EN   EN   EN   EN   EN   EN   EN
8   1988   EN   EN  LNW  LNA  LNA   LN   LN   LN   LN   LN   LN   LN
9   1989   LN   LN   LN  ENW  ENW    N    N    N    N    N    N    N
10  1990  ENW  ENW  ENA  ENA    N    N    N    N    N    N    N    N
11  1991    N    N  ENW    N  ENA   EN   EN   EN   EN   EN   EN   EN
12  1992   EN   EN   EN   EN   EN   EN    N    N    N    N    N    N
13  1993    N    N  ENA  ENA   EN 

In [7]:
df

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1980,N,N,ENW,ENW,ENW,ENW,ENW,N,N,N,N,N
1,1981,N,N,N,N,N,N,N,N,N,N,N,N
2,1982,N,N,N,N,ENW,ENA,EN,EN,EN,EN,EN,EN
3,1983,EN,EN,LNW,LNW,LNW,LNW,LNW,LNW,LNA,LNA,LN,LN
4,1984,LN,LNW,LNW,LNW,LNW,LN,LN,N,N,N,N,N
5,1985,N,N,LNA,LNA,N,LN,N,N,N,N,N,N
6,1986,N,N,N,N,N,N,ENA,ENA,ENA,ENA,EN,EN
7,1987,EN,EN,EN,EN,EN,EN,EN,EN,EN,EN,EN,EN
8,1988,EN,EN,LNW,LNA,LNA,LN,LN,LN,LN,LN,LN,LN
9,1989,LN,LN,LN,ENW,ENW,N,N,N,N,N,N,N
