In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from datetime import datetime, timedelta

In [None]:

def scrape_weather(year, month):
    url = f"https://www.timeanddate.com/weather/vietnam/ho-chi-minh/historic?month={month}&year={year}"

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    driver.get(url)
    wait = WebDriverWait(driver, 5)

    select_element = Select(wait.until(EC.element_to_be_clickable((By.ID, 'wt-his-select'))))

    data = []
    for option in select_element.options:
        option_date = option.get_attribute('value')
        select_element.select_by_value(option_date)

        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table#wt-his tbody tr")))
        time.sleep(1)

        rows = driver.find_elements(By.CSS_SELECTOR, "table#wt-his tbody tr")
        current_date = pd.to_datetime(option_date, format='%Y%m%d').date()

        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            if len(columns) == 0:
                continue

            time_of_day = row.find_element(By.TAG_NAME, "th").text.strip()

            if time_of_day == "00:00":
                continue

            temp = columns[1].text.replace("\u00a0°C", "").strip()
            weather = columns[2].text.strip()
            wind = columns[3].text.strip()
            wind_direction_element = columns[4].find_elements(By.TAG_NAME, "span")
            wind_direction = wind_direction_element[0].get_attribute("title") if wind_direction_element else ""
            humidity = columns[5].text.strip()
            barometer = columns[6].text.strip()
            visibility = columns[7].text.strip().replace("N/A", "")

            data.append({
                "Date": current_date,
                "Time": time_of_day,
                "Temperature (°C)": temp,
                "Weather": weather,
                "Wind": wind,
                "Wind Direction": wind_direction,
                "Humidity": humidity,
                "Barometer (mbar)": barometer,
                "Visibility (km)": visibility,
            })

    driver.quit()
    return pd.DataFrame(data)

In [3]:
start_date = datetime(2015, 1, 1)
end_date = datetime(2015, 2, 1)
all_data = pd.DataFrame()

while start_date <= end_date:
    print(f"Scraping data for {start_date.strftime('%Y-%m')}")
    monthly_df = scrape_weather(start_date.year, start_date.month)
    all_data = pd.concat([all_data, monthly_df], ignore_index=True)
    start_date += timedelta(days=32)
    start_date = start_date.replace(day=1)

Scraping data for 2015-01
Scraping data for 2015-02


In [5]:
all_data.to_csv("weather_data_2015_to_2017.csv", index=False, encoding="utf-8-sig")
print(all_data.head())

         Date               Time Temperature (°C)          Weather    Wind  \
0  2015-01-01  00:00\nThu, 1 Jan            25 °C  Passing clouds.  4 km/h   
1  2015-01-01              00:30            24 °C  Passing clouds.  4 km/h   
2  2015-01-01              01:00            25 °C  Passing clouds.  4 km/h   
3  2015-01-01              01:30            24 °C  Passing clouds.  4 km/h   
4  2015-01-01              02:00            24 °C  Passing clouds.  6 km/h   

                                      Wind Direction Humidity  \
0  Wind blowing from 240° West-southwest to East-...      69%   
1                Wind blowing from 0° North to South      74%   
2  Wind blowing from 290° West-northwest to East-...      74%   
3  Wind blowing from 300° West-northwest to East-...      78%   
4      Wind blowing from 320° Northwest to Southeast      78%   

  Barometer (mbar) Visibility (km)  
0        1013 mbar            7 km  
1        1012 mbar            7 km  
2        1012 mbar           