# Import các thư viện cần thiết

In [1]:
import time
from datetime import date
from datetime import datetime, timedelta, timezone
from pathlib import Path

import pandas as pd
import requests

# Cấu hình chung

In [2]:
# Shared settings: the Open-Meteo archive endpoint, the output CSV path,
# and the timezone sent with every request.
API_URL = "https://archive-api.open-meteo.com/v1/archive"
CSV_OUTPUT = "../data/hcm_weather_overview.csv"
TIMEZONE = "Asia/Ho_Chi_Minh"

# Five sampling points chosen to represent Ho Chi Minh City's climate.
# Coordinate reference: https://www.geonames.org/1566083/ho-chi-minh-city.html
# Each entry maps a zone label to a (latitude, longitude) pair.
ZONES = {
    # District 1 - urban heat core
    "HCM_Center": (10.7769, 106.7009),
    # Thu Duc - higher ground, gateway area
    "HCM_East": (10.8443, 106.7725),
    # Cu Chi - hotter and drier
    "HCM_NorthWest": (11.0067, 106.5132),
    # District 7 - near the river, open and airy
    "HCM_South": (10.7028, 106.7226),
    # Can Gio - maritime climate
    "HCM_Coastal": (10.4114, 106.9546),
}

# I. Thu thập dữ liệu

## 1. Thiết lập khoảng thời gian

In [None]:
# --- Data collection window ---
# Covers two complete calendar years: 2024 and 2025.
start_date, end_date = date(2024, 1, 1), date(2025, 12, 31)

print(f"Start date: {start_date}")  # expected: 2024-01-01
print(f"End date: {end_date}")      # expected: 2025-12-31

Start date: 2024-01-01
End date: 2025-12-31


## 2. Các biến thời tiết cần lấy

In [None]:
# Hourly variables requested from the API; the list is joined with commas
# into a single "hourly" request parameter, so the order is kept as-is.
HOURLY_VARS = [
    # Temperature and humidity
    "temperature_2m", "dew_point_2m", "apparent_temperature", "relative_humidity_2m",
    # Precipitation
    "precipitation", "rain",
    # Wind
    "wind_speed_10m", "wind_gusts_10m", "wind_direction_10m",
    # Pressure
    "surface_pressure", "pressure_msl",
    # Cloud cover (total and by layer)
    "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high",
    # Radiation, weather code, vapour pressure deficit
    "shortwave_radiation", "weather_code", "vapour_pressure_deficit",
]

## 3. Lấy dữ liệu từ Open-Meteo

In [5]:
def fetch_weather(lat, lon, start_date, end_date, hourly_vars, retries=3, delay=2) -> dict:
    """Request hourly weather data for one coordinate from the archive API.

    The request asks for Celsius temperatures, km/h wind speeds, millimetre
    precipitation and ISO 8601 timestamps in the module-level ``TIMEZONE``,
    and is sent to the module-level ``API_URL``.

    Args:
        lat: Latitude sent as the ``latitude`` parameter.
        lon: Longitude sent as the ``longitude`` parameter.
        start_date: First day of the window; must provide ``isoformat()``
            (for example ``datetime.date``).
        end_date: Last day of the window; must provide ``isoformat()``.
        hourly_vars: Iterable of variable names, comma-joined into the
            ``hourly`` parameter.
        retries: Maximum number of attempts. Values below 1 perform no
            request at all.
        delay: Seconds to sleep between two attempts.

    Returns:
        The decoded JSON body, or an empty dict when no attempt succeeded.
        Network, HTTP-status and JSON-decoding errors are reported with
        ``print`` and never raised to the caller.
    """
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date.isoformat(),
        "end_date": end_date.isoformat(),
        "hourly": ",".join(hourly_vars),
        "timezone": TIMEZONE,
        "temperature_unit": "celsius",
        "wind_speed_unit": "kmh",
        "precipitation_unit": "mm",
        "timeformat": "iso8601"
    }

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(API_URL, params=params, timeout=15)
            response.raise_for_status()
            return response.json()

        # RequestException covers connection, timeout and HTTP-status errors;
        # ValueError covers an undecodable JSON body. Anything else is a
        # programming error and is allowed to surface.
        except (requests.RequestException, ValueError) as e:
            print(f"[LỖI] Thử lần {attempt}/{retries} khi lấy dữ liệu ({lat}, {lon}): {e}")

            # Read the status defensively: not every exception carries a
            # response, and a 4xx/5xx Response object is falsy, so it must be
            # compared against None rather than truth-tested.
            failed_response = getattr(e, "response", None)
            status = getattr(failed_response, "status_code", None)

            # A client error (other than timeout / rate limiting) will fail
            # the same way on every attempt, so stop instead of sleeping.
            if status is not None and 400 <= status < 500 and status not in (408, 429):
                print(f"Lỗi phía client (HTTP {status}), thử lại cũng vô ích; bỏ qua điểm này.")
                return {}

            if attempt < retries:
                print(f"Chờ {delay} giây trước khi thử lại...")
                time.sleep(delay)
            else:
                print("Đã thử tối đa, bỏ qua điểm này.")

    # Reached when every attempt failed, or when retries < 1 and the loop
    # never ran: always hand back an empty dict rather than an implicit None.
    return {}


# Fetch every zone in turn, keeping one DataFrame per zone that succeeded.
all_data = []
failed_zones = []  # zones for which no usable "hourly" payload came back
for zone, (lat, lon) in ZONES.items():
    print(f"\nLấy dữ liệu cho {zone} ({lat}, {lon})...")

    data = fetch_weather(lat, lon, start_date, end_date, HOURLY_VARS)
    if not data or "hourly" not in data:
        print(f"[CẢNH BÁO] Không có dữ liệu cho {zone}, bỏ qua.")
        failed_zones.append(zone)
        continue

    # The response reports its own coordinates, which can differ from the
    # requested ones; read them defensively like the elevation.
    print(f"Coordinates: {data.get('latitude', 'N/A')}°N {data.get('longitude', 'N/A')}°E")
    print(f"Elevation: {data.get('elevation', 'N/A')} m asl")
    df = pd.DataFrame(data["hourly"])
    if len(ZONES) > 1:
        # Tag each row with its zone; lat/lon hold the REQUESTED coordinates,
        # not the ones reported back by the API.
        df["zone"] = zone
        df["lat"] = lat
        df["lon"] = lon

    all_data.append(df)
    time.sleep(0.2)  # short pause between consecutive requests

# Make an incomplete dataset visible instead of silently saving fewer zones.
if failed_zones:
    print(f"\n[CẢNH BÁO] Thiếu dữ liệu cho {len(failed_zones)}/{len(ZONES)} vùng: {', '.join(failed_zones)}")

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail with an explicit message when nothing was downloaded.
if not all_data:
    raise RuntimeError("Không lấy được dữ liệu cho bất kỳ vùng nào; không thể tạo full_df.")

# Combine the per-zone frames into one table with a fresh 0..n-1 index.
full_df = pd.concat(all_data, ignore_index=True)


Lấy dữ liệu cho HCM_Center (10.7769, 106.7009)...
Coordinates: 10.790861°N 106.71088°E
Elevation: 12.0 m asl

Lấy dữ liệu cho HCM_East (10.8443, 106.7725)...
Coordinates: 10.86116°N 106.80531°E
Elevation: 27.0 m asl

Lấy dữ liệu cho HCM_NorthWest (11.0067, 106.5132)...
Coordinates: 11.001758°N 106.51595°E
Elevation: 8.0 m asl

Lấy dữ liệu cho HCM_South (10.7028, 106.7226)...
Coordinates: 10.720562°N 106.69611°E
Elevation: 5.0 m asl

Lấy dữ liệu cho HCM_Coastal (10.4114, 106.9546)...
Coordinates: 10.439367°N 106.875°E
Elevation: 6.0 m asl


## 4. Kiểm tra dữ liệu

In [6]:
# Preview the first rows of the combined frame as a quick sanity check.
full_df.head()

Unnamed: 0,time,temperature_2m,dew_point_2m,apparent_temperature,relative_humidity_2m,precipitation,rain,wind_speed_10m,wind_gusts_10m,wind_direction_10m,...,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,shortwave_radiation,weather_code,vapour_pressure_deficit,zone,lat,lon
0,2024-01-01T00:00,25.6,23.5,30.6,88,0.0,0.0,5.7,10.4,145,...,99,0,0,99,0.0,3,0.41,HCM_Center,10.7769,106.7009
1,2024-01-01T01:00,25.4,23.5,30.7,90,0.0,0.0,3.6,8.3,143,...,96,0,0,96,0.0,3,0.33,HCM_Center,10.7769,106.7009
2,2024-01-01T02:00,25.3,23.2,30.8,88,0.0,0.0,0.5,5.0,135,...,96,0,0,96,0.0,3,0.38,HCM_Center,10.7769,106.7009
3,2024-01-01T03:00,25.0,23.4,30.0,91,0.0,0.0,4.7,5.4,9,...,98,8,2,98,0.0,3,0.29,HCM_Center,10.7769,106.7009
4,2024-01-01T04:00,24.6,23.2,29.1,92,0.0,0.0,8.0,11.5,352,...,100,13,1,100,0.0,3,0.25,HCM_Center,10.7769,106.7009


In [7]:
# (rows, columns) of the combined frame.
full_df.shape

(87720, 22)

In [8]:
# Per-column dtypes; note that `time` is still stored as plain strings (object).
full_df.dtypes

time                        object
temperature_2m             float64
dew_point_2m               float64
apparent_temperature       float64
relative_humidity_2m         int64
precipitation              float64
rain                       float64
wind_speed_10m             float64
wind_gusts_10m             float64
wind_direction_10m           int64
surface_pressure           float64
pressure_msl               float64
cloud_cover                  int64
cloud_cover_low              int64
cloud_cover_mid              int64
cloud_cover_high             int64
shortwave_radiation        float64
weather_code                 int64
vapour_pressure_deficit    float64
zone                        object
lat                        float64
lon                        float64
dtype: object

## 5. Lưu CSV

In [9]:
# Create the target folder first: DataFrame.to_csv does not create missing
# directories, and this cell runs only after the whole download has finished.
Path(CSV_OUTPUT).parent.mkdir(parents=True, exist_ok=True)
full_df.to_csv(CSV_OUTPUT, index=False)
print(f"Lưu CSV: {CSV_OUTPUT}")

Lưu CSV: ../data/hcm_weather_overview.csv
