In [27]:
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import s3fs
import time
from zoneinfo import ZoneInfo
from datetime import timedelta, datetime


# lakeFS connection settings. Each value can be overridden through an environment
# variable so that real credentials never have to be written into the notebook;
# the literal fallbacks are the same placeholder values this cell used before.
lakefs_endpoint = os.getenv("LAKEFS_ENDPOINT", "http://lakefs-dev:8000")
ACCESS_KEY = os.getenv("LAKEFS_ACCESS_KEY", "access_key")
SECRET_KEY = os.getenv("LAKEFS_SECRET_KEY", "secret_key")

# s3fs filesystem whose endpoint is pointed at the lakeFS server; the cells below
# use it for globbing and reading parquet files.
fs = s3fs.S3FileSystem(
    key=ACCESS_KEY,
    secret=SECRET_KEY,
    client_kwargs={'endpoint_url': lakefs_endpoint}
)

def load_data(
    lakefs_path="s3://dsi321-record-air-quality/main/airquality.parquet/year=2025",
    filesystem=None,
):
    """Load every parquet file under ``lakefs_path`` and return one cleaned DataFrame.

    Parameters
    ----------
    lakefs_path : str
        Root of the partition tree to read. Files are discovered with the glob
        ``{lakefs_path}/*/*/*/*`` (four path levels below the root).
    filesystem : fsspec-style filesystem, optional
        Object providing ``glob`` and usable as ``filesystem=`` for
        ``pd.read_parquet``. Defaults to the notebook-level ``fs``.

    Returns
    -------
    pandas.DataFrame
        All files concatenated, with ``lat``/``long`` coerced to numeric
        (unparseable values become NaN), ``year``/``month`` cast to int64,
        exact duplicate rows removed, negative ``PM25.aqi`` values blanked out
        and then forward-filled per ``stationID``.

    Raises
    ------
    ValueError
        If the glob matches no files.
    """
    if filesystem is None:
        # Fall back to the S3 filesystem configured at the top of the notebook.
        filesystem = fs

    data_list = filesystem.glob(f"{lakefs_path}/*/*/*/*")
    if not data_list:
        # pd.concat([]) would raise a ValueError that does not mention the path;
        # raise the same exception type with an actionable message instead.
        raise ValueError(f"No parquet files found under {lakefs_path!r}")

    frames = [
        pd.read_parquet(f"s3://{path}", filesystem=filesystem)
        for path in data_list
    ]
    combined = pd.concat(frames, ignore_index=True)

    df_all = (
        combined
        .assign(
            lat=pd.to_numeric(combined['lat'], errors='coerce'),
            long=pd.to_numeric(combined['long'], errors='coerce'),
        )
        .astype({'year': 'int64', 'month': 'int64'})
        .drop_duplicates()
    )

    # Negative AQI readings are treated as missing values.
    aqi = df_all['PM25.aqi']
    df_all['PM25.aqi'] = aqi.mask(aqi < 0, pd.NA)

    # Fill each gap with the previous record of the same station.
    # NOTE(review): "previous" means previous in row order, i.e. the order in which
    # glob returned the files; nothing here sorts by time - confirm that order is
    # chronological for the partition naming in use.
    df_all['PM25.aqi'] = df_all.groupby('stationID')['PM25.aqi'].ffill()
    return df_all

In [28]:
# Load the cleaned air-quality records once; the export cells below all read from `df`.
df = load_data()

In [29]:
# TODO: remember to convert the column data types as well.

In [30]:
# Write a flat CSV copy of the frame (UTF-8 encoded, without the pandas index column).
df.to_csv('data.csv', encoding='utf-8', index=False)

In [35]:
# Export a parquet copy partitioned by year/month/day/hour.
# `existing_data_behavior` is forwarded by pandas to pyarrow's `write_to_dataset`.
# With 'delete_matching' each partition directory is cleared before it is rewritten,
# so re-running this cell replaces the data instead of adding a second, duplicate
# set of files next to the previous run's output.
df.to_parquet(
    'data2.parquet',
    partition_cols=['year', 'month', 'day', 'hour'],
    engine='pyarrow',
    existing_data_behavior='delete_matching',
)

In [None]:
# `pyarrow.parquet` is already imported as `pq` in the first cell, so only the
# names this cell actually uses are imported here.
import pyarrow as pa
import pyarrow.dataset as ds

# Target folder for the dataset (a local path, or e.g. s3://.../parquet-folder/)
target_path = "data2.parquet"

# Convert the DataFrame to an Arrow table
table = pa.Table.from_pandas(df)

# Rewrite the partitioned dataset in place.
ds.write_dataset(
    table,
    base_dir=target_path,
    format="parquet",
    partitioning=["year", "month", "day", "hour"],
    # A plain list of field names defaults to directory partitioning
    # (2025/5/...); request hive-style key=value directories so the layout
    # matches the year=/month=/... structure used by the source data and by
    # DataFrame.to_parquet(partition_cols=...).
    partitioning_flavor="hive",
    # Clear every partition directory that is about to be written, so files left
    # by earlier runs cannot survive alongside the new ones and duplicate rows.
    # ("overwrite_or_ignore" only replaces files that share the same name.)
    existing_data_behavior="delete_matching",
)