In [None]:
!pip install -q autogluon

import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Forecasting cell: load the competition data, clean the PM2.5 series, fit an
# AutoGluon time-series predictor, and write a 366-step forecast into the
# sample-submission layout.

# Load the datasets
train_df = pd.read_csv('/kaggle/input/super-ai-engineer-5-pm-2-5-data-of-thailand/train.csv')
submission_df = pd.read_csv('/kaggle/input/super-ai-engineer-5-pm-2-5-data-of-thailand/sample_submission.csv')

# Data preprocessing
# 1. Parse Date as day/month/year. errors='coerce' turns both missing and
#    malformed values into NaT, so a single dropna below removes them all.
train_df['Date'] = pd.to_datetime(train_df['Date'], format='%d/%m/%Y', errors='coerce')

# 2. Drop rows whose date is missing or could not be parsed
train_df = train_df.dropna(subset=['Date'])

# 3. Sort chronologically BEFORE filling gaps, so that forward/backward fill
#    propagates values from temporally adjacent rows rather than from
#    whatever rows happen to be adjacent in the CSV.
train_df = train_df.sort_values('Date')

# 4. Fill missing PM2.5 values: forward fill, then backward fill for any
#    leading gap. (Series.ffill()/bfill() replace the deprecated
#    fillna(method=...) form.)
train_df['PM2.5'] = train_df['PM2.5'].ffill().bfill()

# Prepare data for AutoGluon TimeSeries
# Use 'timestamp' / 'target' as column names and add a constant item_id so the
# whole frame is treated as a single series.
train_df = train_df.rename(columns={'Date': 'timestamp', 'PM2.5': 'target'})
train_df['item_id'] = 'PM2.5'

# Verify no missing values in timestamp column
print("Missing values in timestamp column:", train_df['timestamp'].isnull().sum())

# Convert to TimeSeriesDataFrame
train_data = TimeSeriesDataFrame.from_data_frame(
    train_df,
    id_column="item_id",
    timestamp_column="timestamp"
)

# Initialize the TimeSeriesPredictor
predictor = TimeSeriesPredictor(
    prediction_length=366,  # We need to predict 366 days
    target="target",
    eval_metric="MASE",    # Mean Absolute Scaled Error
    path="autogluon-chronos-pm25",
)

# Fit the model with chronos-bolt-base.
# NOTE: the "fast_training" preset does not include Chronos-Bolt; "bolt_base"
# is the preset that selects it (requires an AutoGluon release that ships the
# Chronos-Bolt presets).
predictor.fit(
    train_data,
    presets="bolt_base",
    time_limit=600,           # 10 minutes training time
)

# Generate predictions
predictions = predictor.predict(train_data)

# Prepare submission
# The forecast covers the prediction_length steps after the end of the
# training data and is written into the submission rows by position.
forecast_mean = predictions['mean'].to_numpy()
if len(forecast_mean) != len(submission_df):
    raise ValueError(
        f"Forecast has {len(forecast_mean)} rows but the sample submission "
        f"has {len(submission_df)}; cannot align them by position."
    )
submission_df['PM2.5'] = forecast_mean

# The Date column is deliberately left exactly as read from the sample file:
# converting it to datetime would make to_csv emit ISO-formatted dates instead
# of the template's original date strings.

# Save the submission file
submission_df.to_csv('/kaggle/working/submission_pm25.csv', index=False)

print("Submission file created successfully!")

In [None]:
# Rebuild a submission file: start from the competition's sample submission
# and replace its PM2.5 column with the values stored in a previously saved
# forecast CSV, then write the result to the working directory.
sample_submission_path = '/kaggle/input/super-ai-engineer-5-pm-2-5-data-of-thailand/sample_submission.csv'
saved_forecast_path = '/kaggle/input/pm25-autogluon/submission_PM25_autogluon.csv'
rebuilt_submission_path = '/kaggle/working/submission_pm25_autogluon.csv'

submission_df = pd.read_csv(sample_submission_path)
pre_df = pd.read_csv(saved_forecast_path)

# Column assignment aligns on the row index of the two frames.
submission_df['PM2.5'] = pre_df['PM2.5']

submission_df.to_csv(rebuilt_submission_path, index=False)

# Print a summary (columns, dtypes, non-null counts) of the written frame.
submission_df.info()