In [2]:
import pandas as pd

In [12]:
from MLForecastPipeline import *

def split_data(df, scenario, date_col="ds"):
    """Extracts train and test data based on train end date."""
    train_data = df[df[date_col] <= scenario['train_end']]
    test_start = pd.to_datetime(scenario['train_end']) + pd.Timedelta(days=1)
    test_data = df[df[date_col] >= test_start]
    return train_data, test_data

selected_sensors_df = pd.read_csv("../data/selected_sensors2_cleaned.csv", index_col=0)
testing_df = format_df_to_mlforecast(selected_sensors_df, 'full_date', '2')[['ds', 'y', 'unique_id']]
testing_df['ds'] = pd.to_datetime(testing_df['ds'])
train_df, test_df = split_data(testing_df, {"train_start": "2017-04-01", "train_end": "2018-04-01"})

In [13]:
train_df

Unnamed: 0,ds,y,unique_id
0,2017-03-22,40.683844,mean
1,2017-03-23,29.237465,mean
2,2017-03-24,43.675636,mean
3,2017-03-25,58.792217,mean
4,2017-03-26,48.348401,mean
...,...,...,...
371,2018-03-28,26.094187,mean
372,2018-03-29,23.352381,mean
373,2018-03-30,23.960317,mean
374,2018-03-31,31.329379,mean


In [17]:

from prophet import Prophet
from sklearn.base import BaseEstimator, RegressorMixin

class ProphetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **prophet_params):
        self.prophet_params = prophet_params
        self.model = None

    def fit(self, X, y):
        # Prophet requires a DataFrame with 'ds' and 'y' columns
        print(X)
        df = X.copy()
        df['y'] = y
        self.model = Prophet(**self.prophet_params)
        self.model.fit(df)
        return self

    def predict(self, X):
        # Prophet requires a DataFrame with 'ds' column for predictions
        print("HEYYYYYYYY - \n", X)
        future = X[['ds']].copy()
        forecast = self.model.predict(future)

        # future = self.model.make_future_dataframe(periods=len(X), freq='D')
        # forecast = self.model.predict(future)

        return forecast['yhat'].values

    def get_params(self, deep=True):
        return self.prophet_params

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
    
models = {
    "Prophet": ProphetWrapper()
}
model_name = 'Prophet'

fcst = MLForecast(
    models=[models[model_name]],
    freq='D',
    lags=[0],
    # target_transforms=list(transform_combination),
    # date_features=date_features,
    num_threads=1,
    # lag_transforms=lag_transforms,
)

# Fit the model
fcst.fit(train_df)

# Predict
predictions = fcst.predict(h=test_df.shape[0])
test_df_copy = test_df.copy()
test_df_copy['forecast'] = predictions[model_name].values     

ValueError: lags must be positive integers.

In [8]:
file_path = r"C:\Users\77019\Downloads\waqi-covid19-airqualitydata-2025.csv"

# Read CSV while skipping comment lines
data = pd.read_csv(file_path, comment='#')
data = data.loc[data['City'] == "Almaty"]

In [10]:
data['Specie'].unique()

array(['dew', 'co', 'humidity', 'so2', 'pressure', 'wind-speed',
       'wind-gust', 'temperature', 'pm10', 'pm25', 'no2'], dtype=object)

In [13]:
data['Date'].min(), data['Date'].max()

('2024-01-01', '2025-03-05')

In [14]:
import requests
import time

# List of available periods
periods = [
    "2015H1", "2016H1", "2017H1", "2018H1",
    "2019Q1", "2019Q2", "2019Q3", "2019Q4",
    "2020Q1", "2020Q2", "2020Q3", "2020Q4",
    "2021Q1", "2021Q2", "2021Q3", "2021Q4",
    "2022Q1", "2022Q2", "2022Q3", "2022Q4",
    "2023Q1", "2023Q2", "2023Q3", "2023Q4"
]

# Base URL
base_url = "https://aqicn.org/data-platform/covid19/report/45268-77d0de2d"

# Loop through each period and download the file
for period in periods:
    url = f"{base_url}/{period}"
    file_name = f"waqi-airquality-{period}.csv"

    print(f"Downloading {period}...")

    response = requests.get(url)

    if response.status_code == 200:
        with open(file_name, "wb") as file:
            file.write(response.content)
        print(f"✅ {file_name} downloaded successfully!")
    else:
        print(f"❌ Failed to download {period}. Status code: {response.status_code}")

    # Add a small delay to prevent overwhelming the server
    time.sleep(2)

print("🎉 All available datasets have been processed.")


Downloading 2015H1...
✅ waqi-airquality-2015H1.csv downloaded successfully!
Downloading 2016H1...
✅ waqi-airquality-2016H1.csv downloaded successfully!
Downloading 2017H1...
✅ waqi-airquality-2017H1.csv downloaded successfully!
Downloading 2018H1...
✅ waqi-airquality-2018H1.csv downloaded successfully!
Downloading 2019Q1...
✅ waqi-airquality-2019Q1.csv downloaded successfully!
Downloading 2019Q2...
✅ waqi-airquality-2019Q2.csv downloaded successfully!
Downloading 2019Q3...
✅ waqi-airquality-2019Q3.csv downloaded successfully!
Downloading 2019Q4...
✅ waqi-airquality-2019Q4.csv downloaded successfully!
Downloading 2020Q1...
✅ waqi-airquality-2020Q1.csv downloaded successfully!
Downloading 2020Q2...
✅ waqi-airquality-2020Q2.csv downloaded successfully!
Downloading 2020Q3...
✅ waqi-airquality-2020Q3.csv downloaded successfully!
Downloading 2020Q4...
✅ waqi-airquality-2020Q4.csv downloaded successfully!
Downloading 2021Q1...
✅ waqi-airquality-2021Q1.csv downloaded successfully!
Downloading 