In [5]:
import pandas as pd
path_to_data = "..//data//"

# Both raw inputs sit in the same directory: read them in one pass and
# bind the resulting frames to `df` and `sensors_df` respectively.
df, sensors_df = (
    pd.read_csv(path_to_data + csv_name)
    for csv_name in ("pm_data.csv", "sensors.csv")
)

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import acf

# Load the first data chunk and drop the unnamed leading column
# (it looks like a row index saved with the CSV — TODO confirm).
data = pd.read_csv("../data/decomp/kusok_1.csv").drop(columns=['Unnamed: 0'])

lags = 500     # Enough lags to capture a possible season
min_lag = 100  # Shortest lag considered when searching for the seasonal peak

# Maps each value column to the lag (in rows) with the highest autocorrelation
season_periods = {}
for col in data.columns:
    if col == 'full_date':
        continue
    series = data[col]  # Select the time series for this column

    # missing="conservative" makes the estimate NaN-aware. With the default
    # (missing="none") a single NaN turns every coefficient into NaN, and
    # argmax would then silently report position 0, i.e. a "season" of min_lag.
    acf_values = acf(series, nlags=lags, missing="conservative")

    if len(acf_values) <= min_lag:
        # Series too short to provide any lag at or beyond min_lag
        print(f"Series {col} is too short to estimate a season length, skipping.")
        continue

    # Peak autocorrelation among lags >= min_lag; add the offset back so the
    # result is an absolute lag rather than a position inside the slice.
    season_length = acf_values[min_lag:].argmax() + min_lag
    print(season_length)
    season_periods[col] = season_length

365
356
358
358


In [12]:
# Display the `data` frame as the cell's rich output (pandas elides the middle rows of long frames).
data

Unnamed: 0,full_date,1,2,5,6
0,2017-03-22,148.020337,40.683844,39.022917,33.550382
1,2017-03-23,44.308725,29.237465,24.606322,23.765278
2,2017-03-24,54.905155,43.675636,32.662021,24.127526
3,2017-03-25,67.575610,58.792217,73.688502,57.902710
4,2017-03-26,45.677083,48.348401,41.901811,28.811111
...,...,...,...,...,...
1291,2020-10-03,59.629346,50.642450,43.423905,28.360000
1292,2020-10-04,69.613621,30.410000,17.520000,21.340000
1293,2020-10-05,79.143255,29.590000,16.530000,20.040000
1294,2020-10-06,48.956855,26.380000,13.220000,17.600000


In [7]:
# Date coverage of `data`: earliest date, latest date, and row count divided by 360
(
    data['full_date'].min(),
    data['full_date'].max(),
    data.shape[0] / 360,
)

('2017-03-22', '2020-10-07', 3.6)

In [8]:
# Print the earliest date, latest date and (row count / 360) for chunks 2 through 5
for n in (2, 3, 4, 5):
    data_ = pd.read_csv(f"../data/decomp/kusok_{n}.csv")  # Load the file
    chunk_dates = data_['full_date']
    first_day, last_day = chunk_dates.min(), chunk_dates.max()
    print(first_day, last_day, chunk_dates.size / 360)

2017-06-20 2018-09-29 1.2972222222222223
2017-07-20 2019-09-15 2.188888888888889
2017-10-28 2020-10-06 2.986111111111111
2018-04-29 2020-10-06 2.477777777777778


It makes sense to enforce a consistent season length across all samples, even though the season length estimated from the autocorrelation varies slightly between columns (356–365 days in the output above). Enforcing 365 days for all samples ensures:

- **Uniformity in modeling** → makes it easier to compare across distributions.

- **Aligned start and end dates across columns** → prevents gaps and offsets.

- **Better compatibility with ML models** → models generally prefer fixed input sizes.

In [11]:
from pathlib import Path

import numpy as np
import pandas as pd

# User-defined target month and day (parameter, not hardcoded)
target_month_day = "11-15"  # Default target ending day

# Define a fixed season length for uniformity (365 days for all samples)
fixed_season_length = 365

# Directory that receives one CSV per (column, season) sample
output_dir = Path("../data/nov_samples")

# Parse the dates and order the rows chronologically.
# NOTE: this updates the `data` frame that was loaded in an earlier cell.
data["full_date"] = pd.to_datetime(data["full_date"])
data = data.sort_values("full_date")

# Dictionary to store dataset samples per column
dataset_samples = {}

# Iterate over each column (distribution)
for col in data.columns:
    if col == "full_date":
        continue

    series = data[["full_date", col]].dropna()  # Drop missing values

    # Find the latest year in which {target_month_day} is present for this column
    available_years = series["full_date"].dt.year.unique()
    last_valid_year = None
    last_valid_end_date = None

    for year in sorted(available_years, reverse=True):
        try:
            potential_end_date = pd.Timestamp(f"{year}-{target_month_day}")
        except ValueError:
            # The date does not exist in this year (e.g. "02-29" in a non-leap year)
            continue
        if series["full_date"].eq(potential_end_date).any():
            last_valid_year = year
            last_valid_end_date = potential_end_date
            break

    if last_valid_end_date is None:
        print(f"⚠️ No exact match for {target_month_day} in column {col}, skipping.")
        continue

    # Determine the number of available full seasons from the calendar span
    # (first observation .. end date, both inclusive). Counting rows instead
    # would undercount whenever dropna() removed days inside the span.
    series_filtered = series[series["full_date"] <= last_valid_end_date]
    first_available_date = series_filtered["full_date"].min()
    num_available_days = (last_valid_end_date - first_available_date).days + 1
    num_full_seasons = num_available_days // fixed_season_length

    if num_full_seasons < 1:
        print(f"⚠️ Not enough data for at least 1 season in {col}, skipping.")
        continue

    # Generate samples for each available full season (index 0 = most recent)
    samples = []
    for i in range(num_full_seasons):
        end_date = last_valid_end_date - pd.Timedelta(days=i * fixed_season_length)
        start_date = end_date - pd.Timedelta(days=fixed_season_length)

        # Window is (start_date, end_date]: exactly `fixed_season_length` calendar
        # days that END ON the target day, matching the season count above.
        in_season = (series["full_date"] > start_date) & (series["full_date"] <= end_date)
        season_data = series[in_season].copy()
        if not season_data.empty:
            samples.append(season_data)

    # Store the dataset samples for this column
    dataset_samples[col] = samples

# Save datasets for each distribution
output_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing directories
num_files_written = 0
for col, samples in dataset_samples.items():
    # `season_df` (not `df`) so the frame loaded into `df` earlier is not overwritten
    for i, season_df in enumerate(samples):
        season_df.to_csv(output_dir / f"dataset_{col}_season_{i+1}.csv", index=False)
        num_files_written += 1

if num_files_written:
    print(f"✅ Datasets created with fixed {fixed_season_length}-day seasons ending on {target_month_day}!")
else:
    print(f"⚠️ No datasets were written: no column had a full season ending on {target_month_day}.")


✅ Datasets created with fixed 365-day seasons ending on 11-15!
