In [None]:
import os
import pandas as pd
from glob import glob
import numpy as np
import random
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:


def inject_anomalies_ts(ts, 
                        types=['amplitude', 'trend', 'mean', 'platform', 
                               'pattern', 'pattern_shift', 'variance', 
                               'extremum', 'shift_30', 'shift_50'],
                        inject_ratio=0.1, 
                        segment_length=7, 
                        seed=42):
    """Inject synthetic anomalies into a copy of a 1-D time series.

    Non-overlapping segments of ``segment_length`` points are picked at
    random and each one is distorted with an anomaly type drawn at random
    from ``types``. The input is never modified; a float copy is returned.

    Parameters
    ----------
    ts : pandas.Series or array-like
        The series to corrupt. A ``pd.Series`` is returned as a ``pd.Series``
        with the same index and name; anything else is returned as a
        ``numpy.ndarray`` of floats.
    types : sequence of str
        Anomaly types to sample from. Recognised names:

        * ``'amplitude'``     - add U(1.5, 3.0) to one random point
        * ``'extremum'``      - set one random point to 0.0 or 1.0
        * ``'mean'``          - add a constant U(0.3, 0.7) to the segment
        * ``'trend'``         - add a linear ramp from 0 to U(0.5, 1.0)
        * ``'platform'``      - replace the segment by its own mean
        * ``'pattern'``       - replace the segment by a tiled 0.2/0.4/0.6/0.4 pattern
        * ``'pattern_shift'`` - replace the segment by a circularly shifted sine period
        * ``'variance'``      - add N(0, 0.5) noise to the segment
        * ``'shift_30'``      - scale the segment by 1.3 or 0.7
        * ``'shift_50'``      - scale the segment by 1.5 or 0.5

        Any other name leaves the values untouched (apart from clipping), but
        the segment is still flagged in ``mask`` and ``labels``.
    inject_ratio : float
        Fraction of the possible segment start positions that receive an
        anomaly; the number of segments is
        ``int((len(ts) - segment_length + 1) * inject_ratio)``.
    segment_length : int
        Number of consecutive points in every injected segment (>= 1).
    seed : int
        Seed for both ``random`` and ``numpy.random``. Note that this reseeds
        the *global* generators of both modules.

    Returns
    -------
    ts_mod : pandas.Series or numpy.ndarray
        The series with anomalies injected. Every touched segment is clipped
        to ``[0, 1]`` (presumably the input is expected to be normalised to
        that range - confirm against the caller).
    mask : numpy.ndarray of bool
        ``True`` at every position labelled anomalous. Point anomalies
        (``'amplitude'``, ``'extremum'``) flag a single index, all other
        types flag the whole segment.
    labels : list of tuple
        ``(start, end, anomaly_type)`` per injected anomaly, ``end`` exclusive.

    Raises
    ------
    ValueError
        If ``segment_length`` is smaller than 1.
    """
    if segment_length < 1:
        raise ValueError("segment_length must be a positive integer")

    # Prepare data: always work on a float copy so the caller's object is untouched.
    if isinstance(ts, pd.Series):
        idx = ts.index
        data = ts.values.astype(float).copy()
        return_series = True
    else:
        data = np.array(ts, dtype=float).copy()
        idx = None
        return_series = False

    np.random.seed(seed)
    random.seed(seed)

    n = len(data)
    max_starts = n - segment_length + 1
    n_segments = int(max_starts * inject_ratio)

    # Select non-overlapping segment starts. Nothing is selected when the
    # requested count is zero or negative (ratio of 0, or a series shorter
    # than one segment); previously one segment slipped through because the
    # count was only checked after the first start had been accepted.
    chosen_starts = []
    if n_segments > 0:
        candidates = list(range(max_starts))
        random.shuffle(candidates)
        occupied = np.zeros(n, dtype=bool)
        for start in candidates:
            if occupied[start:start + segment_length].any():
                continue
            chosen_starts.append(start)
            occupied[start:start + segment_length] = True
            if len(chosen_starts) >= n_segments:
                break

    mask = np.zeros(n, dtype=bool)
    labels = []

    # Inject anomalies
    for start in chosen_starts:
        end = start + segment_length
        window = data[start:end].copy()
        anomaly_type = random.choice(types)
        # Offset of the affected point inside the window; only set for point anomalies.
        point_offset = None

        if anomaly_type == 'amplitude':
            point_offset = random.randint(0, segment_length - 1)
            window[point_offset] += np.random.uniform(1.5, 3.0)
        elif anomaly_type == 'extremum':
            point_offset = random.randint(0, segment_length - 1)
            window[point_offset] = 1.0 if np.random.rand() > 0.5 else 0.0
        elif anomaly_type == 'mean':
            shift = np.random.uniform(0.3, 0.7)
            window += shift
        elif anomaly_type == 'trend':
            trend = np.linspace(0, np.random.uniform(0.5, 1.0), segment_length)
            window += trend
        elif anomaly_type == 'platform':
            window[:] = window.mean()
        elif anomaly_type == 'pattern':
            base = np.array([0.2, 0.4, 0.6, 0.4])
            pat = np.tile(base, (segment_length // 4 + 1,))[:segment_length]
            window[:] = pat
        elif anomaly_type == 'pattern_shift':
            pat = np.sin(np.linspace(0, 2 * np.pi, segment_length))
            # A one-point segment has no non-trivial rotation, and
            # random.randint(1, 0) would raise ValueError.
            shift = random.randint(1, segment_length - 1) if segment_length > 1 else 0
            window[:] = np.roll(pat, shift)
        elif anomaly_type == 'variance':
            window += np.random.normal(0, 0.5, size=segment_length)
        elif anomaly_type == 'shift_30':
            factor = random.choice([0.3, -0.3])
            window *= (1 + factor)
        elif anomaly_type == 'shift_50':
            factor = random.choice([0.5, -0.5])
            window *= (1 + factor)

        # Apply and clip
        data[start:end] = np.clip(window, 0, 1)

        # Mark mask & labels: point anomalies flag one index, the rest the whole segment.
        if point_offset is not None:
            anomaly_idx = start + point_offset
            mask[anomaly_idx] = True
            labels.append((anomaly_idx, anomaly_idx + 1, anomaly_type))
        else:
            mask[start:end] = True
            labels.append((start, end, anomaly_type))

    # Return in original format
    if return_series:
        ts_mod = pd.Series(data, index=idx, name=ts.name)
    else:
        ts_mod = data

    return ts_mod, mask, labels


In [None]:
import pandas as pd
import numpy as np
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Rolling one-step-ahead ARIMA evaluation.
#
# For every dataset in `new_dict` (defined in an earlier cell) a 60-day
# training window is slid forward one day at a time. At each position an
# ARIMA order is searched with `auto_arima`, the next day is forecast, and the
# forecast is compared with the observed value. One CSV of results is written
# per dataset.

# Rolling window parameters are the same for every dataset, so define them once.
train_window = pd.Timedelta(days=60)
predict_horizon = pd.Timedelta(days=1)

# Loop through each dataset in new_dict
for filename, df_selected in new_dict.items():
    print(f"Processing {filename}...")

    # Ensure datetime index (NOTE: this converts the index of the object stored in new_dict in place)
    df_selected.index = pd.to_datetime(df_selected.index)

    # Initialize lists to store results
    arima_params = []
    aics = []
    bics = []
    errors = []
    mses = []
    predictions_list = []
    actual_values_list = []
    dates = []

    date_start = df_selected.index.min()
    date_end = df_selected.index.max()
    current_start = date_start
    iteration = 1

    # The forecast target is `current_start + train_window`; keep going while
    # that day is still inside the data. (The previous bound added another
    # `predict_horizon`, so the last observed day was never forecast.)
    while current_start + train_window <= date_end:
        # Define training and prediction windows (.loc slicing is inclusive on both ends)
        train_end = current_start + train_window - predict_horizon
        predict_start = train_end + predict_horizon

        print(f"Iteration {iteration}: Training from {current_start.date()} to {train_end.date()}, Predicting {predict_start.date()}")

        train_data = df_selected.loc[current_start:train_end]
        test_data = df_selected.loc[predict_start:predict_start]

        # A gap in the data leaves nothing to score against; skip the day
        # before paying for the model search instead of crashing in the metrics.
        if test_data.empty:
            print(f"No observation for {predict_start.date()}, skipping.")
            current_start += predict_horizon
            iteration += 1
            continue

        # Fit Auto ARIMA (exhaustive, non-seasonal search over p, q <= 10 and d <= 2)
        model = auto_arima(train_data,
                           seasonal=False,
                           d=None,
                           start_p=0, max_p=10,
                           start_q=0, max_q=10,
                           max_order=None,
                           max_d=2,
                           trace=True,
                           n_jobs=10,
                           error_action="ignore",
                           suppress_warnings=True,
                           stepwise=False)

        # Make predictions
        forecast = model.predict(n_periods=1)

        # `predict` may return a pandas Series whose index continues the
        # training index, so `forecast[0]` would be a label lookup rather than
        # "first element". Work on flat float arrays so indexing is positional.
        # The same flattening yields a scalar actual value whether the dataset
        # is a Series or a single-column DataFrame.
        forecast_values = np.asarray(forecast, dtype=float).ravel()
        actual_values = np.asarray(test_data, dtype=float).ravel()

        # Compute error metrics
        mae = mean_absolute_error(actual_values, forecast_values)
        mse = mean_squared_error(actual_values, forecast_values)

        # Store results
        arima_params.append(model.order)
        aics.append(model.aic())
        bics.append(model.bic())
        errors.append(mae)
        mses.append(mse)
        predictions_list.append(forecast_values[0])
        actual_values_list.append(actual_values[0])
        dates.append(predict_start.date())

        # Move the window forward by one day
        current_start += predict_horizon
        iteration += 1

    # Convert results to DataFrame
    results_df = pd.DataFrame({
        "Date": dates,
        "ARIMA_Params": arima_params,
        "AIC": aics,
        "BIC": bics,
        "MAE": errors,
        "MSE": mses,
        "Forecast": predictions_list,
        "Actual Values": actual_values_list
    })

    # Save results to CSV
    csv_filename = f"{filename.replace('.csv', '')}_arima_injected_results.csv"
    results_df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}\n")

print("Processing complete for all datasets.")