In [9]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.ar_model import AutoReg as AR

# Load dataset
df = pd.read_csv("gibf1_stable_sections.csv")

# Scaling factor for error sensitivity adjustment
SCALING_FACTOR = 100

# Dictionary to store all section-wise data
section_data = {
    "df_section": {},
    "best_k": {},
    "aic": {},
    "model": {},
    "predictions": {},
    "reconstruction_error": {},
    "scaled_reconstruction_error": {},
    "iFMII": {}
}

# Storage for all results
all_results = []

# Iterate through sections
for section in range(1, 44):
    # Filter data for this section and reset index
    df_section = df[df["Stable_Section"] == section].copy().reset_index(drop=True)
    section_data["df_section"][section] = df_section

    print(f"\n🔹 Processing df{section} (Stable_Section {section})...")

    # Ensure there are enough data points
    if len(df_section) < 10:
        print(f"⚠️ Not enough data for df{section}. Skipping...")
        all_results.append({"Section": section, "Best_k": None, "AIC": None, "iFMII": None})
        continue

    # Step 1: Find the best k using AIC
    best_k = 1
    best_aic = float("inf")

    for k in range(1, min(6, len(df_section) - 5)):  # Ensure valid k values
        try:
            model = AR(df_section["closePrice"], lags=k).fit()
            if model.aic < best_aic:
                best_aic = model.aic
                best_k = k
        except Exception as e:
            print(f"⚠️ AR model failed for k={k} in df{section}: {e}")
            continue

    section_data["best_k"][section] = best_k
    section_data["aic"][section] = best_aic
    print(f"✅ Best AR order (k) for df{section}: {best_k} (AIC = {best_aic})")

    # Step 2: Fit the best AR model
    try:
        print(f"🔧 Fitting AR({best_k}) model on df{section}...")
        section_data["model"][section] = AR(df_section["closePrice"], lags=best_k).fit()
    except Exception as e:
        print(f"⚠️ AR model fitting failed for df{section}: {e}")
        continue

    # Fix Prediction Issue: Ensure Proper Indexing
    train_size = len(df_section) - best_k

    if train_size <= 0:
        print(f"⚠️ Not enough data points for prediction in df{section}. Skipping...")
        continue

    try:
        print(f"🔮 Predicting for df{section} (Train size: {train_size}, Total size: {len(df_section)})...")

        # Make sure start and end indices are within valid range
        start_idx = max(train_size, 0)
        end_idx = len(df_section) - 1

        section_data["predictions"][section] = section_data["model"][section].predict(start=start_idx, end=end_idx)

        print(f"📊 Predictions for df{section}:\n{section_data['predictions'][section]}")

    except Exception as e:
        print(f"⚠️ Prediction failed for df{section}: {e}")
        continue

    # If predictions are NaN, print possible causes
    if section_data["predictions"][section].isna().sum() == len(section_data["predictions"][section]):
        print(f"⚠️ All predictions are NaN for df{section}. Possible issues:")
        print(f"   - Check if data index is properly reset (Try df_section.reset_index())")
        print(f"   - Ensure start={start_idx}, end={end_idx} are valid indices")
        print(f"   - Dataset size: {len(df_section)}, Best k: {best_k}")
        continue

    # Step 3: Compute Reconstruction Error
    actual_values = df_section["closePrice"].values[start_idx:]  # Align with predictions
    reconstructed_values = section_data["predictions"][section].values

    # Ensure predictions and actual_values have the same length
    if len(actual_values) != len(reconstructed_values):
        print(f"⚠️ Mismatch in lengths for df{section}. Actual: {len(actual_values)}, Predicted: {len(reconstructed_values)}")
        continue

    # Compute squared reconstruction error
    section_data["reconstruction_error"][section] = np.square(actual_values - reconstructed_values)
    print(f"🔄 Reconstruction Error for df{section}:\n{section_data['reconstruction_error'][section]}")

    # Step 4: Scale Reconstruction Error Instead of Normalizing
    section_data["scaled_reconstruction_error"][section] = section_data["reconstruction_error"][section] * SCALING_FACTOR

    # Compute iFMII using the scaled reconstruction error
    section_data["iFMII"][section] = np.nanmean(section_data["scaled_reconstruction_error"][section])

    # Print scaled reconstruction error
    print(f"📈 Scaled Reconstruction Error for df{section}:\n{section_data['scaled_reconstruction_error'][section]}")
    print(f"📉 Mean iFMII for df{section}: {section_data['iFMII'][section]}")

    # Step 5: Store Results for CSV Output
    for idx, (actual, predicted, error, scaled_error) in enumerate(zip(actual_values, reconstructed_values, 
                                                                       section_data["reconstruction_error"][section], 
                                                                       section_data["scaled_reconstruction_error"][section])):
        all_results.append({
            "Section": section,
            "Index": idx + start_idx,
            "Actual_ClosePrice": actual,
            "Predicted_ClosePrice": predicted,
            "Reconstruction_Error": error,
            "Scaled_Reconstruction_Error": scaled_error,
            "iFMII": section_data["iFMII"][section]
        })

# Convert results to a DataFrame and save to CSV
df_results = pd.DataFrame(all_results)
df_results.to_csv("gibf1_section_predictions.csv", index=False)

print("\n🔍 Final Results Saved to gibf1_section_predictions.csv")



🔹 Processing df1 (Stable_Section 1)...
✅ Best AR order (k) for df1: 1 (AIC = 2.262959188703989)
🔧 Fitting AR(1) model on df1...
🔮 Predicting for df1 (Train size: 19, Total size: 20)...
📊 Predictions for df1:
19    8.307491
dtype: float64
🔄 Reconstruction Error for df1:
[0.01155426]
📈 Scaled Reconstruction Error for df1:
[1.15542556]
📉 Mean iFMII for df1: 1.1554255577911408

🔹 Processing df2 (Stable_Section 2)...
✅ Best AR order (k) for df2: 1 (AIC = 2.838305299504773)
🔧 Fitting AR(1) model on df2...
🔮 Predicting for df2 (Train size: 19, Total size: 20)...
📊 Predictions for df2:
19    8.378938
dtype: float64
🔄 Reconstruction Error for df2:
[0.02926217]
📈 Scaled Reconstruction Error for df2:
[2.92621749]
📉 Mean iFMII for df2: 2.92621748767626

🔹 Processing df3 (Stable_Section 3)...
✅ Best AR order (k) for df3: 2 (AIC = -6.803899706707822)
🔧 Fitting AR(2) model on df3...
🔮 Predicting for df3 (Train size: 18, Total size: 20)...
📊 Predictions for df3:
18    8.319595
19    8.326380
dtype: f

In [11]:
# Rank iFMII in descending order (higher iFMII = higher anomaly)
df_results["iFMII_rank"] = df_results["iFMII"].rank(ascending=False, method="min")

# Calculate p-value using the rank
total_predictions = len(df_results)
df_results["p_value_linear"] = df_results["iFMII_rank"] / total_predictions

# Save the updated results to CSV
df_results.to_csv("gibf1_section_predictions.csv", index=False)

print("\n✅ Results saved to section_predictio.csv")


✅ Results saved to section_predictio.csv
