In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit an ARIMA model to the "income" series, evaluate it on a chronological
# hold-out, then forecast five further periods, save them to CSV and plot.

# Load dataset
df = pd.read_csv("merged_data.csv")  # Replace with your actual file path

# Extract relevant data
df_gini = df[["year", "income"]]  # Assuming "income" represents the Gini coefficient

# Split data into training and testing sets by row position (first 80% / last 20%).
# NOTE(review): this assumes the rows are already ordered by "year" with one
# row per year -- confirm against merged_data.csv.
train_size = int(len(df_gini) * 0.8)  # Use 80% for training
train, test = df_gini[:train_size], df_gini[train_size:]

# Single definition of the ARIMA (p, d, q) order so the evaluation model and
# the final forecasting model cannot drift apart.
arima_order = (3, 1, 3)

# Train the evaluation model on the training portion only
arima_model = ARIMA(train["income"], order=arima_order)
arima_result = arima_model.fit()

# Predict on test data: the forecast starts right after the last training row,
# so it lines up one-to-one with the held-out rows.
test_predictions = arima_result.forecast(steps=len(test))

# Evaluate the model
mae = mean_absolute_error(test["income"], test_predictions)
mse = mean_squared_error(test["income"], test_predictions)
rmse = np.sqrt(mse)
print(f"Model Evaluation:\nMAE: {mae:.4f}\nMSE: {mse:.4f}\nRMSE: {rmse:.4f}")

# Predict future Gini coefficient for years 111 to 115
# NOTE(review): the year labels are hard-coded and assume the observed data
# ends at year 110 -- confirm against the dataset.
future_years = list(range(111, 116))

# Refit the same specification on the FULL series before forecasting.
# Forecasting from `arima_result` would start at the end of the training
# split, i.e. it would reproduce the first test-period predictions and
# mislabel them as future years.
final_model = ARIMA(df_gini["income"], order=arima_order)
final_result = final_model.fit()
future_predictions = final_result.forecast(steps=len(future_years))

# Introduce synthetic noise to simulate volatility in the predictions.
# NOTE: after this step the saved values are forecast + random noise, not the
# model's point forecast; `future_predictions` keeps the unperturbed values.
volatility_factor = df_gini["income"].diff().std()  # Estimate volatility from past fluctuations
np.random.seed(42)  # For reproducibility
adjusted_predictions = future_predictions + np.random.normal(
    scale=volatility_factor, size=len(future_years)
)

# Store predictions in a DataFrame
predictions_df = pd.DataFrame({"year": future_years, "predicted_gini": adjusted_predictions})

# Save predictions to CSV
predictions_df.to_csv("predicted_gini_111_115.csv", index=False)
print("Predictions saved to predicted_gini_111_115.csv")

# Plot results: observed series, hold-out predictions, and the future forecast
plt.figure(figsize=(10, 5))
plt.plot(df_gini["year"], df_gini["income"], label="Actual Gini Coefficient")
plt.plot(test["year"], test_predictions, label="Test Predictions", linestyle="dashed")
plt.plot(future_years, adjusted_predictions, label="Predicted Gini Coefficient", linestyle="dotted")
plt.xlabel("Year")
plt.ylabel("Gini Coefficient")
plt.legend()
plt.title("Improved Gini Coefficient Predictions and Model Evaluation")
plt.show()