# 1. Setup and Data Loading

In [14]:
import pandas as pd
import seaborn as sns
from IPython.display import display

In [15]:
# Configure visual style for plots
sns.set_style("whitegrid")

# Path to the processed quarterly dataset
data_path = '../data/dados_processados_trimestrais.csv'

try:
    # Load the DataFrame, using the first column ('Date') as the time index
    df_final = pd.read_csv(data_path, index_col=0, parse_dates=True)
    
    # Rename the index to 'date' for consistency
    df_final.index.name = 'date'
    
    # Rename columns to shorter and clearer names
    df_final.rename(columns={
        'GDP_YoY_Growth': 'gdp',
        'Total_Corporate_Credit': 'corporate_credit',
        'Total_Household_Credit': 'household_credit',
        'Total_Debt': 'total_debt'
    }, inplace=True)

    print("DataFrame loaded and prepared successfully!")
    display(df_final.head())

except FileNotFoundError:
    print(f"ERROR: The data file was not found at '{data_path}'. Please check the path.")

DataFrame loaded and prepared successfully!


Unnamed: 0_level_0,gdp,corporate_credit,household_credit,total_debt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-12-31,3.0,4877.6,11.6,156083.31
2008-03-31,1.7,5278.5,11.8,158296.02
2008-06-30,0.9,6077.3,13.0,160952.95
2008-09-30,0.5,6898.4,14.1,162444.83
2008-12-31,-1.9,7738.9,14.7,163756.38


# 2. Results Centralization

In [16]:
# Updated anomaly dates (based on latest model outputs)

# Isolation Forest anomalies
iso_anomaly_dates = [
    '2012-09-30',
    '2020-06-30',
    '2021-06-30',
    '2022-03-31'
]

# STL anomalies (aggregated across the 4 series)
stl_anomaly_dates = [
    '2017-03-31', '2017-06-30',   # Household credit
    '2018-12-31', '2019-03-31',   # Total debt
    '2020-06-30', '2021-06-30'    # GDP
]

# Prophet anomalies (GDP only)
prophet_anomaly_dates = [
    '2020-06-30',
    '2021-06-30',
    '2022-03-31'
]

# Convert lists of strings to DatetimeIndex for safe matching
iso_anomaly_idx = pd.to_datetime(iso_anomaly_dates)
stl_anomaly_idx = pd.to_datetime(stl_anomaly_dates)
prophet_anomaly_idx = pd.to_datetime(prophet_anomaly_dates)

# Initialise anomaly flag columns with 0 (no anomaly)
df_final['anomaly_isoforest'] = 0
df_final['anomaly_stl'] = 0
df_final['anomaly_prophet'] = 0

# Set 1 (anomaly) on the corresponding dates
df_final.loc[df_final.index.isin(iso_anomaly_idx), 'anomaly_isoforest'] = 1
df_final.loc[df_final.index.isin(stl_anomaly_idx), 'anomaly_stl'] = 1
df_final.loc[df_final.index.isin(prophet_anomaly_idx), 'anomaly_prophet'] = 1

print("Anomaly flag columns added to DataFrame.")
print("\nQuick check for 2020–2021:")
display(df_final.loc['2020':'2021', ['gdp', 'anomaly_isoforest', 'anomaly_stl', 'anomaly_prophet']])


Anomaly flag columns added to DataFrame.

Quick check for 2020–2021:


Unnamed: 0_level_0,gdp,anomaly_isoforest,anomaly_stl,anomaly_prophet
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-31,-2.4,0,0,0
2020-06-30,-17.6,1,1,1
2020-09-30,-6.1,0,0,0
2020-12-31,-6.6,0,0,0
2021-03-31,-5.4,0,0,0
2021-06-30,16.5,1,1,1
2021-09-30,5.5,0,0,0
2021-12-31,7.2,0,0,0


In [None]:
# 1. Create 'anomaly_count' by summing how many models flagged an anomaly
df_final['anomaly_count'] = df_final[['anomaly_isoforest',
                                           'anomaly_stl',
                                           'anomaly_prophet']].sum(axis=1)

# 2. Keep only rows where at least one model detected an anomaly
df_comparativo = df_final[df_final['anomaly_count'] > 0].copy()

# 3. Sort: first by highest agreement (descending), then by date (index)
df_comparativo = (
    df_comparativo
    .sort_values('anomaly_count', ascending=False)
    .sort_index()
)

# 4. Select and reorder columns for a clear final presentation
colunas_para_exibir = [
    'gdp', 'corporate_credit', 'household_credit', 'total_debt',
    'anomaly_isoforest', 'anomaly_stl', 'anomaly_prophet', 'anomaly_count'
]
df_comparativo = df_comparativo[colunas_para_exibir]

print("Comparative anomaly table generated successfully!")
print("\nThis table summarises all anomalous events detected by the three models:")
display(df_comparativo)


Comparative anomaly table generated successfully!

This table summarises all anomalous events detected by the three models:


Unnamed: 0_level_0,gdp,corporate_credit,household_credit,total_debt,anomaly_isoforest,anomaly_stl,anomaly_prophet,anomaly_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-09-30,-4.3,31060.9,33.5,158446.81,1,0,0,1
2017-03-31,3.7,34215.1,31.0,135192.51,0,1,0,1
2017-06-30,3.5,32723.8,37.6,134992.61,0,1,0,1
2018-12-31,2.9,19948.5,26.9,136294.43,0,1,0,1
2019-03-31,3.2,16249.2,22.9,141697.75,0,1,0,1
2020-06-30,-17.6,8896.8,18.8,143217.5,1,1,1,3
2021-06-30,16.5,7105.6,16.5,147519.12,1,1,1,3
2022-03-31,12.2,5092.0,13.2,151547.11,1,0,1,2


# 3. Interactive Visualization of GDP with all Anomalies detected

In [19]:
import plotly.graph_objects as go

# Criar a figura principal
fig = go.Figure()

# 1. Adicionar a linha da série temporal do PIB
fig.add_trace(go.Scatter(
    x=df_final.index,
    y=df_final['gdp'],
    mode='lines',
    name='GDP YoY Growth (%)',
    line=dict(color='lightgrey')
))

# 2. Adicionar marcadores para as anomalias do Isolation Forest
df_anom_iso = df_final[df_final['anomaly_isoforest'] == 1]
fig.add_trace(go.Scatter(
    x=df_anom_iso.index,
    y=df_anom_iso['gdp'],
    mode='markers',
    name='Anomaly: Isolation Forest (Systemic)',
    marker=dict(color='red', size=12, symbol='circle')
))

# 3. Adicionar marcadores para as anomalias do STL
df_anom_stl = df_final[df_final['anomaly_stl'] == 1]
fig.add_trace(go.Scatter(
    x=df_anom_stl.index,
    y=df_anom_stl['gdp'],
    mode='markers',
    name='Anomaly: STL (Turning Point)',
    marker=dict(color='green', size=12, symbol='diamond')
))

# 4. Adicionar marcadores para as anomalias do Prophet
df_anom_prophet = df_final[df_final['anomaly_prophet'] == 1]
fig.add_trace(go.Scatter(
    x=df_anom_prophet.index,
    y=df_anom_prophet['gdp'],
    mode='markers',
    name='Anomaly: Prophet (Forecast Deviation)',
    marker=dict(color='purple', size=12, symbol='x')
))

# 5. Adicionar marcadores para consensos (mais do que um modelo)
df_consensus = df_final[df_final['contagem_anomalias'] > 1]
fig.add_trace(go.Scatter(
    x=df_consensus.index,
    y=df_consensus['gdp'],
    mode='markers',
    name='Consensus (Multiple Models)',
    marker=dict(color='gold', size=18, symbol='star', line=dict(color='black', width=1))
))

# 6. Layout Final
fig.update_layout(
    title='Comparative Anomaly Analysis on Portuguese GDP',
    xaxis_title='Date',
    yaxis_title='GDP YoY Growth (%)',
    legend_title='Models',
    template='plotly_white'
)

# Exibir o gráfico interativo
fig.show()


# 4. Final Analysis and Conclusions

The comparative view of the three models (Isolation Forest, STL decomposition, and Prophet) helps clarify the different types of anomalies observed in Portugal’s recent macroeconomic data.

**1. Consensus on the major shocks**  
The dates where all three models agree represent the most significant disruptions in the period studied.
- **Q2 2020:** The sharp fall in GDP during the first COVID-19 lockdown is captured as a systemic outlier, a clear break in trend, and a deviation well outside the forecast range.  
- **Q2 2021:** The strong rebound driven by base effects is also identified by all models as an exceptional event.

**2. Complementary roles of the three methods**  
The value of the analysis comes from the differences between models, as each highlights a distinct type of irregularity.

- **Isolation Forest:** Detects systemic imbalances that appear when the joint behaviour of GDP, credit and debt diverges from historical patterns. The isolated anomalies in 2012 and early 2022 reflect periods of unusual multivariate dynamics.  
- **STL:** Identifies turning points within individual series. The anomalies in 2011, 2017 and 2018–2019 correspond to shifts linked to credit contraction and the bottom of the debt cycle.  
- **Prophet:** Focused only on GDP, flags values that fall outside what the model would reasonably expect. Late 2021 and 2022 show stronger-than-expected growth that exceeds the forecast interval.

**Final conclusion**  
Each method brings a different perspective. Together, they allow us to separate systemic disruptions, changes in trend within individual series, and growth outcomes that outperform expectations. This combined view offers a clearer understanding of recent economic irregularities.

**Next step:** build an interactive dashboard, so everyone can explore the anomalies and their interpretation dynamically. This is done in [app.py](../app.py))

