In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import permutation_test
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# 1. Carga el dataset
df = pd.read_csv('../outputs/metrics.csv')

# 2. Log-transform si hay skew (ejemplo opcional)
df['MAE_log'] = np.log1p(df['MAE'])

# 3. Verificar supuestos clásicos ANOVA (opcional antes de permutación)

# 4. Prueba ANOVA por permutaciones: función general
def permutation_anova(df, formula, n_permutations=5000):
    """
    Realiza ANOVA por permutaciones sobre la fórmula especificada,
    devolviendo F observada y p‑valor empírico.
    """
    model = smf.ols(formula, data=df).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    f_obs = anova_table['F'].iloc[0]
    groups = df.copy()
    f_perms = []
    for _ in range(n_permutations):
        groups['MAE_perm'] = np.random.permutation(groups['MAE'])
        mod = smf.ols(formula.replace('MAE', 'MAE_perm'), data=groups).fit()
        ft = sm.stats.anova_lm(mod, typ=2)['F'].iloc[0]

        f_perms.append(ft)
    p_val = np.mean([f >= f_obs for f in f_perms])
    return f_obs, p_val, anova_table

# 5. Ejemplo: efecto de Method sobre MAE, controlando por Disease




In [4]:
# 6. Si hay múltiples factores: combinaciones
formulas = [
    'MAE ~ C(Method)',
    'MAE ~ C(Disease)',
    'MAE ~ C(Level)',
    'MAE ~ C(Method) + C(Disease) + C(Level) + C(Q("Prediction Weeks"))',
    'MAE ~ C(Method) * C(Disease)',
    'MAE ~ C(Method) * C(Level)',

]

results = {}

for formula in formulas:
    f_stat, p_val, table = permutation_anova(df, formula, 5000)
    results[formula] = {'F': f_stat, 'p_perm': p_val, 'table': table}
    print(f'\n📌 Modelo: {formula}')
    print(table)
    print(f'  → estadístico F = {f_stat:.3f}, p_perm = {p_val:.5f}')


📌 Modelo: MAE ~ C(Method)
                 sum_sq      df          F        PR(>F)
C(Method)  2.384027e+06     7.0  47.366141  7.057260e-66
Residual   4.433519e+07  6166.0        NaN           NaN
  → estadístico F = 47.366, p_perm = 0.00000

📌 Modelo: MAE ~ C(Disease)
                  sum_sq      df          F        PR(>F)
C(Disease)  3.793239e+05     1.0  50.522069  1.311226e-12
Residual    4.633989e+07  6172.0        NaN           NaN
  → estadístico F = 50.522, p_perm = 0.00000

📌 Modelo: MAE ~ C(Level)
                sum_sq      df          F         PR(>F)
C(Level)  5.930867e+06    23.0  38.880275  7.136502e-162
Residual  4.078835e+07  6150.0        NaN            NaN
  → estadístico F = 38.880, p_perm = 0.00000

📌 Modelo: MAE ~ C(Method) + C(Disease) + C(Level) + C(Q("Prediction Weeks"))
                                sum_sq      df          F         PR(>F)
C(Method)                 2.422663e+06     7.0  56.251431   1.900730e-78
C(Disease)                4.062930e+05     1

PatsyError: error tokenizing input (maybe an unclosed string?)
    MAE ~ C(Method) * C(`Prediction Weeks`)
                        ^

In [6]:
formulas = [

    'MAE ~ C(Method) * C(Q("Prediction Weeks"))',
    'MAE ~ C(Disease) * C(Level)',
    'MAE ~ C(Method) * C(Disease) * C(Q("Prediction Weeks"))',
    'MAE ~ C(Method) * C(Disease) + C(Q("Prediction Weeks"))',
    'MAE ~ C(Method) + C(Disease) + C(Level) + C(Q("Prediction Weeks")) + C(Method) * C(Disease) + C(Method) * C(Level)',

]

results = {}

for formula in formulas:
    f_stat, p_val, table = permutation_anova(df, formula, 5000)
    results[formula] = {'F': f_stat, 'p_perm': p_val, 'table': table}
    print(f'\n📌 Modelo: {formula}')
    print(table)
    print(f'  → estadístico F = {f_stat:.3f}, p_perm = {p_val:.5f}')


📌 Modelo: MAE ~ C(Method) * C(Q("Prediction Weeks"))
                                          sum_sq      df          F  \
C(Method)                           2.384240e+06     7.0  47.559629   
C(Q("Prediction Weeks"))            1.916489e+05     3.0   8.920136   
C(Method):C(Q("Prediction Weeks"))  1.566475e+05    21.0   1.041575   
Residual                            4.398689e+07  6142.0        NaN   

                                          PR(>F)  
C(Method)                           3.807431e-66  
C(Q("Prediction Weeks"))            6.789972e-06  
C(Method):C(Q("Prediction Weeks"))  4.071516e-01  
Residual                                     NaN  
  → estadístico F = 47.560, p_perm = 0.00000

📌 Modelo: MAE ~ C(Disease) * C(Level)
                           sum_sq      df          F         PR(>F)
C(Disease)           4.027601e+05     1.0  63.086256   2.336690e-15
C(Level)             5.954303e+06    23.0  40.550054  6.403126e-169
C(Disease):C(Level)  1.275514e+06    23.0   8.6

In [None]:
# 7. Post-hoc permutacional: pares de grupos
pairs = list(combinations(df['Method'].unique(), 2))
results = {}
for a, b in pairs:
    group_a = df[df['Method']==a]['MAE_log']
    group_b = df[df['Method']==b]['MAE_log']
    res = permutation_test((group_a, group_b),
                           statistic=lambda x, y: np.mean(x)-np.mean(y),
                           permutation_type='independent',
                           n_resamples=5000,
                           alternative='two-sided',
                           vectorized=False)
    results[(a,b)] = res.pvalue

print("Post‑hoc permutacional p‑values:")
for pair, p in results.items():
    print(pair, "p =", p)