In [2]:
import pandas as pd
import pickle
import pmdarima
import numpy as np

In [3]:
# Load the cleaned EVA crop records from the local data folder.
# NOTE(review): the preview of this frame shows an `Unnamed: 0` column, which
# suggests the CSV was written together with its index — consider reading it
# with `index_col=0` after confirming nothing downstream relies on that column.
eva = pd.read_csv(
    filepath_or_buffer='./data/EVA_2006_2020_clean.csv',
)

In [3]:
# Preview the first five records to sanity-check the loaded schema.
eva.head(n=5)

Unnamed: 0,dept_code,dept,mun_code,mun,group,subgroup,crop,year,period,area_seeded,area_harvested,production,yield_,production_status,scientific_name,cycle
0,15,boyaca,15114,busbanza,hortalizas,acelga,acelga,2006,2006B,2.0,1.0,1.0,1.0,fruto fresco,beta vulgaris,transitorio
1,25,cundinamarca,25754,soacha,hortalizas,acelga,acelga,2006,2006B,82.0,80.0,1440.0,18.0,fruto fresco,beta vulgaris,transitorio
2,25,cundinamarca,25214,cota,hortalizas,acelga,acelga,2006,2006B,2.0,2.0,26.0,13.0,fruto fresco,beta vulgaris,transitorio
3,54,norte de santander,54405,los patios,hortalizas,acelga,acelga,2006,2006B,3.0,3.0,48.0,16.0,fruto fresco,beta vulgaris,transitorio
4,54,norte de santander,54518,pamplona,hortalizas,acelga,acelga,2006,2006B,1.0,1.0,5.0,5.0,fruto fresco,beta vulgaris,transitorio


# Training ARIMAX

In [4]:
# Index the records by year; drop=False keeps `year` available as a regular
# column as well, so it can still be used for column-based filtering.
df_ts = eva.set_index(keys="year", drop=False)

In [5]:
# Temporal hold-out split on the `year` column: records before 2019 are used
# for training, records from 2019 onwards for validation.
df_train = df_ts.loc[df_ts["year"] < 2019]
df_valid = df_ts.loc[df_ts["year"] >= 2019]

In [26]:
# One-hot encode the categorical `dept` and `group` columns; every level is
# kept (drop_first=False).
df_train_dummies = pd.get_dummies(df_train, columns=['dept', 'group'], drop_first=False)

# Encoding the validation split on its own can produce a different set of dummy
# columns whenever a department or group occurs in only one of the two splits.
# Align validation to the training layout: indicator columns missing from the
# validation split are added as all-zero columns, and columns that exist only
# in validation (unknown to anything trained on df_train_dummies) are dropped.
df_valid_dummies = (
    pd.get_dummies(df_valid, columns=['dept', 'group'], drop_first=False)
    .reindex(columns=df_train_dummies.columns, fill_value=0)
)

In [27]:
# Exogenous regressors: the seeded area plus every column whose name contains
# "group" or "dept" (substring match, kept in DataFrame column order).
# The substring test also catches the raw `subgroup` column, which is removed
# again right after the list is built.
# NOTE(review): the same substring test matches `dept_code` too (it appears in
# the printed feature list), so the numeric department code is passed to the
# models alongside the department dummies — confirm that this is intended.
exogenous_features = ['area_seeded'] + [
    col
    for col in df_train_dummies.columns
    if any(tag in col for tag in ("group", "dept"))
]
exogenous_features.remove('subgroup')

In [8]:
# Show the selected regressors, then how many there are (one item per line).
print(exogenous_features, len(exogenous_features), sep="\n")

['area_seeded', 'dept_code', 'dept_amazonas', 'dept_antioquia', 'dept_arauca', 'dept_atlantico', 'dept_bolivar', 'dept_boyaca', 'dept_caldas', 'dept_caqueta', 'dept_casanare', 'dept_cauca', 'dept_cesar', 'dept_choco', 'dept_cordoba', 'dept_cundinamarca', 'dept_guainia', 'dept_guaviare', 'dept_huila', 'dept_la guajira', 'dept_magdalena', 'dept_meta', 'dept_narino', 'dept_norte de santander', 'dept_putumayo', 'dept_quindio', 'dept_risaralda', 'dept_san andres y providencia', 'dept_santander', 'dept_sucre', 'dept_tolima', 'dept_valle del cauca', 'dept_vaupes', 'dept_vichada', 'group_cereales', 'group_fibras', 'group_flores y follajes', 'group_forestales', 'group_frutales', 'group_hongos', 'group_hortalizas', 'group_leguminosas', 'group_oleaginosas', 'group_otros permanentes', 'group_otros transitorios', 'group_plantas aromaticas, condimentarias y medicinales', 'group_tuberculos y platanos']
47


In [9]:
# 1. area_harvested_model training
# Stepwise auto-ARIMA search on the harvested area, using the selected
# exogenous regressors. Failed candidate fits are skipped (error_action="ignore").
# auto_arima returns the best model already fitted on the data it was given, so
# no separate .fit(...) call on the same inputs is needed; repeating it would
# only re-run one more expensive fit.
# NOTE(review): newer pmdarima releases name the regressor argument `X` instead
# of `exogenous` — confirm against the installed version before upgrading.
area_harvested_model = pmdarima.auto_arima(
    df_train_dummies.area_harvested,
    exogenous=df_train_dummies[exogenous_features],
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('./models/area_harvested_model.pickle', 'wb') as model_file:
    pickle.dump(area_harvested_model, model_file)

Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=966.02 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=2860011.943, Time=226.47 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=2819947.040, Time=229.50 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=2774239.045, Time=504.89 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=5175370.819, Time=36.62 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=4002.80 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=2766105.654, Time=841.39 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=2763119.935, Time=928.66 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=inf, Time=1046.84 sec
 ARIMA(0,1,3)(0,0,0)[0] intercept   : AIC=2764578.960, Time=997.51 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=909.48 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=inf, Time=1068.44 sec
 ARIMA(1,1,2)(0,0,0)[0]             : AIC=inf, Time=881.21 sec

Best model:  ARIMA(1,1,2)(0,0,0)[0] intercept
Total fit time: 12640.482 seconds

In [37]:
# 2. production_model training
# Stepwise auto-ARIMA search on production, using the selected exogenous
# regressors. Failed candidate fits are skipped (error_action="ignore").
# auto_arima returns the best model already fitted on the data it was given, so
# no separate .fit(...) call on the same inputs is needed; repeating it would
# only re-run one more expensive fit.
# NOTE(review): the execution counter of this cell (In [37]) is higher than that
# of the inf/NaN clean-up cells located further down the notebook, i.e. it was
# last run on the cleaned frame. A top-to-bottom run would train on the
# uncleaned frame instead — consider moving the clean-up above the model cells.
# NOTE(review): newer pmdarima releases name the regressor argument `X` instead
# of `exogenous` — confirm against the installed version before upgrading.
production_model = pmdarima.auto_arima(
    df_train_dummies.production,
    exogenous=df_train_dummies[exogenous_features],
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('./models/production_model.pickle', 'wb') as model_file:
    pickle.dump(production_model, model_file)

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=4822273.207, Time=689.36 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=4951286.119, Time=31.53 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=4833759.982, Time=102.33 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=4877534.975, Time=207.74 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=7266657.231, Time=31.83 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=4828720.418, Time=390.15 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=4822508.362, Time=512.47 sec
 ARIMA(3,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=950.49 sec
 ARIMA(2,0,3)(0,0,0)[0] intercept   : AIC=4822184.518, Time=1098.95 sec
 ARIMA(1,0,3)(0,0,0)[0] intercept   : AIC=4824981.842, Time=546.50 sec
 ARIMA(3,0,3)(0,0,0)[0] intercept   : AIC=inf, Time=1015.24 sec
 ARIMA(2,0,4)(0,0,0)[0] intercept   : AIC=4824744.836, Time=814.10 sec
 ARIMA(1,0,4)(0,0,0)[0] intercept   : AIC=4824030.758, Time=709.82 sec
 ARIMA(3,0,4)(0,0,0)[0] intercept   : AIC=inf, Tim

In [34]:
# Sanitize the training frame: convert +/-inf to NaN first, then zero-fill every
# missing value (this applies to all columns of the frame).
df_train_dummies = df_train_dummies.replace([np.inf, -np.inf], np.nan).fillna(0)

In [35]:
# Sanitize the validation frame the same way as the training frame: convert
# +/-inf to NaN first, then zero-fill every missing value (all columns).
df_valid_dummies = df_valid_dummies.replace([np.inf, -np.inf], np.nan).fillna(0)

In [36]:
# 3. yield_model training
# Stepwise auto-ARIMA search on yield, using the selected exogenous regressors.
# Failed candidate fits are skipped (error_action="ignore").
# auto_arima returns the best model already fitted on the data it was given, so
# no separate .fit(...) call on the same inputs is needed; repeating it would
# only re-run one more expensive fit.
# NOTE(review): newer pmdarima releases name the regressor argument `X` instead
# of `exogenous` — confirm against the installed version before upgrading.
yield_model = pmdarima.auto_arima(
    df_train_dummies.yield_,
    exogenous=df_train_dummies[exogenous_features],
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('./models/yield_model.pickle', 'wb') as model_file:
    pickle.dump(yield_model, model_file)

Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=1450663.682, Time=910.22 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=1537450.217, Time=211.39 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=1492214.292, Time=283.79 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=1454132.017, Time=881.74 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=1537451.934, Time=1159.94 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=1453917.737, Time=829.58 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=1449262.023, Time=766.77 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=1450934.038, Time=695.09 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=1475575.287, Time=666.93 sec
 ARIMA(3,1,1)(0,0,0)[0] intercept   : AIC=1448608.735, Time=974.40 sec
 ARIMA(3,1,0)(0,0,0)[0] intercept   : AIC=1467894.368, Time=592.28 sec
 ARIMA(4,1,1)(0,0,0)[0] intercept   : AIC=1451384.697, Time=1068.73 sec
 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=1449252.983, Time=991.00 sec
 ARIMA(4,1,0)(0,0,0)[0] intercep