In [1]:
import os
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

In [None]:
# Database connection settings. Each value is read from the environment so
# credentials never have to be typed into (and saved with) the notebook; the
# fallback for every setting is the empty placeholder used previously.
db_user = os.environ.get("DB_USER", "")
db_password = os.environ.get("DB_PASSWORD", "")
db_host = os.environ.get("DB_HOST", "")
db_port = os.environ.get("DB_PORT", "")
db_name = os.environ.get("DB_NAME", "")

# Percent-encode user and password: characters such as "@", ":" or "/" would
# otherwise be read as URL delimiters and corrupt the connection string.
connection = (
    f"postgresql://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    f"@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(connection)

In [3]:
# Pattern matched with LIKE against movements.toid (the trailing % is the SQL
# wildcard); the original note labels this code as Dozza.
ace = "08|037|025%"

In [4]:
# Daily totals of `datavalue` for every movement whose destination id matches
# the `ace` pattern. The pattern travels as a bound parameter instead of being
# spliced into the SQL text, so its content can never change the statement.
query = text(
    "SELECT DATE(datefrom) AS date, SUM(datavalue) AS datavalue "
    "FROM movements WHERE toid LIKE :ace GROUP BY date ORDER BY date;"
).bindparams(ace=ace)
try:
    with engine.connect() as connection:
        # Stream the result in 1000-row chunks and stitch them into one frame.
        chunks = pd.read_sql(query, connection, chunksize=1000)
        df = pd.concat(chunks, ignore_index=True)
    print(df)
except Exception as e:
    print(f"Errore durante l'esecuzione della query: {e}")
    # Re-raise: every later cell needs `df`; carrying on would only fail later
    # with an unrelated NameError or silently reuse a stale frame.
    raise

          date  datavalue
0   2019-08-01      10241
1   2019-08-02       9752
2   2019-08-03       9188
3   2019-08-04       7757
4   2019-08-05       8431
..         ...        ...
56  2019-09-26      10703
57  2019-09-27       9880
58  2019-09-28      10000
59  2019-09-29       9900
60  2019-09-30      10096

[61 rows x 2 columns]


In [5]:
from sklearn.preprocessing import LabelEncoder

# Calendar features derived from the daily `date` column.
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["weekday"] = df["date"].dt.weekday  # Monday=0 ... Sunday=6

# Week index counted from the first week present in the data (weeks start on
# Monday). Measuring the distance between week-start dates yields the same
# numbers as differencing ISO week numbers within a single year, but keeps
# increasing across a year boundary, where ISO week numbers restart.
week_start = df["date"].dt.normalize() - pd.to_timedelta(df["weekday"], unit="D")
df["week"] = (week_start - week_start.min()).dt.days // 7

# Saturday/Sunday flag, computed on the whole column at once.
is_weekend = df["weekday"].isin([5, 6])
df["weekend"] = is_weekend.astype(int)

# Holiday flag: weekends plus every day in week indices 1 and 2 (described in
# the original note as the 2nd and 3rd weeks of August).
df["festivo"] = (df["week"].isin([1, 2]) | is_weekend).astype(int)

# Month flag (August=0, September=1); any other month maps to NaN.
df["month"] = df["date"].dt.month.map({8: 0, 9: 1})
df

Unnamed: 0,date,datavalue,weekday,week,weekend,festivo,month
0,2019-08-01,10241,3,0,0,0,0
1,2019-08-02,9752,4,0,0,0,0
2,2019-08-03,9188,5,0,1,1,0
3,2019-08-04,7757,6,0,1,1,0
4,2019-08-05,8431,0,1,0,1,0
...,...,...,...,...,...,...,...
56,2019-09-26,10703,3,8,0,0,1
57,2019-09-27,9880,4,8,0,0,1
58,2019-09-28,10000,5,8,1,1,1
59,2019-09-29,9900,6,8,1,1,1


In [6]:
# Lagged copies of the target: the value observed 1, 2, 3 and 7 rows earlier.
# Rows with no earlier observation available get 0 instead of NaN.
daily_total = df["datavalue"]
for lag in (1, 2, 3, 7):
    df[f"lag_{lag}"] = daily_total.shift(lag).fillna(0)
df

Unnamed: 0,date,datavalue,weekday,week,weekend,festivo,month,lag_1,lag_2,lag_3,lag_7
0,2019-08-01,10241,3,0,0,0,0,0.0,0.0,0.0,0.0
1,2019-08-02,9752,4,0,0,0,0,10241.0,0.0,0.0,0.0
2,2019-08-03,9188,5,0,1,1,0,9752.0,10241.0,0.0,0.0
3,2019-08-04,7757,6,0,1,1,0,9188.0,9752.0,10241.0,0.0
4,2019-08-05,8431,0,1,0,1,0,7757.0,9188.0,9752.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
56,2019-09-26,10703,3,8,0,0,1,9204.0,10194.0,9868.0,9480.0
57,2019-09-27,9880,4,8,0,0,1,10703.0,9204.0,10194.0,10697.0
58,2019-09-28,10000,5,8,1,1,1,9880.0,10703.0,9204.0,9616.0
59,2019-09-29,9900,6,8,1,1,1,10000.0,9880.0,10703.0,9232.0


In [7]:
def _read_frame(query):
    """Run `query` on `engine` and return the full result as one DataFrame.

    The result is streamed in 1000-row chunks and concatenated. On failure the
    notebook's usual message is printed and the exception is re-raised, because
    the frames loaded here are required by the statements that follow.
    """
    try:
        with engine.connect() as connection:
            chunks = pd.read_sql(query, connection, chunksize=1000)
            return pd.concat(chunks, ignore_index=True)
    except Exception as e:
        print(f"Errore durante l'esecuzione della query: {e}")
        raise


# Weather data
query_weather = text("SELECT data, fenomeni FROM weather_data;")
df_weather = _read_frame(query_weather)

# Event data
query_events = text("SELECT * FROM dozza_eventi;")
df_events = _read_frame(query_events)

df_weather["data"] = pd.to_datetime(df_weather["data"])
df_events["data"] = pd.to_datetime(df_events["data"])

# Left-join both lookups on the day and discard the duplicate key column.
# validate="many_to_one" raises if a lookup table holds the same date twice,
# which would otherwise silently multiply rows of `df`.
df = df.merge(
    df_weather, left_on="date", right_on="data", how="left", validate="many_to_one"
).drop(columns="data")
df = df.merge(
    df_events, left_on="date", right_on="data", how="left", validate="many_to_one"
).drop(columns="data")

# Collapse `fenomeni` to a 0/1 flag: 0 for the label that LabelEncoder assigns
# code 0, 1 for every other label.
# NOTE(review): this presumes the "no phenomenon" value is the one encoded as
# 0 (e.g. an empty string sorting first) - confirm against weather_data.
le = LabelEncoder()
fenomeni_codes = le.fit_transform(df["fenomeni"])
df["fenomeni"] = (fenomeni_codes > 0).astype(int)
df

Unnamed: 0,date,datavalue,weekday,week,weekend,festivo,month,lag_1,lag_2,lag_3,lag_7,fenomeni,evento
0,2019-08-01,10241,3,0,0,0,0,0.0,0.0,0.0,0.0,0,0
1,2019-08-02,9752,4,0,0,0,0,10241.0,0.0,0.0,0.0,1,0
2,2019-08-03,9188,5,0,1,1,0,9752.0,10241.0,0.0,0.0,0,0
3,2019-08-04,7757,6,0,1,1,0,9188.0,9752.0,10241.0,0.0,0,0
4,2019-08-05,8431,0,1,0,1,0,7757.0,9188.0,9752.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,2019-09-26,10703,3,8,0,0,1,9204.0,10194.0,9868.0,9480.0,1,0
57,2019-09-27,9880,4,8,0,0,1,10703.0,9204.0,10194.0,10697.0,0,0
58,2019-09-28,10000,5,8,1,1,1,9880.0,10703.0,9204.0,9616.0,0,0
59,2019-09-29,9900,6,8,1,1,1,10000.0,9880.0,10703.0,9232.0,0,0


In [8]:
# Add the previous day's inflows into Bologna and Imola as features.
def _daily_inflow(query, column):
    """Run `query` on `engine` and return its daily totals ready for merging.

    The result is streamed in 1000-row chunks and concatenated; its
    `datavalue` column is renamed to `column` and `date` is converted to
    datetime. On failure the notebook's usual message is printed and the
    exception is re-raised, since the merge below cannot proceed without it.
    """
    try:
        with engine.connect() as connection:
            chunks = pd.read_sql(query, connection, chunksize=1000)
            frame = pd.concat(chunks, ignore_index=True)
    except Exception as e:
        print(f"Errore durante l'esecuzione della query: {e}")
        raise
    frame = frame.rename(columns={"datavalue": column})
    frame["date"] = pd.to_datetime(frame["date"])
    return frame


# One statement shared by both lookups; only the bound LIKE pattern differs.
_INFLOW_SQL = (
    "SELECT DATE(datefrom) AS date, SUM(datavalue) AS datavalue "
    "FROM movements WHERE toid LIKE :toid GROUP BY date ORDER BY date;"
)
query_bologna = text(_INFLOW_SQL).bindparams(toid="08|037|006%")
query_imola = text(_INFLOW_SQL).bindparams(toid="08|037|032%")

df_bologna = _daily_inflow(query_bologna, "bologna")
df_imola = _daily_inflow(query_imola, "imola")

# Drop columns left over from a previous run of this cell so that re-running
# it does not produce bologna_x / bologna_y duplicates.
df = df.drop(columns=["bologna", "imola"], errors="ignore")
df = pd.merge(df, df_bologna, on="date", how="left")
df = pd.merge(df, df_imola, on="date", how="left")

# Use the value of the preceding row (the day before); the first row has no
# predecessor and gets 0.
df["bologna"] = df["bologna"].shift(1).fillna(0)
df["imola"] = df["imola"].shift(1).fillna(0)
df

Unnamed: 0,date,datavalue,weekday,week,weekend,festivo,month,lag_1,lag_2,lag_3,lag_7,fenomeni,evento,bologna,imola
0,2019-08-01,10241,3,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
1,2019-08-02,9752,4,0,0,0,0,10241.0,0.0,0.0,0.0,1,0,562915.0,85571.0
2,2019-08-03,9188,5,0,1,1,0,9752.0,10241.0,0.0,0.0,0,0,524811.0,81407.0
3,2019-08-04,7757,6,0,1,1,0,9188.0,9752.0,10241.0,0.0,0,0,405302.0,70136.0
4,2019-08-05,8431,0,1,0,1,0,7757.0,9188.0,9752.0,0.0,0,0,313369.0,55002.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,2019-09-26,10703,3,8,0,0,1,9204.0,10194.0,9868.0,9480.0,1,0,807756.0,96977.0
57,2019-09-27,9880,4,8,0,0,1,10703.0,9204.0,10194.0,10697.0,0,0,801625.0,97325.0
58,2019-09-28,10000,5,8,1,1,1,9880.0,10703.0,9204.0,9616.0,0,0,798166.0,92172.0
59,2019-09-29,9900,6,8,1,1,1,10000.0,9880.0,10703.0,9232.0,0,0,679888.0,85540.0


In [9]:
# Make the date column numeric so it can be fed to the model as a feature.
# The recorded output shows the resulting integers are epoch timestamps in
# nanoseconds.
df["date"] = df["date"].astype(np.int64)
df

Unnamed: 0,date,datavalue,weekday,week,weekend,festivo,month,lag_1,lag_2,lag_3,lag_7,fenomeni,evento,bologna,imola
0,1564617600000000000,10241,3,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
1,1564704000000000000,9752,4,0,0,0,0,10241.0,0.0,0.0,0.0,1,0,562915.0,85571.0
2,1564790400000000000,9188,5,0,1,1,0,9752.0,10241.0,0.0,0.0,0,0,524811.0,81407.0
3,1564876800000000000,7757,6,0,1,1,0,9188.0,9752.0,10241.0,0.0,0,0,405302.0,70136.0
4,1564963200000000000,8431,0,1,0,1,0,7757.0,9188.0,9752.0,0.0,0,0,313369.0,55002.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,1569456000000000000,10703,3,8,0,0,1,9204.0,10194.0,9868.0,9480.0,1,0,807756.0,96977.0
57,1569542400000000000,9880,4,8,0,0,1,10703.0,9204.0,10194.0,10697.0,0,0,801625.0,97325.0
58,1569628800000000000,10000,5,8,1,1,1,9880.0,10703.0,9204.0,9616.0,0,0,798166.0,92172.0
59,1569715200000000000,9900,6,8,1,1,1,10000.0,9880.0,10703.0,9232.0,0,0,679888.0,85540.0


In [10]:
# Target: the daily total to predict.
y = df["datavalue"]

# Input columns handed to the model.
# NOTE(review): the lag, holiday, month, weather, event and Bologna/Imola
# columns built in earlier cells are not listed here - confirm this is
# intentional.
features = [
    "date",
    "weekday",
    "week",
    "weekend",
]

In [11]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler


In [12]:

X = df[features]

# Seeds for the repeated random train/test splits.
seeds = [0, 7, 13, 21, 42, 99, 123, 34, 67, 80]
results = []
test_dates_per_seed = {}

# Hyper-parameter grid: identical for every seed, so it is built once instead
# of on every iteration.
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Evaluate one tuned model per seed.
for seed in seeds:
    # 70/30 random split controlled by the seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

    # Remember which dates ended up in the test set for this seed.
    test_dates_per_seed[seed] = list(X_test['date'])

    # Scale features to [0, 1]; the scaler is fitted on the training part only.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # The model is trained on log1p(target). Despite the "_scaled" suffix these
    # are log-transformed values; the names are kept for compatibility.
    y_train_scaled = np.log1p(y_train)
    y_test_scaled = np.log1p(y_test)

    # 3-fold grid search over the shared grid (GridSearchCV's default scoring).
    model = GradientBoostingRegressor(random_state=seed)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=0)
    grid.fit(X_train_scaled, y_train_scaled)

    best_model = grid.best_estimator_

    # Predictions come back on the log1p scale; undo the transform.
    y_pred = np.expm1(best_model.predict(X_test_scaled))

    # Score against the untouched test target rather than a value that has
    # been pushed through log1p and back, which only adds rounding error.
    y_test_orig = y_test.astype(float)

    # Metrics on the original scale.
    mae = mean_absolute_error(y_test_orig, y_pred)
    mape = mean_absolute_percentage_error(y_test_orig, y_pred) * 100

    results.append({
        'seed': seed,
        'MAE': mae,
        'MAPE (%)': mape
    })

# Final results: one row per seed, plus the mean MAPE across seeds.
df_results = pd.DataFrame(results)
print(df_results)
print("MAPE medio:", df_results['MAPE (%)'].mean())

   seed         MAE   MAPE (%)
0     0  666.749419   7.407434
1     7  746.521189   9.389887
2    13  872.865608  10.301297
3    21  636.824850   7.600504
4    42  776.716388   8.848974
5    99  831.632494  10.254239
6   123  682.873296   7.903562
7    34  695.031765   7.763041
8    67  785.878730   8.516242
9    80  696.334947   7.684897
MAPE medio: 8.56700767747591
