In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine,text
from sqlalchemy.engine import URL

In [None]:
# Database connection settings.
# Values are read from environment variables so that real credentials never
# have to be typed into (and saved with) the notebook. Each setting falls back
# to an empty string, which is what the placeholders held before.
db_user = os.environ.get("DB_USER", "")
db_password = os.environ.get("DB_PASSWORD", "")
db_host = os.environ.get("DB_HOST", "")
db_port = os.environ.get("DB_PORT", "")
db_name = os.environ.get("DB_NAME", "")

In [3]:
# Build the connection URL from its components instead of string formatting:
# URL.create escapes special characters in the credentials (e.g. "@", ":" or
# "/" inside the password), which would otherwise corrupt the URL.
# Empty settings are passed as None so that they are simply left out.
connection = URL.create(
    "postgresql",
    username=db_user or None,
    password=db_password or None,
    host=db_host or None,
    port=int(db_port) if db_port else None,
    database=db_name or None,
)
engine = create_engine(connection)

In [4]:
# Aggregate on the database side: one row per (day, layerid, toid) with the
# summed datavalue. Ordering by all three keys makes the row order fully
# deterministic, not only the order of the days.
query = text("""SELECT DATE(datefrom) AS date, layerid, toid, SUM(datavalue) AS datavalue
             FROM movements 
            GROUP BY date, layerid, toid ORDER BY date, layerid, toid;""")
try:
    # Bound to `conn` so the `connection` value defined in the previous cell
    # is not overwritten by the context manager.
    with engine.connect() as conn:
        # Fetch the result in chunks of 1000 rows and stitch them together.
        chunks = pd.read_sql(query, conn, chunksize=1000)
        df = pd.concat(chunks, ignore_index=True)
    print(df)
except Exception as e:
    print(f"Errore durante l'esecuzione della query: {e}")
    # Re-raise: every later cell needs `df`, so carrying on after a failed
    # query would only surface as an unrelated NameError further down.
    raise

               date             layerid                toid  datavalue
0        2019-08-01  08|033|001|000|000  08|033|004|000|000         57
1        2019-08-01  08|033|001|000|000  08|033|005|000|000         39
2        2019-08-01  08|033|001|000|000  08|033|006|000|000        387
3        2019-08-01  08|033|001|000|000  08|033|007|000|000          9
4        2019-08-01  08|033|001|000|000  08|033|008|000|000         96
...             ...                 ...                 ...        ...
3496041  2019-09-30  08|099|027|000|000  08|099|023|000|000        216
3496042  2019-09-30  08|099|027|000|000  08|099|024|000|000        200
3496043  2019-09-30  08|099|027|000|000  08|099|025|000|000        264
3496044  2019-09-30  08|099|027|000|000  08|099|026|000|000        120
3496045  2019-09-30  08|099|027|000|000  08|099|999|000|255        241

[3496046 rows x 4 columns]


In [5]:
from sklearn.preprocessing import LabelEncoder

# --- Calendar features derived from the date column ---
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["weekday"] = df["date"].dt.weekday  # Monday=0 ... Sunday=6
# isocalendar() builds a whole frame, so compute it once. Weeks are numbered
# relative to the earliest ISO week present in the data.
iso_week = df["date"].dt.isocalendar().week
df["week"] = iso_week - iso_week.min()
# Saturday (5) / Sunday (6) flag, vectorised instead of a per-row lambda.
df["weekend"] = df["weekday"].isin([5, 6]).astype("int64")
# Replace the timestamp by its int64 representation so it is a plain numeric
# feature. NOTE: this overwrites the parsed dates, so this cell is meant to be
# run once per freshly loaded `df`.
df["date"] = df["date"].astype("int64")

# --- Integer codes for the two identifier columns ---
# One encoder per column: a single shared instance would be refitted by the
# second fit_transform and lose the layerid mapping, making it impossible to
# decode layerid values later with inverse_transform.
layerid_encoder = LabelEncoder()
toid_encoder = LabelEncoder()
df["layerid"] = layerid_encoder.fit_transform(df["layerid"])
df["toid"] = toid_encoder.fit_transform(df["toid"])
# Backward-compatible alias: previously `label_encoder` ended up fitted on toid.
label_encoder = toid_encoder
df

Unnamed: 0,date,layerid,toid,datavalue,weekday,week,weekend
0,1564617600000000000,0,3,57,3,0,0
1,1564617600000000000,0,4,39,3,0,0
2,1564617600000000000,0,5,387,3,0,0
3,1564617600000000000,0,6,9,3,0,0
4,1564617600000000000,0,7,96,3,0,0
...,...,...,...,...,...,...,...
3496041,1569801600000000000,465,462,216,0,9,0
3496042,1569801600000000000,465,463,200,0,9,0
3496043,1569801600000000000,465,464,264,0,9,0
3496044,1569801600000000000,465,465,120,0,9,0


In [6]:
# Target is the aggregated datavalue; every remaining column is a feature.
y = df["datavalue"]
X = df.drop(columns=["datavalue"])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. The seed is fixed so that the split,
# and therefore every metric reported below, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

print("*** Random Forest ***")
# random_state makes the forest reproducible; n_jobs=-1 builds the trees on all
# available cores without changing the fitted model.
Model_RF = RandomForestRegressor(n_jobs=-1, random_state=42)
Model_RF.fit(X_train, y_train)
y_pred_RF = Model_RF.predict(X_test)

# MAPE in percent, computed once and reused by the summary figure at the end.
MAPE_RF = mean_absolute_percentage_error(y_test, y_pred_RF)*100
print('MAE =', mean_absolute_error(y_test, y_pred_RF))
print('MAPE =', MAPE_RF ,'%')

# True vs. predicted values on the hold-out set.
plt.scatter(y_test, y_pred_RF)
plt.xlabel('y_test')
plt.ylabel('y_pred')
plt.show()

*** Random Forest ***


In [None]:
# Overlay the first 100 hold-out targets and Random Forest predictions as
# semi-transparent bars so that mismatches show up as colour differences.
indici = np.arange(100)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(indici, y_test[:100], color='red', alpha=0.5, label='y_test')
ax.bar(indici, y_pred_RF[:100], color='blue', alpha=0.5, label='y_pred')
ax.set_xlabel('record')
ax.set_ylabel('datavalue')
ax.set_title('Random Forest: confronto tra y_test e y_pred')
ax.legend()
plt.show()

K-Nearest Neighbors

In [None]:
from sklearn.model_selection import  GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neighbors import KNeighborsRegressor

k = 5

In [None]:
# Feature selection: keep the 5 columns with the highest f_regression score.
# The selector is fitted on the training split only and then applied to both.
selector1 = SelectKBest(score_func=f_regression, k=5).fit(X_train, y_train)
X_train_selected1 = selector1.transform(X_train)
X_test_selected1 = selector1.transform(X_test)

# Min-max scaling, again fitted on the training split only.
scaler = MinMaxScaler().fit(X_train_selected1)
X_train_scaled_KNN1 = scaler.transform(X_train_selected1)
X_test_scaled_KNN1 = scaler.transform(X_test_selected1)

# The model is trained on log(1 + y) targets.
y_train_scaled_KNN1, y_test_scaled_KNN1 = np.log1p(y_train), np.log1p(y_test)



In [None]:
print("*** K-Nearest Neighbors ***")
# 3 nearest neighbours, weighted by inverse Manhattan distance.
# n_jobs=-1 only parallelises the neighbour search; predictions are unchanged.
Model_KNN1 = KNeighborsRegressor(
    n_neighbors=3,
    weights='distance',
    metric='manhattan',
    n_jobs=-1,
)

Model_KNN1.fit(X_train_scaled_KNN1, y_train_scaled_KNN1)
# The model predicts on the log1p scale; map predictions back with expm1.
y_pred_KNN1 = np.expm1(Model_KNN1.predict(X_test_scaled_KNN1))

# Score against y_test directly instead of rebinding it to
# expm1(log1p(y_test)): the round trip adds nothing and would replace the
# shared hold-out target for every later cell.
MAPE_KNN1 = mean_absolute_percentage_error(y_test, y_pred_KNN1)*100
print('MAE =', mean_absolute_error(y_test, y_pred_KNN1))
print('MAPE =',MAPE_KNN1,'%')

# True vs. predicted values on the hold-out set.
plt.scatter(y_test, y_pred_KNN1)
plt.xlabel('y_test')
plt.ylabel('y_pred')
plt.show()

In [None]:
# First 100 hold-out targets vs. predictions of the 5-feature KNN model,
# drawn as overlapping semi-transparent bars.
indici = np.arange(100)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(indici, y_test[:100], color='red', alpha=0.5, label='y_test')
ax.bar(indici, y_pred_KNN1[:100], color='blue', alpha=0.5, label='y_pred')
ax.set_xlabel('record')
ax.set_ylabel('datavalue')
ax.set_title('K-Nearest Neighbors: confronto tra y_test e y_pred')
ax.legend()
plt.show()

k = 3 (SelectKBest keeps the 3 best features; the model still uses n_neighbors = 3)

In [None]:
# Feature selection: keep the 3 columns with the highest f_regression score.
# The selector is fitted on the training split only and then applied to both.
selector2 = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)
X_train_selected2 = selector2.transform(X_train)
X_test_selected2 = selector2.transform(X_test)

# Min-max scaling, again fitted on the training split only.
scaler = MinMaxScaler().fit(X_train_selected2)
X_train_scaled_KNN2 = scaler.transform(X_train_selected2)
X_test_scaled_KNN2 = scaler.transform(X_test_selected2)

# The model is trained on log(1 + y) targets.
y_train_scaled_KNN2, y_test_scaled_KNN2 = np.log1p(y_train), np.log1p(y_test)


In [None]:
print("*** K-Nearest Neighbors ***")
# 3 nearest neighbours, weighted by inverse Manhattan distance.
# n_jobs=-1 only parallelises the neighbour search; predictions are unchanged.
Model_KNN2 = KNeighborsRegressor(
    n_neighbors=3,
    weights='distance',
    metric='manhattan',
    n_jobs=-1,
)

Model_KNN2.fit(X_train_scaled_KNN2, y_train_scaled_KNN2)
# The model predicts on the log1p scale; map predictions back with expm1.
y_pred_KNN2 = np.expm1(Model_KNN2.predict(X_test_scaled_KNN2))

# Score against y_test directly instead of rebinding it to
# expm1(log1p(y_test)): the round trip adds nothing and would replace the
# shared hold-out target for every later cell.
MAPE_KNN2 = mean_absolute_percentage_error(y_test, y_pred_KNN2)*100
print('MAE =', mean_absolute_error(y_test, y_pred_KNN2))
print('MAPE =',MAPE_KNN2,'%')

# True vs. predicted values on the hold-out set.
plt.scatter(y_test, y_pred_KNN2)
plt.xlabel('y_test')
plt.ylabel('y_pred')
plt.show()

In [None]:
# First 100 hold-out targets vs. predictions of the 3-feature KNN model,
# drawn as overlapping semi-transparent bars.
indici = np.arange(100)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(indici, y_test[:100], color='red', alpha=0.5, label='y_test')
ax.bar(indici, y_pred_KNN2[:100], color='blue', alpha=0.5, label='y_pred')
ax.set_xlabel('record')
ax.set_ylabel('datavalue')
ax.set_title('K-Nearest Neighbors: confronto tra y_test e y_pred')
ax.legend()
plt.show()

k = 2 (SelectKBest keeps the 2 best features; the model still uses n_neighbors = 3)

In [None]:
# Feature selection: keep the 2 columns with the highest f_regression score.
# The selector is fitted on the training split only and then applied to both.
selector = SelectKBest(score_func=f_regression, k=2).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Min-max scaling, again fitted on the training split only.
scaler = MinMaxScaler().fit(X_train_selected)
X_train_scaled_KNN = scaler.transform(X_train_selected)
X_test_scaled_KNN = scaler.transform(X_test_selected)

# The model is trained on log(1 + y) targets.
y_train_scaled_KNN, y_test_scaled_KNN = np.log1p(y_train), np.log1p(y_test)

In [None]:
print("*** K-Nearest Neighbors ***")
# 3 nearest neighbours, weighted by inverse Manhattan distance.
# n_jobs=-1 only parallelises the neighbour search; predictions are unchanged.
Model_KNN = KNeighborsRegressor(
    n_neighbors=3,
    weights='distance',
    metric='manhattan',
    n_jobs=-1,
)

Model_KNN.fit(X_train_scaled_KNN, y_train_scaled_KNN)
# The model predicts on the log1p scale; map predictions back with expm1.
y_pred_KNN = np.expm1(Model_KNN.predict(X_test_scaled_KNN))

# Score against y_test directly instead of rebinding it to
# expm1(log1p(y_test)): the round trip adds nothing and would replace the
# shared hold-out target for every later cell.
MAPE_KNN = mean_absolute_percentage_error(y_test, y_pred_KNN)*100
print('MAE =', mean_absolute_error(y_test, y_pred_KNN))
print('MAPE =',MAPE_KNN,'%')

# True vs. predicted values on the hold-out set.
plt.scatter(y_test, y_pred_KNN)
plt.xlabel('y_test')
plt.ylabel('y_pred')
plt.show()

In [None]:
# First 100 hold-out targets vs. predictions of the 2-feature KNN model,
# drawn as overlapping semi-transparent bars.
indici = np.arange(100)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(indici, y_test[:100], color='red', alpha=0.5, label='y_test')
ax.bar(indici, y_pred_KNN[:100], color='blue', alpha=0.5, label='y_pred')
ax.set_xlabel('record')
ax.set_ylabel('datavalue')
ax.set_title('K-Nearest Neighbors: confronto tra y_test e y_pred')
ax.legend()
plt.show()

Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# The model is trained on log(1 + y) targets.
y_train_scaled_GB = np.log1p(y_train)
y_test_scaled_GB = np.log1p(y_test)

# Min-max scaling of all features, fitted on the training split only.
scaler = MinMaxScaler()
X_train_scaled_GB = scaler.fit_transform(X_train)
X_test_scaled_GB = scaler.transform(X_test)

# random_state is fixed so that repeated runs build the same ensemble.
Model_GB = GradientBoostingRegressor(
    learning_rate=0.2,
    max_depth=5,
    n_estimators=500,
    random_state=42,
)


In [None]:
print("*** Gradient Boosting ***")
Model_GB.fit(X_train_scaled_GB, y_train_scaled_GB)
# The model predicts on the log1p scale; map predictions back with expm1.
y_pred_GB = np.expm1(Model_GB.predict(X_test_scaled_GB))

# Score against y_test directly instead of rebinding it to
# expm1(log1p(y_test)): the round trip adds nothing and would replace the
# shared hold-out target for every later cell.
MAPE_GB = mean_absolute_percentage_error(y_test, y_pred_GB)*100
print('MAE =', mean_absolute_error(y_test, y_pred_GB))
print('MAPE =',MAPE_GB,'%')

# True vs. predicted values on the hold-out set.
plt.scatter(y_test, y_pred_GB)
plt.xlabel('y_test')
plt.ylabel('y_pred')
plt.show()

In [None]:
# First 100 hold-out targets vs. Gradient Boosting predictions, drawn as
# overlapping semi-transparent bars.
indici = np.arange(100)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(indici, y_test[:100], color='red', alpha=0.5, label='y_test')
ax.bar(indici, y_pred_GB[:100], color='blue', alpha=0.5, label='y_pred')
ax.set_xlabel('record')
ax.set_ylabel('datavalue')
ax.set_title('Gradient Boosting: confronto tra y_test e y_pred')
ax.legend()
plt.show()

In [None]:
# Summary figure: one panel per model showing the first 100 hold-out targets
# against that model's predictions, with the model's MAPE in the legend.
# Each entry is (predictions, legend label, MAPE in percent), top to bottom.
panels = [
    (y_pred_RF, 'Random Forest', MAPE_RF),
    (y_pred_KNN, 'K-Nearest Neighbors k=2', MAPE_KNN),
    (y_pred_GB, 'Gradient Boosting', MAPE_GB),
    (y_pred_KNN2, 'K-Nearest Neighbors k=3', MAPE_KNN2),
    (y_pred_KNN1, 'K-Nearest Neighbors k=5', MAPE_KNN1),
]

# Defined here so the cell does not depend on a plotting cell having run first.
indici = np.arange(100)

fig, axes = plt.subplots(len(panels), 1, figsize=(20, 10))
fig.suptitle('Confronto tra y_test e y_pred', fontsize=14, fontweight='bold')
for ax, (y_pred, label, mape) in zip(axes, panels):
    ax.plot(indici, y_test[:100], label='original')
    ax.plot(indici, y_pred[:100], label=label)
    # Invisible, empty series: only there to put the MAPE into the legend.
    ax.plot([], [], ' ', label=f'MAPE: {mape:.2f}')
    ax.legend()
plt.show()