# Testing new inputs for the prediction

In [3]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
df = pd.read_csv("data/set.csv")
df['DateTime'] = pd.to_datetime(df['DateTime'])
df

Unnamed: 0,PT08.S1(CO),PT08.S2(NMHC),PT08.S3(NOx),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,DateTime,datetimestamp
0,1360.0,1046.0,1056.0,1692.0,1268.0,13.6,48.9,0.7578,2004-03-10 18:00:00,299706
1,1292.0,955.0,1174.0,1559.0,972.0,13.3,47.7,0.7255,2004-03-10 19:00:00,299707
2,1402.0,939.0,1140.0,1555.0,1074.0,11.9,54.0,0.7502,2004-03-10 20:00:00,299708
3,1376.0,948.0,1092.0,1584.0,1203.0,11.0,60.0,0.7867,2004-03-10 21:00:00,299709
4,1272.0,836.0,1205.0,1490.0,1110.0,11.2,59.6,0.7888,2004-03-10 22:00:00,299710
...,...,...,...,...,...,...,...,...,...,...
8440,1314.0,1101.0,539.0,1374.0,1729.0,21.9,29.3,0.7568,2005-04-04 10:00:00,309058
8441,1163.0,1027.0,604.0,1264.0,1269.0,24.3,23.7,0.7119,2005-04-04 11:00:00,309059
8442,1142.0,1063.0,603.0,1241.0,1092.0,26.9,18.3,0.6406,2005-04-04 12:00:00,309060
8443,1003.0,961.0,702.0,1041.0,770.0,28.3,13.5,0.5139,2005-04-04 13:00:00,309061


In [5]:
results = pd.DataFrame(columns=['Model', 'mse', 'mae', 'rmse', 'mape', 'r2']).dropna()
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score


def appendError(model, testy, y_pred):
    # model_name = str(type(model).__name__)
    mse = mean_squared_error(testy, y_pred)
    mae = mean_absolute_error(testy, y_pred)
    rmse = root_mean_squared_error(testy, y_pred)
    mape = mean_absolute_percentage_error(testy, y_pred)
    r2 = r2_score(testy, y_pred)
    return pd.DataFrame([{
        'Model': model,
        'mse': mse, 
        'mae': mae, 
        'rmse': rmse, 
        'mape': mape, 
        'r2': r2
    }])

In [6]:
day_to_number = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6
}
data = df
# data = df.drop('datetimestamp',axis=1)
# data['Year'] = df['DateTime'].dt.year
# data['Month'] = df['DateTime'].dt.month
# data['Day'] = df['DateTime'].dt.day
# data['WeekDay'] = df['DateTime'].dt.day_name()
# data['WeekDay'] = data['WeekDay'].map(day_to_number)
# data['Hour'] = df['DateTime'].dt.hour

In [7]:
targets = data.columns[:5].to_list()
targets

['PT08.S1(CO)', 'PT08.S2(NMHC)', 'PT08.S3(NOx)', 'PT08.S4(NO2)', 'PT08.S5(O3)']

In [8]:
X = data.drop(columns=targets).drop('DateTime', axis=1)
X

Unnamed: 0,T,RH,AH,datetimestamp
0,13.6,48.9,0.7578,299706
1,13.3,47.7,0.7255,299707
2,11.9,54.0,0.7502,299708
3,11.0,60.0,0.7867,299709
4,11.2,59.6,0.7888,299710
...,...,...,...,...
8440,21.9,29.3,0.7568,309058
8441,24.3,23.7,0.7119,309059
8442,26.9,18.3,0.6406,309060
8443,28.3,13.5,0.5139,309061


In [9]:
Y = data[targets]
Y

Unnamed: 0,PT08.S1(CO),PT08.S2(NMHC),PT08.S3(NOx),PT08.S4(NO2),PT08.S5(O3)
0,1360.0,1046.0,1056.0,1692.0,1268.0
1,1292.0,955.0,1174.0,1559.0,972.0
2,1402.0,939.0,1140.0,1555.0,1074.0
3,1376.0,948.0,1092.0,1584.0,1203.0
4,1272.0,836.0,1205.0,1490.0,1110.0
...,...,...,...,...,...
8440,1314.0,1101.0,539.0,1374.0,1729.0
8441,1163.0,1027.0,604.0,1264.0,1269.0
8442,1142.0,1063.0,603.0,1241.0,1092.0
8443,1003.0,961.0,702.0,1041.0,770.0


In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_train:", Y_train.shape)
print("Shape of Y_test:", Y_test.shape)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

Shape of X_train: (5911, 4)
Shape of X_test: (2534, 4)
Shape of Y_train: (5911, 5)
Shape of Y_test: (2534, 5)


In [20]:
specs = ["dart", "exact", 5, 3]
model = xgb.XGBRegressor(booster=specs[0], tree_method=specs[1], max_depth=specs[2], gamma=specs[3])
model.fit(X_train, Y_train.iloc[:,0])

In [21]:
preds = model.predict(X_test)
results = pd.concat([results, appendError('test0', Y_test.iloc[:, 0], preds)])
results

Unnamed: 0,Model,mse,mae,rmse,mape,r2
0,test0,22811.089704,116.315143,151.033406,0.107104,0.406216
0,test0,21124.291213,111.26621,145.34198,0.103907,0.427116


In [65]:
preds = model.predict(X_test)
for i in targets:
    results = pd.concat([results, appendError(i, Y_test.iloc[:, targets.index(i)], preds[:, targets.index(i)])])

  results = pd.concat([results, appendError(i, Y_test.iloc[:, targets.index(i)], preds[:, targets.index(i)])])


In [84]:
results

Unnamed: 0,Model,mse,mae,rmse,mape,r2
0,PT08.S1(CO),9206.765738,71.730525,95.951893,0.064399,0.757414
0,PT08.S2(NMHC),13257.351752,87.304651,115.140574,0.095607,0.765415
0,PT08.S3(NOx),9595.83848,75.043424,97.958351,0.094798,0.785528
0,PT08.S4(NO2),12961.368527,85.322634,113.848006,0.059144,0.874679
0,PT08.S5(O3),37959.476606,148.120061,194.831919,0.160416,0.701995
0,test0,8800.766215,69.933104,93.812399,0.062882,0.768111
0,test0,23220.7234,117.572074,152.383475,0.108488,0.388165
0,test0,23271.960838,117.674216,152.551502,0.108498,0.378653


In [49]:
Y_test.iloc[:,0]

37      1484.0
1617    1059.0
3755     881.0
5225    1461.0
3967     919.0
         ...  
3859     944.0
529     1126.0
10      1236.0
2013    1153.0
7730    1017.0
Name: PT08.S1(CO), Length: 2534, dtype: float64