# Using XGBoost Regressor

In [9]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_log_error, mean_squared_error, root_mean_squared_error
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("../data/processed/processed.csv")

In [3]:
data.head()

Unnamed: 0,Date,AQI,Year,Month,Day,temperature_2m_max,temperature_2m_min,precipitation_sum,weather_code,sunshine_duration,cloud_cover_mean,wind_speed_10m_mean,winddirection_10m_dominant
0,2018-01-01,75.0,2018,1,1,28.253,14.503,0.0,0.0,36062.707,0.0,5.679092,270.6138
1,2018-01-02,76.0,2018,1,2,28.453,12.903,0.0,1.0,36040.188,7.166666,4.744073,278.32556
2,2018-01-03,79.0,2018,1,3,28.903,15.303,0.0,0.0,36101.605,0.083333,5.611268,285.8839
3,2018-01-04,84.0,2018,1,4,28.353,15.303,0.0,0.0,36083.652,1.541667,5.784296,326.56494
4,2018-01-05,97.0,2018,1,5,28.453,14.003,0.0,3.0,36066.5,46.125,5.50617,287.77652


In [4]:
# Selecting Features for Training

features = ["Year", "Month", "temperature_2m_max", "temperature_2m_min",
            "precipitation_sum", "weather_code", "sunshine_duration", "cloud_cover_mean",
           "wind_speed_10m_mean", "winddirection_10m_dominant"]

X = data[features]
y = data["AQI"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
model = XGBRegressor()

In [11]:
model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [12]:
y_preds = model.predict(X_test)

In [13]:
y_preds

array([149.94308 ,  80.713196,  85.32212 , 157.50095 , 182.23952 ,
       106.30837 ,  69.228806, 122.65601 ,  65.304924, 125.99004 ,
        62.489006,  71.05668 ,  64.04127 ,  69.7127  , 177.5548  ,
        70.80141 , 101.184235, 109.84165 ,  93.45717 , 139.96027 ,
       128.99544 , 102.36602 ,  48.133263, 161.44135 ,  51.603306,
        62.081852,  69.75304 , 114.72441 , 166.12314 ,  62.324642,
       104.18639 , 160.57364 ,  56.848717,  49.449028, 107.186615,
        85.11231 ,  70.617676,  68.396835,  56.046005,  61.02686 ,
       102.06045 ,  85.19995 ,  83.887794,  50.1878  ,  64.64578 ,
       104.47211 ,  74.70453 , 180.52866 ,  53.310833,  69.309006,
        57.922592, 158.75667 ,  73.55866 , 140.65143 , 143.16652 ,
       147.86456 ,  54.88835 ,  62.143772, 109.55695 ,  52.177   ,
       107.04027 , 114.72556 ,  71.55941 , 105.89956 ,  76.55203 ,
        95.29864 , 131.8773  ,  55.943085,  81.70674 ,  62.68496 ,
       100.27127 ,  55.55646 , 121.28637 , 147.08    ,  62.908

In [16]:
print("R2:", r2_score(y_test, y_preds))
print("MAE:", mean_absolute_error(y_test, y_preds))
print("MSE:", mean_squared_error(y_test, y_preds))
print("RMSE:", root_mean_squared_error(y_test, y_preds))
print("RMSLE:", root_mean_squared_log_error(y_test, y_preds))

R2: 0.591691504171387
MAE: 16.859599351882935
MSE: 736.195768790842
RMSE: 27.132927759289853
RMSLE: 0.2229397185330592


In [20]:
# Save the model
import pickle

with open("../models/XGBoost-model.pkl", "wb") as f:
    pickle.dump(model, f)