<p style="padding: 10px;
          font-size:50px;
          font-weight: bolder;
          color: rgba(124, 203, 234, 0.75)">
    Laboratory Task 1
</p>

<a name="000" style="color: black">
    <p style="padding: 10px;
              font-size:25px;
              font-weight: bolder">
        Table of Contents
    </p>
</a>

<div style="padding: 20px;
            margin-top: 20px;
            color: black;
            background: rgba(124,203,234,0.25)">
    <ol>
        <li><a href="#001">Option 1: Regression Model</a></li>
        <li><a href="#002">Option 2: Time Series Model</a></li>
    </ol>
</div>

In [1]:
# !pip install eli5 xgboost statsmodels

In [2]:
import datetime
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
import pickle
import eli5

from sklearn.utils import shuffle
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

warnings.filterwarnings(action="ignore")
sns.set_theme(style="whitegrid", palette="pastel")
%matplotlib inline

<blockquote style="border-color: rgba(124,203,234,0.25)">
    <a name="001" style="color: black">
        <p style="padding: 10px;
              font-size:25px;
              font-weight: bolder">
            Option 1: Regression Model
        </p>
    </a>
</blockquote>

In [3]:
train_data = pd.read_csv("src\\reg_train.csv",
    dtype={
        "lon": np.float16, "lat": np.float16, "sum": np.int16,
        "year": np.int16, "month": np.int16, "day": np.int16,
        "hour": np.int16, "weekday": np.int16, "weekend": np.int16,
        "holiday": np.int16, "point_id": np.int16
    })

test_data = pd.read_csv("src\\reg_test.csv",
    dtype={
        "lon": np.float16, "lat": np.float16, "sum": np.int16,
        "year": np.int16, "month": np.int16, "day": np.int16,
        "hour": np.int16, "weekday": np.int16, "weekend": np.int16,
        "holiday": np.int16, "point_id": np.int16, "error": np.float16
    })

valid_data = pd.read_csv("src\\reg_valid.csv",
    dtype={
        "lon": np.float16, "lat": np.float16, "sum": np.int16,
        "year": np.int16, "month": np.int16, "day": np.int16,
        "hour": np.int16, "weekday": np.int16, "weekend": np.int16,
        "holiday": np.int16, "point_id": np.int16, "error": np.float16
    })

train_data.head()

Unnamed: 0,lon,lat,sum,year,month,day,hour,weekday,weekend,holiday,point_id
0,30.25,60.03125,1,2019,1,1,0,1,0,1,0
1,30.25,59.84375,1,2019,1,1,0,1,0,1,1
2,30.234375,59.96875,1,2019,1,1,0,1,0,1,2
3,30.234375,59.84375,2,2019,1,1,0,1,0,1,3
4,30.234375,60.03125,1,2019,1,1,0,1,0,1,4


In [4]:
X_train = train_data.drop(["sum"], axis=1)
print(f"Dataset shape: {X_train.shape}")
y_train = train_data[["sum"]]

X_test = test_data.drop(["sum", "error"], axis=1)
X_test = X_test.loc[:, X_train.columns]
print(f"Dataset shape: {X_test.shape}")
y_test = test_data[["sum"]]

X_valid = valid_data.drop(["sum", "error"], axis=1)
X_valid = X_valid.loc[:, X_train.columns]
print(f"Dataset shape: {X_valid.shape}")
y_valid = valid_data[["sum"]]

Dataset shape: (3623620, 10)
Dataset shape: (700, 10)
Dataset shape: (686, 10)


In [5]:
X_train, y_train = shuffle(X_train, y_train)
X_test, y_test = shuffle(X_test, y_test)
X_valid, y_valid = shuffle(X_valid, y_valid)

In [6]:
xgb = XGBRegressor(n_jobs=-1)
xgb_stat = xgb.fit(X_train, y_train)
xgb_stat.get_params

<bound method XGBModel.get_params of XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)>

In [7]:
prediction = xgb_stat.predict(X_test)

print(mean_absolute_error(prediction, y_test))
print(mean_squared_error(prediction, y_test))
print(r2_score(prediction, y_test))

4.374427
44.022705
0.044087573367549426


In [8]:
prediction = xgb_stat.predict(X_valid)

print(mean_absolute_error(prediction, y_valid))
print(mean_squared_error(prediction, y_valid))
print(r2_score(prediction, y_valid))

4.4864936
40.546734
0.22247252372041437


In [None]:
xgb = XGBRegressor(n_jobs=-1)

xgb_param_grid = {
    "max_depth": [10, 20],
    "learning_rate": [0.15, 0.5],
    "max_leaves": [10, 20],
    "n_estimators": [200, 300]
}

xgb_regressor = GridSearchCV(estimator=xgb, param_grid=xgb_param_grid, scoring="neg_mean_absolute_error", cv=2, verbose=True)
best_xgb = xgb_regressor.fit(X_train, y_train)
best_xgb.best_params_

Fitting 2 folds for each of 16 candidates, totalling 32 fits


In [None]:
prediction = best_xgb.predict(X_test)

print(mean_absolute_error(prediction, y_test))
print(mean_squared_error(prediction, y_test))
print(r2_score(prediction, y_test))

In [None]:
prediction = best_xgb.predict(X_valid)

print(mean_absolute_error(prediction, y_valid))
print(mean_squared_error(prediction, y_valid))
print(r2_score(prediction, y_valid))

In [None]:
with open("model.pkl", "wb") as fp:
    pickle.dump(best_xgb, fp)

In [None]:
importance = PermutationImportance(best_xgb, random_state=42).fit(X_valid, y_valid)
eli5.show_weights(importance, feature_names=X_valid.columns.tolist())

<blockquote style="border-color: rgba(124,203,234,0.25)">
    <a name="002" style="color: black">
        <p style="padding: 10px;
              font-size:25px;
              font-weight: bolder">
            Option 2: Time Series Model
        </p>
    </a>
</blockquote>

In [None]:
train_data = pd.read_csv("time_train.csv", parse_dates=["datetime"], index_col=["datetime"], encoding="cp1251")
test_data = pd.read_csv("time_test.csv", parse_dates=["datetime"], index_col=["datetime"], encoding="cp1251")
valid_data = pd.read_csv("time_valid.csv", parse_dates=["datetime"], index_col=["datetime"], encoding="cp1251")

train_data.head()

In [None]:
arima_model = ARIMA(train_data["sum"])
arima_trained = arima_model.fit()

In [None]:
forecast = result.predict(
    start= len(train_data),
    end = (len(train_data)-1)+200,
    typ ="levels"
).rename("forecast")