In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns

In [None]:
# Global matplotlib look-and-feel: start from the ggplot style sheet, then
# apply every individual override in a single batch.
plt.style.use('ggplot')
plt.rcParams.update({
    # Fonts.
    # NOTE(review): the family is 'sans-serif', so the 'font.serif' entry
    # presumably has no visible effect -- confirm which one was intended.
    'font.family': 'sans-serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 14,
    # Text sizes for axes, ticks, legend and figure title.
    'axes.labelsize': 12,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 12,
    # Image defaults.
    'image.cmap': 'jet',
    'image.interpolation': 'none',
    # Figure geometry, grid and line defaults.
    'figure.figsize': (15, 5),
    'axes.grid': True,
    'lines.linewidth': 2,
    'lines.markersize': 8,
})
# Palette of named xkcd colours for plots. Names must match matplotlib's
# 'xkcd:<name>' keys exactly (no space after the colon), otherwise matplotlib
# rejects the colour when it is used.
colors = [
    'xkcd:pale orange',
    'xkcd:sea blue',
    'xkcd:pale red',
    'xkcd:sage green',
    'xkcd:terra cotta',
    'xkcd:dull purple',
    'xkcd:teal',
    'xkcd:goldenrod',
    'xkcd:cadet blue',
    'xkcd:scarlet',
]

In [None]:
# Load the spreadsheet, discard the positional index that reset_index()
# materialises as an "index" column, and index the frame by the parsed
# "time" column.
ts_data = (
    pd.read_excel("pma.xlsx")
    .reset_index()
    .drop(columns="index")
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .set_index("time")
)
ts_data.head()

In [None]:
# Quick look at the raw "pma" series against its datetime index.
plt.plot(ts_data.index, ts_data.loc[:, "pma"])

In [None]:
# Numeric season codes, in order: winter, spring, summer, fall.
seasons = [1, 2, 3, 4]


def map_season(month, day):
    """Return the season code for a calendar month/day.

    Seasons change on the 21st of March, June, September and December:
    a day on or after the 21st of one of those months belongs to the
    season that starts there, earlier days to the one that is ending.

    Parameters
    ----------
    month : int
        Calendar month, 1-12.
    day : int
        Day of the month.

    Returns
    -------
    int
        An entry of ``seasons``: winter, spring, summer or fall. Anything
        that matches none of the first three falls through to fall.
    """
    if month in (1, 2) or (month == 12 and day >= 21) or (month == 3 and day < 21):
        return seasons[0]  # Winter
    if month in (4, 5) or (month == 3 and day >= 21) or (month == 6 and day < 21):
        return seasons[1]  # Spring
    if month in (7, 8) or (month == 6 and day >= 21) or (month == 9 and day < 21):
        return seasons[2]  # Summer
    return seasons[3]  # Fall


# Calendar dates flagged as "special". Parsed once here rather than on every
# call, because is_special_day() is invoked once per row of the data.
# NOTE(review): these dates appear to coincide with the 2016/2017 Eid
# holidays -- confirm with the data owner.
_SPECIAL_DAYS = frozenset(
    pd.to_datetime(['2016-07-06', '2017-06-25', '2016-09-12', '2017-09-01']).date
)


def is_special_day(date):
    """Return 1 if ``date`` falls on one of the special days, else 0.

    Parameters
    ----------
    date : datetime.date, datetime.datetime or pandas.Timestamp
        The day to test. For datetime-like values that carry a time of day,
        only the calendar date is considered.

    Returns
    -------
    int
        1 when the calendar date is in the special-day set, otherwise 0.
    """
    # Fast path: plain datetime.date values hash straight into the set.
    if date in _SPECIAL_DAYS:
        return 1
    # datetime.datetime and pd.Timestamp never compare equal to a plain
    # date, but both expose .date(); use it to compare on the calendar day.
    to_date = getattr(date, "date", None)
    return int(callable(to_date) and to_date() in _SPECIAL_DAYS)


def create_features(df):
    """
    Create time series features based on the time series index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is a ``DatetimeIndex``; the index is the only
        input used to derive the features. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns appended, in order: ``hour``,
        ``dayofweek``, ``quarter``, ``month``, ``year``, ``dayofyear``,
        ``dayofmonth``, ``is_special_day``, ``season``, ``weekofyear``.
    """
    df = df.copy()
    idx = df.index

    # Plain calendar components read straight off the index.
    df['hour'] = idx.hour
    df['dayofweek'] = idx.dayofweek
    df['quarter'] = idx.quarter
    df['month'] = idx.month
    df['year'] = idx.year
    df['dayofyear'] = idx.dayofyear
    df['dayofmonth'] = idx.day

    # Derived flags are computed from the index values directly instead of a
    # row-wise DataFrame.apply: that builds a Series per row (slow), does not
    # return a Series for an empty frame, and needed a temporary "date"
    # column that overwrote any existing column of that name.
    df["is_special_day"] = np.array(
        [is_special_day(day) for day in idx.date], dtype=np.int64
    )
    df['season'] = np.array(
        [map_season(month, day) for month, day in zip(idx.month, idx.day)],
        dtype=np.int64,
    )

    # ISO-8601 week number.
    df['weekofyear'] = idx.isocalendar().week
    return df

# Build the modelling frame: the raw series plus the calendar features.
df = ts_data.pipe(create_features)

df

In [None]:
# Distribution of the target for each hour of the day.
sns.boxplot(x='hour', y='pma', data=df)

In [None]:
# Distribution of the target for each calendar month.
sns.boxplot(x='month', y='pma', data=df)

In [None]:
# Distribution of the target for each season code.
sns.boxplot(x='season', y='pma', data=df)

In [None]:
# Distribution of the target on special days (1) versus ordinary days (0).
sns.boxplot(x='is_special_day', y='pma', data=df)

# Model creation

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Model inputs and the column to predict.
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year', "season", "is_special_day", "tmp"]
TARGET = 'pma'

# Chronological split point, written in ISO format. The previous literal
# "01-06-2017" was parsed month-first by pandas, i.e. as 6 January 2017;
# this keeps that cut-off but makes it unambiguous.
# NOTE(review): if 1 June 2017 was intended, change this to "2017-06-01".
split_date = "2017-01-06"

# Copy the partitions so that later cells can add columns to them without
# writing into slices of `df` (SettingWithCopyWarning).
train = df.loc[df.index < split_date].copy()
test = df.loc[df.index >= split_date].copy()

X_train = train[FEATURES]
y_train = train[TARGET]

X_test = test[FEATURES]
y_test = test[TARGET]

# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
# NOTE(review): early stopping monitors the last eval_set entry, which here
# is the test set, so the test score reported afterwards is optimistic.
# Consider holding out a separate validation window from `train`.
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=5000,
                       early_stopping_rounds=50,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)

In [None]:
# One row per model input, holding the importance reported by the fitted
# regressor; drawn as a horizontal bar chart sorted ascending.
fi = pd.DataFrame(
    {'importance': reg.feature_importances_},
    index=reg.feature_names_in_,
)
fi.sort_values(by='importance').plot.barh(title='Feature Importance')
plt.show()

In [None]:
# Attach predictions with assign() so a new frame is produced instead of
# writing a column onto what may be a slice of `df`.
test = test.assign(prediction=reg.predict(X_test))

# Drop any 'prediction' column left from a previous run of this cell before
# merging; otherwise re-running would yield prediction_x / prediction_y.
df = (
    df.drop(columns='prediction', errors='ignore')
      .merge(test[['prediction']], how='left', left_index=True, right_index=True)
)

# Full observed series as a line, test-period predictions as dots on top.
ax = df[['pma']].plot(figsize=(15, 5))
test['prediction'].plot(ax=ax, style='.')
plt.legend(['Truth Data', 'Predictions'])
ax.set_title('Raw Data and Predictions')
plt.show()

In [None]:
# Root-mean-squared error of the predictions over the test period.
test_mse = mean_squared_error(y_true=test['pma'], y_pred=test['prediction'])
score = np.sqrt(test_mse)
print(f'RMSE Score on Test set: {score:0.2f}')

In [None]:
# Add the absolute error and the calendar date of each row. assign() returns
# a new frame, so nothing is written onto a possible slice of `df`.
test = test.assign(
    error=(test[TARGET] - test['prediction']).abs(),
    date=test.index.date,
)

# The five calendar days with the largest mean absolute error.
test.groupby(['date'])['error'].mean().sort_values(ascending=False).head(5)