# Learn Time Series Basics with a Machine Learning Approach

## Part I

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the London energy dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/london-homes-energy-data/london_energy.csv')
# Preview the first three rows.
df.head(3)

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Collapse the raw readings into one row per 'Date' value, holding the mean
# 'KWH' of all rows that share that date.
daily_mean = df.groupby('Date')['KWH'].mean()
df_avg_consumption = (
    daily_mean
    .rename('consumption')   # value column
    .rename_axis('date')     # group key becomes the 'date' column
    .reset_index()
)
# Parse the date column so datetime accessors and date comparisons work on it.
df_avg_consumption['date'] = pd.to_datetime(df_avg_consumption['date'])
df_avg_consumption

In [None]:
# Earliest and latest date covered by the aggregated series.
df_avg_consumption['date'].min(), df_avg_consumption['date'].max()

In [None]:
# Plot average consumption over the whole date range.
df_avg_consumption.plot(x='date',y='consumption')

In [None]:
# Zoom in on the dates strictly between 2012-01-01 and 2013-01-01.
df_avg_consumption.query("date > '2012-01-01' & date < '2013-01-01'").plot(x="date", y="consumption")

In [None]:
# Plain calendar features extracted from the date column.
df_avg_consumption['day_of_week'] = df_avg_consumption['date'].dt.dayofweek
df_avg_consumption['day_of_year'] = df_avg_consumption['date'].dt.dayofyear
df_avg_consumption['month'] = df_avg_consumption['date'].dt.month
df_avg_consumption['quarter'] = df_avg_consumption['date'].dt.quarter
df_avg_consumption['year'] = df_avg_consumption['date'].dt.year

# Use a sin/cos representation for the cyclic features so that the end of a
# cycle is encoded next to its start.

# Sin and cos representation of the day of the week (0-6), period 7.
df_avg_consumption['day_of_week_sin'] = np.sin(2 * np.pi * df_avg_consumption['day_of_week'] / 7)
df_avg_consumption['day_of_week_cos'] = np.cos(2 * np.pi * df_avg_consumption['day_of_week'] / 7)

# Sin and cos representation of the day of the year (1-365/366).
# The period is the real length of each row's year: with a fixed 365,
# day 366 of a leap year would get exactly the same angle as day 1.
days_in_year = np.where(df_avg_consumption['date'].dt.is_leap_year, 366, 365)
df_avg_consumption['day_of_year_sin'] = np.sin(2 * np.pi * df_avg_consumption['day_of_year'] / days_in_year)
df_avg_consumption['day_of_year_cos'] = np.cos(2 * np.pi * df_avg_consumption['day_of_year'] / days_in_year)

# Sin and cos representation of the month (1-12), period 12.
df_avg_consumption['month_sin'] = np.sin(2 * np.pi * df_avg_consumption['month'] / 12)
df_avg_consumption['month_cos'] = np.cos(2 * np.pi * df_avg_consumption['month'] / 12)

# Sin and cos representation of the quarter (1-4), period 4.
df_avg_consumption['quarter_sin'] = np.sin(2 * np.pi * df_avg_consumption['quarter'] / 4)
df_avg_consumption['quarter_cos'] = np.cos(2 * np.pi * df_avg_consumption['quarter'] / 4)

df_avg_consumption.head(3)

In [None]:
# Chronological hold-out: rows before the split date train the model,
# rows on or after it are kept for testing.
split_date = "2013-07-28"
row_dates = df_avg_consumption["date"]

training_mask = row_dates < split_date
testing_mask = row_dates >= split_date

training_data = df_avg_consumption[training_mask]
testing_data = df_avg_consumption[testing_mask]

# Show how many rows ended up on each side of the split.
training_data.shape, testing_data.shape

In [None]:
# Display the training partition.
training_data

In [None]:
# Display the testing partition.
testing_data

In [None]:
# Draw the training and testing consumption series on one shared axis
# to visualise where the split falls.
fig, ax = plt.subplots(figsize=(20,5))
training_data.plot(ax=ax, label='Training', x='date', y='consumption')
testing_data.plot(ax=ax, label='Testing', x='date', y='consumption')

In [None]:
# Keep the test dates aside (they are needed for plotting later), then drop
# the raw date column from both partitions.
testing_dates = testing_data['date']
training_data = training_data.drop(columns=['date'])
testing_data = testing_data.drop(columns=['date'])

# Build X_train and X_test from the new sin/cos features.
# One shared list guarantees both partitions use the same columns in the same order.
feature_columns = [
    'day_of_week_sin', 'day_of_week_cos',
    'day_of_year_sin', 'day_of_year_cos',
    'month_sin', 'month_cos',
    'quarter_sin', 'quarter_cos',
    'year',  # kept in case there is a year-over-year trend
]

X_train = training_data[feature_columns]
y_train = training_data['consumption']

X_test = testing_data[feature_columns]
y_test = testing_data['consumption']

In [None]:
# Check that training features and target have the same number of rows.
X_train.shape, y_train.shape

In [None]:
# Check that testing features and target have the same number of rows.
X_test.shape, y_test.shape

In [None]:
# Inspect the training feature matrix.
X_train

In [None]:
# Inspect the training target.
y_train

In [None]:
# Inspect the testing feature matrix.
X_test

In [None]:
# Inspect the testing target.
y_test

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

def evaluate_model(y_test, prediction):
    """Print three regression error metrics for a set of predictions.

    Args:
        y_test: Ground-truth target values.
        prediction: Predicted values, aligned with ``y_test``.

    Prints MAE, MAPE and MSE (in that order), one per line. Returns nothing.
    """
    mae = mean_absolute_error(y_test, prediction)
    print(f"MAE: {mae}")

    mape = mean_absolute_percentage_error(y_test, prediction)
    print(f"MAPE: {mape}")

    mse = mean_squared_error(y_test, prediction)
    print(f"MSE: {mse}")
    
def plot_predictions(testing_dates, y_test, prediction):
    """Plot actual versus predicted values against date on one figure.

    Args:
        testing_dates: Dates used for the x axis.
        y_test: Actual target values.
        prediction: Predicted values, one per entry of ``y_test``.

    Shows the figure with ``plt.show()``. Returns nothing.
    """
    comparison = pd.DataFrame(
        {'date': testing_dates, 'actual': y_test, 'prediction': prediction}
    )
    _, axis = plt.subplots(figsize=(10, 5))
    # Draw both series on the same axis, actual values first.
    for column, series_label in (('actual', 'Actual'), ('prediction', 'Prediction')):
        comparison.plot(ax=axis, label=series_label, x='date', y=column)
    plt.legend(['Actual', 'Prediction'])
    plt.show()

In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

# Time-ordered cross-validation: 4 splits, each validating on 100 rows.
cv_split = TimeSeriesSplit(n_splits=4, test_size=100)

# Hyper-parameter grid: 6 * 5 * 6 * 3 = 540 combinations, each fitted once per split.
parameters = {
    'max_depth': [3, 4, 5, 6, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'n_estimators': [100, 300, 500, 700, 900, 1000],
    'colsample_bytree': [0.3, 0.5, 0.7],
}
xgb_model = XGBRegressor()

# No `scoring` is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(xgb_model, param_grid=parameters, cv=cv_split)
grid_search.fit(X_train, y_train)

In [None]:
# Predict the held-out period with the fitted grid search, then plot the
# predictions against the actual values and print the error metrics.
prediction = grid_search.predict(X_test)
plot_predictions(testing_dates, y_test, prediction)
evaluate_model(y_test, prediction)

In [None]:
# cv_split = TimeSeriesSplit(n_splits=4, test_size=100)

# lgbm_model = LGBMRegressor()
# parameters = {
#     'max_depth': [3,4,5,6,7,10],
#     'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
#     'n_estimators': [100,300,500,700,900,1000],
#     'colsample_bytree': [0.3, 0.5, 0.7]
# }

# grid_search = GridSearchCV(estimator=lgbm_model, cv=cv_split, param_grid=parameters)
# grid_search.fit(X_train, y_train)

In [None]:
# prediction = grid_search.predict(X_test)
# plot_predictions(testing_dates, y_test, prediction)
# evaluate_model(y_test, prediction)

In [None]:
# Load the London weather dataset from the Kaggle input directory.
df_weather = pd.read_csv('/kaggle/input/london-weather/london_weather.csv')
# Preview the first three rows.
df_weather.head(3)

In [None]:
# Print column names, dtypes and non-null counts of the weather data.
df_weather.info()

In [None]:
# Count missing values per weather column.
df_weather.isna().sum()

In [None]:
# Fill each missing value with the last valid value above it in the same column.
# NOTE(review): forward fill cannot fill gaps in the very first rows — the
# count below shows whether any remain.
df_weather = df_weather.ffill()
df_weather.isna().sum()

In [None]:
# Parse the weather dates (given in YYYYMMDD form) so they match the
# datetime 'date' column of the consumption frame.
weather_dates = pd.to_datetime(df_weather['date'], format='%Y%m%d')
df_weather['date'] = weather_dates

# Inner join on date: only days present in both frames are kept.
df_avg_consumption = df_avg_consumption.merge(
    right=df_weather,
    on='date',
    how='inner',
)
df_avg_consumption

In [None]:
# Redo the chronological hold-out on the weather-enriched frame, using the
# same split date: earlier rows train the model, the rest are for testing.
split_date = "2013-07-28"
row_dates = df_avg_consumption["date"]

training_mask = row_dates < split_date
testing_mask = row_dates >= split_date

training_data = df_avg_consumption[training_mask]
testing_data = df_avg_consumption[testing_mask]

# Show how many rows ended up on each side of the split.
training_data.shape, testing_data.shape

In [None]:
# Keep the test dates aside (they are needed for plotting later), then drop
# the raw date column from both partitions.
testing_dates = testing_data["date"]
training_data = training_data.drop(columns=["date"])
testing_data = testing_data.drop(columns=["date"])

# Build X_train and X_test from the sin/cos features plus the weather variables.
# One shared list guarantees both partitions use the same columns in the same order.
feature_columns = [
    'day_of_week_sin', 'day_of_week_cos',
    'day_of_year_sin', 'day_of_year_cos',
    'month_sin', 'month_cos',
    'quarter_sin', 'quarter_cos',
    'year',  # still included to capture a long-term trend
    'cloud_cover', 'sunshine', 'global_radiation', 'max_temp',
    'mean_temp', 'min_temp', 'precipitation', 'pressure',
    'snow_depth',
]

X_train = training_data[feature_columns]
y_train = training_data['consumption']

X_test = testing_data[feature_columns]
y_test = testing_data['consumption']

In [None]:
# Same hyper-parameter grid as before (6 * 5 * 6 * 3 = 540 combinations),
# now searched on the weather-enriched features. `cv_split` is reused from
# the earlier cell.
parameters = {
    'max_depth': [3, 4, 5, 6, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'n_estimators': [100, 300, 500, 700, 900, 1000],
    'colsample_bytree': [0.3, 0.5, 0.7],
}
xgb_model = XGBRegressor()

# No `scoring` is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(xgb_model, param_grid=parameters, cv=cv_split)
grid_search.fit(X_train, y_train)

# Show the best result.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

In [None]:
# Predict the held-out period with the fitted grid search, then plot the
# predictions against the actual values and print the error metrics.
prediction = grid_search.predict(X_test)
plot_predictions(testing_dates, y_test, prediction)
evaluate_model(y_test, prediction)

In [None]:
# Mendapatkan model terbaik dari grid search
best_model = grid_search.best_estimator_
feature_importance = best_model.feature_importances_
feature = [
    'day_of_week_sin', 'day_of_week_cos',  # Sin dan cos untuk hari dalam seminggu
    'day_of_year_sin', 'day_of_year_cos',  # Sin dan cos untuk hari dalam setahun
    'month_sin', 'month_cos',              # Sin dan cos untuk bulan
    'quarter_sin', 'quarter_cos',          # Sin dan cos untuk kuartal
    'year',                                # Tetap menyertakan tahun untuk tren
    'cloud_cover', 'sunshine', 'global_radiation', 'max_temp', 
    'mean_temp', 'min_temp', 'precipitation', 'pressure',
    'snow_depth'                           # Variabel cuaca
]

importance_df = pd.DataFrame({
    'Features': feature,
    'Importance': feature_importance
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x='Importance', y='Features', data=importance_df, palette='viridis')
plt.title('Feature Importance')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()

In [None]:
### Before SinCos
# MAE: 0.41222399123657294
# MAPE: 0.16246850639888177
# MSE: 0.8432288584575132

### After SinCos
# MAE: 0.44123818524317965
# MAPE: 0.1644001320544813
# MSE: 0.864698398183634

## Part II