In [None]:
import xgboost as xgb
import optuna
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import itertools
import numpy as np
import random
import statsmodels.api as sm
# time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by pandas and the other libraries used below.
warnings.filterwarnings("ignore")
# Use matplotlib's 'fivethirtyeight' style sheet for all subsequent figures.
plt.style.use('fivethirtyeight')

In [None]:
def iqr(df,degisken):
    """Find the rows of ``df`` whose ``degisken`` value is an IQR outlier.

    The fences are placed 1.5 interquartile ranges below the first quartile
    and 1.5 interquartile ranges above the third quartile.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    degisken : str
        Name of the column to test.

    Returns
    -------
    tuple
        ``(outlier_rows, lower_t, upper_t)`` where ``outlier_rows`` holds the
        rows strictly below ``lower_t`` or strictly above ``upper_t``.
    """
    values = df[degisken]

    # First and third quartiles of the column.
    first_quartile = np.quantile(values, 0.25)
    third_quartile = np.quantile(values, 0.75)

    # Width of the middle 50% of the data.
    spread = third_quartile - first_quartile

    # Lower and upper fences.
    lower_t = first_quartile - (1.5 * spread)
    upper_t = third_quartile + (1.5 * spread)

    # Keep only the rows lying outside the fences.
    outside_fences = (values < lower_t) | (values > upper_t)
    return df[outside_fences], lower_t, upper_t

In [None]:
# Read one Excel workbook per city; the files are read in the order listed.
df_batman, df_diyarbakir, df_mardin, df_siirt, df_sirnak, df_urfa = map(
    pd.read_excel,
    [
        'datas/data_batman.xlsx',
        'datas/data_diyarbakir.xlsx',
        'datas/data_mardin.xlsx',
        'datas/data_siirt.xlsx',
        'datas/data_sirnak.xlsx',
        'datas/data_urfa.xlsx',
    ],
)

In [None]:
# Stack the six per-city frames vertically and parse 'ds' into datetimes.
df_all = pd.concat(
    [df_batman, df_diyarbakir, df_mardin, df_siirt, df_sirnak, df_urfa]
).assign(ds=lambda frame: pd.to_datetime(frame['ds']))
df_all

In [None]:
# Remove the 'time' column from df_all in place.
df_all.drop(columns=['time'], inplace=True)

In [None]:
# Display the combined frame after the 'time' column has been dropped.
df_all

In [None]:
# Every column except the target 'y'. The list still contains 'ds' (the
# grouping key) and 'il' (the city label).
other_cols = list(df_all.columns)
other_cols.remove('y')

# Collect one daily frame per city and concatenate once at the end instead of
# growing a DataFrame inside the loop.
city_frames = []

for city in ['Batman', 'Diyarbakir', 'Mardin', 'Siirt', 'Sirnak', 'Urfa']:

    # Boolean indexing already returns a new frame, so df_all does not need
    # to be copied on every iteration.
    df = df_all[df_all['il'] == city]

    # Target: total per calendar day.
    df_y = df[['ds', 'y']].groupby(pd.Grouper(freq='D', key='ds')).sum().reset_index()

    # Remaining columns: daily mean. numeric_only=True leaves out the string
    # column 'il', which pandas >= 2.0 would otherwise refuse to average
    # (older versions dropped it silently).
    df_other = (
        df[other_cols]
        .groupby(pd.Grouper(freq='D', key='ds'))
        .mean(numeric_only=True)
        .reset_index()
        .drop(columns='ds')
    )

    # Both pieces are grouped on the same daily bins of the same rows, so
    # they line up row by row.
    df_last = pd.concat([df_y, df_other], axis = 1)
    df_last['il'] = city

    city_frames.append(df_last)

df_downsized = pd.concat(city_frames, ignore_index=True)

In [None]:
# Collapse the per-city daily rows into one row per day:
# 'y' is summed across cities, every other numeric column is averaged.
df_last_y = df_downsized[['ds', 'y']].groupby(pd.Grouper(freq='D', key='ds')).sum().reset_index()

# numeric_only=True skips the string column 'il'; pandas >= 2.0 raises when
# asked to average it, while older versions dropped it silently.
df_last_other = (
    df_downsized[other_cols]
    .groupby(pd.Grouper(freq = 'D', key='ds'))
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns='ds')
)

In [None]:
# Put the daily target (with 'ds') and the daily feature means side by side.
df_final = pd.concat(
    [df_last_y, df_last_other],
    axis='columns',
)
df_final

In [None]:
def pct_change(y_test, y_pred):
    """Mean absolute relative difference between actuals and predictions.

    For each row the change from ``y_pred`` to ``y_test`` is computed with
    ``DataFrame.pct_change(axis=1)``, i.e. ``(y_test - y_pred) / y_pred``;
    the absolute values are then averaged.

    NOTE(review): the denominator is the prediction, not the actual value,
    so this is not the textbook MAPE — confirm that this is intended.

    Parameters
    ----------
    y_test : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_test``.

    Returns
    -------
    float
        The averaged absolute relative difference.
    """
    pairs = pd.DataFrame(columns = ['y_test', 'y_pred'])
    pairs['y_test'] = y_test
    pairs['y_pred'] = y_pred

    # Column order matters: the change is measured from y_pred to y_test.
    relative_diff = pairs[['y_pred', 'y_test']].pct_change(axis=1)['y_test']
    return float(relative_diff.abs().mean())

In [None]:
# Repeatedly replace IQR outliers in 'y' with the column mean until a pass
# finds none. Replacing values shifts the quartiles, so new outliers can
# appear after each pass.
while True:
    TuketilenEnerjiOutlier,lowerThresh,upperThresh = iqr(df_final, "y")
    print("Çeyrekler Açıklığı  (IQR) Yöntemine Göre: ") 
    print(f"TUKETILEN_ENERJI_KWH Değişkeninde Bulunan Aykırı Değer Miktarı: {len(TuketilenEnerjiOutlier)}")
    print(f"Lower Thresh Değeri: {lowerThresh} | Upper Thresh Değeri: {upperThresh}") 
    if len(TuketilenEnerjiOutlier) <= 0:
        break
    # Assign the masked column back explicitly. Calling
    # df_final['y'].mask(..., inplace=True) mutates a temporary Series; under
    # pandas Copy-on-Write df_final is never updated and this loop would
    # never terminate.
    # The mean is recomputed between the two steps, matching the original
    # order of evaluation.
    df_final['y'] = df_final['y'].mask(df_final['y'] > upperThresh, df_final['y'].mean())
    df_final['y'] = df_final['y'].mask(df_final['y'] < lowerThresh, df_final['y'].mean())

In [None]:
def create_features(df, label=None, cols_keep = None):
    """
    Create calendar features from a frame's datetime index.

    The columns ``date`` (a copy of the index), ``hour``, ``dayofweek``,
    ``quarter``, ``month``, ``year``, ``dayofyear``, ``dayofmonth`` and
    ``weekofyear`` are added to ``df`` itself; the requested feature columns
    are then returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index. It is modified in place.
    label : str, optional
        Name of the target column. When given (and truthy) the target is
        returned alongside the features.
    cols_keep : list of str, optional
        Extra columns of ``df`` to place in front of the calendar features.
        The list passed in is not modified.

    Returns
    -------
    X : pandas.DataFrame
        ``cols_keep`` (if any) followed by the eight calendar features.
    y : pandas.Series
        Only returned when ``label`` is given: ``df[label]``.
    """
    calendar_cols = ['hour','dayofweek','quarter','month','year',
                     'dayofyear','dayofmonth','weekofyear']

    df['date'] = df.index
    stamps = df['date'].dt
    df['hour'] = stamps.hour
    df['dayofweek'] = stamps.dayofweek
    df['quarter'] = stamps.quarter
    df['month'] = stamps.month
    df['year'] = stamps.year
    df['dayofyear'] = stamps.dayofyear
    df['dayofmonth'] = stamps.day

    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is
    # its replacement. It returns a nullable UInt32 column, so convert it to
    # the plain dtype the removed accessor produced: int64 normally, float64
    # (with NaN) when the index contains missing timestamps.
    iso_week = stamps.isocalendar().week
    week_dtype = 'float64' if iso_week.isna().any() else 'int64'
    df['weekofyear'] = iso_week.astype(week_dtype)

    if cols_keep is None:
        X = df[calendar_cols]
    else:
        # Build a new list so the caller's cols_keep is left untouched.
        X = df[cols_keep + calendar_cols]

    if label:
        y = df[label]
        return X, y
    return X

In [None]:
# Columns that are removed from df_final before modelling.
unused_weather_cols = [
    'weathercode (wmo code)',
    'cloudcover_low (%)',
    'cloudcover_mid (%)',
    'cloudcover_high (%)',
    'et0_fao_evapotranspiration (mm)',
    'winddirection_10m (°)',
    'winddirection_100m (°)',
    'windgusts_10m (km/h)',
    'soil_temperature_100_to_255cm (°C)',
    'soil_moisture_100_to_255cm (m³/m³)',
]
df_final = df_final.drop(columns=unused_weather_cols).reset_index(drop=True)


In [None]:
# Model inputs: every df_final column except the target and the timestamp.
col_list = df_final.columns.tolist()
for excluded in ('y', 'ds'):
    col_list.remove(excluded)

In [None]:
# Chronological split: rows dated before split_date train the model, rows on
# or after it are held out for testing.
split_date = '2023-06-01'
before_split = df_final['ds'] < split_date
from_split_on = df_final['ds'] >= split_date

df_train = df_final.loc[before_split].reset_index(drop=True)
df_test = df_final.loc[from_split_on].reset_index(drop=True)

# create_features needs 'ds' as the index; set_index returns a new frame, so
# df_train / df_test themselves are not given the calendar columns.
X_train, y_train = create_features(
    df_train.set_index('ds'), label='y', cols_keep=col_list
)
X_test, y_test = create_features(
    df_test.set_index('ds'), label='y', cols_keep=col_list
)

In [None]:
def objective(trial):
    """Optuna objective: fit an XGBRegressor and return its hold-out error.

    The model is fitted on the first 80% of ``X_train`` / ``y_train`` and
    scored with ``pct_change`` on the remaining 20%. Scoring on the rows the
    model was fitted on (as was done before) rewards overfitting, so the
    search would favour parameter sets that merely memorise the training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        ``pct_change`` between held-out targets and predictions (lower is
        better).
    """
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        # NOTE(review): the seed is searched like a hyper-parameter; kept so
        # that study.best_params has the same keys as before.
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }

    # Chronological hold-out: the training rows come from a daily-grouped
    # frame, so the last rows are the most recent dates.
    holdout_fraction = 0.2
    n_fit = int(len(X_train) * (1 - holdout_fraction))
    X_fit, X_val = X_train.iloc[:n_fit], X_train.iloc[n_fit:]
    y_fit, y_val = y_train.iloc[:n_fit], y_train.iloc[n_fit:]

    model = xgb.XGBRegressor(**param)
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_val)

    return pct_change(y_val, y_pred)

In [None]:
# Create a study that minimises the value returned by `objective`, then run
# the hyper-parameter search for 20000 trials.
# NOTE(review): no sampler seed is passed here, so results may differ between
# runs — confirm whether that is acceptable.
study = optuna.create_study(
    study_name='regression',
    direction='minimize',
)
study.optimize(
    objective,
    n_trials=20000,
)

In [None]:
# Display the parameter set of the best trial found by the study.
study.best_params

In [None]:
# Refit a regressor with the best parameters on the whole training set and
# predict that same training set (in-sample predictions).
model = xgb.XGBRegressor(**study.best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

# In-sample error; this is not an estimate of out-of-sample performance.
pct_change(y_train, y_pred)

In [None]:
# Store the training-set predictions next to the actual values and display the frame.
df_train['y_pred'] = y_pred
df_train

In [None]:
# Actual vs. in-sample predicted values over the training period.
# The trailing ';' suppresses the Axes repr in the notebook output; the former
# `_ = ...,` form bound a 1-tuple to `_`, shadowing IPython's last-output name.
df_train.set_index('ds')[['y','y_pred']].plot(figsize=(15, 5));

In [None]:
# Predict the held-out test period with the fitted model.
pred_test = model.predict(X_test)

In [None]:
# Error of the test-period predictions, measured with pct_change.
pct_change(y_test, pred_test)

In [None]:
# Store the test-set predictions next to the actual values and display the frame.
# Note the column is named 'pred' here, while the training frame uses 'y_pred'.
df_test['pred'] = pred_test
df_test

In [None]:
# Actual vs. predicted values over the test period.
# The trailing ';' suppresses the Axes repr in the notebook output; the former
# `_ = ...,` form bound a 1-tuple to `_`, shadowing IPython's last-output name.
df_test.set_index('ds')[['y','pred']].plot(figsize=(15, 5));

In [None]:
# Stack the training and test frames (this rebinds the name df_last used earlier
# in the per-city loop). Columns present in only one of the frames are filled
# with NaN for the rows of the other.
df_last = pd.concat([df_train, df_test], sort = False)
df_last

In [None]:
# Actual values over the whole period together with the 'pred' column.
# The trailing ';' suppresses the Axes repr in the notebook output; the former
# `_ = ...,` form bound a 1-tuple to `_`, shadowing IPython's last-output name.
df_last.set_index('ds')[['y','pred']].plot(figsize=(15, 5));