## Rain prediction in Australia

#### Import required libraries

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE

import sklearn

from sklearn.svm import SVC
from sklearn.svm import SVR

from sklearn.impute import SimpleImputer

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler,OneHotEncoder, LabelEncoder

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score,roc_auc_score,precision_score, recall_score, f1_score,ConfusionMatrixDisplay,classification_report
from sklearn.metrics import mean_squared_error

import optuna

import xgboost as xgb
from xgboost import XGBClassifier

<br>
<br>
<br>
<br>
<br>

#### Read dataset

In [4]:
dataframe_clean_wo_outl_wo_corr = pd.read_csv(r"C:\Users\Lucio\Documents\Github\Next-day-rain-prediction\1- Data\2- Processed\dataframe_clean_wo_outl_wo_corr.csv", index_col=0)
dataframe_clean_wo_outl_wo_corr.head()

Unnamed: 0,Location,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,Humidity9am,Humidity3pm,Pressure9am,RainToday,RainTomorrow
0,Albury,0.6,W,44.0,W,WNW,71.0,22.0,1007.7,0.0,0.0
1,Albury,0.0,WNW,44.0,NNW,WSW,44.0,25.0,1010.6,0.0,0.0
2,Albury,0.0,WSW,46.0,W,WSW,38.0,30.0,1007.6,0.0,0.0
3,Albury,0.0,NE,24.0,SE,E,45.0,16.0,1017.6,0.0,0.0
4,Albury,1.0,W,41.0,ENE,NW,82.0,33.0,1010.8,0.0,0.0


dataframe_clean_wo_outl_wo_corr characteristics:
- Removed univariated outliers
- Removed variables with high collinearity

<br>
<br>
<br>
<br>
<br>

#### Encode Categorical Features

In [5]:
dataframe_encoded = pd.get_dummies(dataframe_clean_wo_outl_wo_corr)
dataframe_encoded.head()

Unnamed: 0,Rainfall,WindGustSpeed,Humidity9am,Humidity3pm,Pressure9am,RainToday,RainTomorrow,Location_Adelaide,Location_Albany,Location_Albury,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,0.6,44.0,71.0,22.0,1007.7,0.0,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,0.0,44.0,44.0,25.0,1010.6,0.0,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0.0,46.0,38.0,30.0,1007.6,0.0,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,0.0,24.0,45.0,16.0,1017.6,0.0,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1.0,41.0,82.0,33.0,1010.8,0.0,0.0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


<br>
<br>
<br>
<br>
<br>

### Model Tranining

In [6]:
#Create X and y dataframes
X = dataframe_encoded[[c for c in dataframe_encoded if c != 'RainTomorrow']].values
y = dataframe_encoded[['RainTomorrow']]

In [7]:
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X, y,random_state=0, test_size=0.30)

In [12]:
def objective_xgb(trial):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    param = {
        "verbosity": 1,
        "objective": "binary:logistic",
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        #"booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        'eval_metric': 'error',  # Use error (1 - accuracy) as the evaluation metric for binary classification
    }

    if param["booster"] == "gbtree":# or param["booster"] == "gblinear":
        param["subsample"] = trial.suggest_float("subsample", 1e-8, 1.0, log=True)
        param["n_estimators"] = trial.suggest_int("n_estimators", 1, 1000)        
        param["max_depth"] = trial.suggest_int("max_depth", 1, 64)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    '''
    if param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 64)
        param["subsample"] = trial.suggest_float("subsample", 1e-8, 1.0, log=True)
        param["n_estimators"] = trial.suggest_int("n_estimators", 1, 1000) 
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    '''
    bst = xgb.train(param, dtrain)
    y_pred = bst.predict(dvalid)
    y_pred_binary = [1 if p >= 0.5 else 0 for p in y_pred]  # Convert probabilities to binary predictions
    
    accuracy = sklearn.metrics.accuracy_score(y_val, y_pred_binary)
    
    return -accuracy  #An objective value linked with the Trial object

In [13]:
sc_X = MinMaxScaler()
sc_y = MinMaxScaler()
X_sc = sc_X.fit_transform(X)
y_sc = sc_y.fit_transform(y)

In [14]:
X_sc_train, X_sc_val, y_sc_train, y_sc_val = sklearn.model_selection.train_test_split(X_sc, y_sc, random_state=0, test_size=0.30)

In [15]:
study_xgb = optuna.create_study()
study_xgb.optimize(objective_xgb, n_trials=30)
study_xgb.best_params

[I 2023-11-21 19:35:25,749] A new study created in memory with name: no-name-c75bb56e-8c7c-45e6-895b-58de6bbd71a7


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:28,400] Trial 0 finished with value: -0.8350420881212872 and parameters: {'booster': 'gbtree', 'lambda': 0.0027627675294478794, 'alpha': 9.030532144027777e-08, 'subsample': 0.003989341123380293, 'n_estimators': 337, 'max_depth': 24, 'eta': 0.009134665819853382, 'gamma': 1.3565771598841993e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:29,491] Trial 1 finished with value: -0.21743617972889287 and parameters: {'booster': 'gbtree', 'lambda': 2.471120208429458e-08, 'alpha': 0.046843893307469596, 'subsample': 1.4244243161057073e-05, 'n_estimators': 865, 'max_depth': 34, 'eta': 0.13578917044329492, 'gamma': 0.0036636576310437835, 'grow_policy': 'lossguide'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:30,923] Trial 2 finished with value: -0.83483566136838 and parameters: {'booster': 'gbtree', 'lambda': 2.238000078812369e-06, 'alpha': 0.03962956754017319, 'subsample': 0.00047850965745067605, 'n_estimators': 832, 'max_depth': 19, 'eta': 0.001631261994634028, 'gamma': 1.9272462075623864e-07, 'grow_policy': 'lossguide'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:32,017] Trial 3 finished with value: -0.21743617972889287 and parameters: {'booster': 'gbtree', 'lambda': 0.0014023110607134877, 'alpha': 8.171380121732629e-08, 'subsample': 3.900784839544371e-07, 'n_estimators': 228, 'max_depth': 52, 'eta': 0.0006445400096342336, 'gamma': 0.0010497695426235029, 'grow_policy': 'depthwise'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:33,004] Trial 4 finished with value: -0.21743617972889287 and parameters: {'booster': 'gbtree', 'lambda': 1.2873597380855141e-06, 'alpha': 1.7365447887985792e-07, 'subsample': 3.025779814382656e-06, 'n_estimators': 372, 'max_depth': 39, 'eta': 9.138271792830982e-08, 'gamma': 0.0003246062073110227, 'grow_policy': 'lossguide'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:34,108] Trial 5 finished with value: -0.21743617972889287 and parameters: {'booster': 'gbtree', 'lambda': 6.36175798753497e-08, 'alpha': 7.95936242522374e-07, 'subsample': 1.0515018278731473e-05, 'n_estimators': 1, 'max_depth': 60, 'eta': 0.0004889301316886351, 'gamma': 1.4375991236156355e-08, 'grow_policy': 'lossguide'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:35,157] Trial 6 finished with value: -0.21743617972889287 and parameters: {'booster': 'gbtree', 'lambda': 1.1055549353601064e-07, 'alpha': 0.1536323212283759, 'subsample': 2.4923729510484223e-08, 'n_estimators': 784, 'max_depth': 13, 'eta': 0.3714214372989, 'gamma': 4.0434780849169694e-08, 'grow_policy': 'depthwise'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:36,124] Trial 7 finished with value: -0.21743617972889287 and parameters: {'booster': 'gbtree', 'lambda': 4.7509295986508734e-07, 'alpha': 1.3510367982151932e-08, 'subsample': 1.3390993105433518e-08, 'n_estimators': 574, 'max_depth': 14, 'eta': 5.088869506315166e-06, 'gamma': 0.0034877600261929336, 'grow_policy': 'depthwise'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:37,090] Trial 8 finished with value: -0.21743617972889287 and parameters: {'booster': 'gbtree', 'lambda': 2.384099428978859e-05, 'alpha': 0.15232653727383266, 'subsample': 1.486002499482311e-08, 'n_estimators': 807, 'max_depth': 10, 'eta': 0.012077886002626246, 'gamma': 9.633634508606543e-08, 'grow_policy': 'lossguide'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:38,123] Trial 9 finished with value: -0.21743617972889287 and parameters: {'booster': 'gbtree', 'lambda': 1.4656826451928042e-08, 'alpha': 1.2100794932721145e-06, 'subsample': 7.553947027554905e-08, 'n_estimators': 607, 'max_depth': 54, 'eta': 2.1599463192097523e-06, 'gamma': 8.339740224292807e-08, 'grow_policy': 'depthwise'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:39,322] Trial 10 finished with value: -0.8259134383816142 and parameters: {'booster': 'gbtree', 'lambda': 0.25252409181757013, 'alpha': 5.045320489437193e-05, 'subsample': 0.04594564374942321, 'n_estimators': 185, 'max_depth': 1, 'eta': 0.02580748841307193, 'gamma': 0.9981440284478406, 'grow_policy': 'depthwise'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:41,202] Trial 11 finished with value: -0.8299731645221221 and parameters: {'booster': 'gbtree', 'lambda': 0.00040661183206323583, 'alpha': 0.0024767000594946107, 'subsample': 0.0012314023010506201, 'n_estimators': 445, 'max_depth': 24, 'eta': 0.0029277625612860415, 'gamma': 9.098854903282147e-07, 'grow_policy': 'lossguide'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:42,740] Trial 12 finished with value: -0.8313722791807151 and parameters: {'booster': 'gbtree', 'lambda': 1.5639591070208627e-05, 'alpha': 3.069811564885136e-05, 'subsample': 0.0006174964351598703, 'n_estimators': 994, 'max_depth': 25, 'eta': 6.181553987731658e-05, 'gamma': 4.480193261544629e-06, 'grow_policy': 'lossguide'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:47,309] Trial 13 finished with value: -0.7655909539209614 and parameters: {'booster': 'gbtree', 'lambda': 0.008284790595296384, 'alpha': 0.0015530035066372969, 'subsample': 0.041124345818796086, 'n_estimators': 676, 'max_depth': 24, 'eta': 0.7339849472242173, 'gamma': 5.254777240178458e-06, 'grow_policy': 'depthwise'}. Best is trial 0 with value: -0.8350420881212872.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:35:59,003] Trial 14 finished with value: -0.8441019289433244 and parameters: {'booster': 'gbtree', 'lambda': 5.503364712626407e-06, 'alpha': 0.9450088535951114, 'subsample': 0.725362881616662, 'n_estimators': 332, 'max_depth': 41, 'eta': 0.011355936900811325, 'gamma': 8.321026992375705e-07, 'grow_policy': 'lossguide'}. Best is trial 14 with value: -0.8441019289433244.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:36:17,446] Trial 15 finished with value: -0.8419000435789812 and parameters: {'booster': 'gbtree', 'lambda': 0.0001064304639682045, 'alpha': 6.677554153138328e-06, 'subsample': 0.562805916777869, 'n_estimators': 298, 'max_depth': 41, 'eta': 0.027661369442186844, 'gamma': 1.1090720649466408e-08, 'grow_policy': 'depthwise'}. Best is trial 14 with value: -0.8441019289433244.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:36:36,687] Trial 16 finished with value: -0.8406844193674167 and parameters: {'booster': 'gbtree', 'lambda': 7.313612924958608e-05, 'alpha': 6.503266957143363e-06, 'subsample': 0.6646803201620943, 'n_estimators': 121, 'max_depth': 44, 'eta': 0.08736300797319703, 'gamma': 1.2043994523796056e-08, 'grow_policy': 'lossguide'}. Best is trial 14 with value: -0.8441019289433244.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:36:47,463] Trial 17 finished with value: -0.8451799353196174 and parameters: {'booster': 'gbtree', 'lambda': 7.82496132019947e-06, 'alpha': 0.885383250911391, 'subsample': 0.5130725001909233, 'n_estimators': 284, 'max_depth': 44, 'eta': 0.059460048221478726, 'gamma': 1.3069558461007474e-06, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:36:51,883] Trial 18 finished with value: -0.839422922544095 and parameters: {'booster': 'gbtree', 'lambda': 4.51652304868137e-06, 'alpha': 0.6790620639790343, 'subsample': 0.03946306528296284, 'n_estimators': 481, 'max_depth': 48, 'eta': 0.10464003500919963, 'gamma': 1.3506514797924652e-05, 'grow_policy': 'lossguide'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:37:06,684] Trial 19 finished with value: -0.8341246358861442 and parameters: {'booster': 'gbtree', 'lambda': 3.637200188202458e-07, 'alpha': 0.5870484446158166, 'subsample': 0.9947875195007188, 'n_estimators': 113, 'max_depth': 34, 'eta': 0.7144727821390991, 'gamma': 1.1341817483709955e-06, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:37:10,192] Trial 20 finished with value: -0.8382990435560449 and parameters: {'booster': 'gbtree', 'lambda': 5.6418277917611375e-06, 'alpha': 0.004416721292540735, 'subsample': 0.00845371597667377, 'n_estimators': 421, 'max_depth': 64, 'eta': 0.003545887038171034, 'gamma': 1.1986690920192988e-05, 'grow_policy': 'lossguide'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:37:24,071] Trial 21 finished with value: -0.8427257505906098 and parameters: {'booster': 'gbtree', 'lambda': 0.00010861950882771133, 'alpha': 0.00029994808655373127, 'subsample': 0.3538814511748099, 'n_estimators': 287, 'max_depth': 43, 'eta': 0.02754261117857274, 'gamma': 4.678410123213796e-07, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:37:33,022] Trial 22 finished with value: -0.8408679098144453 and parameters: {'booster': 'gbtree', 'lambda': 2.0205827385176702e-05, 'alpha': 0.0006142537969146693, 'subsample': 0.1749730134649326, 'n_estimators': 274, 'max_depth': 47, 'eta': 0.042788165960413314, 'gamma': 6.028489459548962e-07, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:37:40,297] Trial 23 finished with value: -0.840615610449781 and parameters: {'booster': 'gbtree', 'lambda': 0.00010947168632476244, 'alpha': 0.01401644114398869, 'subsample': 0.1386040944299978, 'n_estimators': 133, 'max_depth': 38, 'eta': 0.008401622171663048, 'gamma': 4.6756868904668984e-05, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:37:50,249] Trial 24 finished with value: -0.833918209133237 and parameters: {'booster': 'gbtree', 'lambda': 1.8722539872920446e-06, 'alpha': 0.00029683405669519346, 'subsample': 0.17631617286171278, 'n_estimators': 358, 'max_depth': 54, 'eta': 0.18406554587292334, 'gamma': 5.138993912376526e-07, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:37:53,789] Trial 25 finished with value: -0.8358448588270373 and parameters: {'booster': 'gbtree', 'lambda': 1.0625478282341933e-05, 'alpha': 0.009066904632847468, 'subsample': 0.010457834665691423, 'n_estimators': 48, 'max_depth': 30, 'eta': 0.040397839935231765, 'gamma': 2.1125257712703806e-06, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:38:05,548] Trial 26 finished with value: -0.7736186609784628 and parameters: {'booster': 'gbtree', 'lambda': 6.515091240721063e-07, 'alpha': 0.00023797844228759876, 'subsample': 0.3076869356296905, 'n_estimators': 224, 'max_depth': 45, 'eta': 0.8355841211335443, 'gamma': 3.4799306677061696e-07, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:38:19,893] Trial 27 finished with value: -0.8448588270373174 and parameters: {'booster': 'gbtree', 'lambda': 5.1089045021982234e-05, 'alpha': 0.9865403574347783, 'subsample': 0.9456724922896605, 'n_estimators': 559, 'max_depth': 31, 'eta': 0.15356564870013167, 'gamma': 2.4039670457833144e-06, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:38:35,341] Trial 28 finished with value: -0.8443312920021101 and parameters: {'booster': 'gbtree', 'lambda': 4.359260209005445e-06, 'alpha': 0.9727887334512185, 'subsample': 0.8791975434443149, 'n_estimators': 544, 'max_depth': 29, 'eta': 0.18802228746120614, 'gamma': 3.928172942910947e-05, 'grow_policy': 'lossguide'}. Best is trial 17 with value: -0.8451799353196174.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[I 2023-11-21 19:38:38,346] Trial 29 finished with value: -0.8141929860776623 and parameters: {'booster': 'gbtree', 'lambda': 3.357244031832335e-05, 'alpha': 0.1898117939282201, 'subsample': 0.0037847349364593754, 'n_estimators': 578, 'max_depth': 29, 'eta': 0.2747802029336958, 'gamma': 4.234655297086859e-05, 'grow_policy': 'depthwise'}. Best is trial 17 with value: -0.8451799353196174.


{'booster': 'gbtree',
 'lambda': 7.82496132019947e-06,
 'alpha': 0.885383250911391,
 'subsample': 0.5130725001909233,
 'n_estimators': 284,
 'max_depth': 44,
 'eta': 0.059460048221478726,
 'gamma': 1.3069558461007474e-06,
 'grow_policy': 'depthwise'}

In [16]:
xgb_params = study_xgb.best_params
xgb_params

{'booster': 'gbtree',
 'lambda': 7.82496132019947e-06,
 'alpha': 0.885383250911391,
 'subsample': 0.5130725001909233,
 'n_estimators': 284,
 'max_depth': 44,
 'eta': 0.059460048221478726,
 'gamma': 1.3069558461007474e-06,
 'grow_policy': 'depthwise'}

In [22]:
dtrain = xgb.DMatrix(X_sc_train, y_sc_train)
dtest = xgb.DMatrix(X_sc_val)

cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=50,
    verbose_eval=200, show_stdv=False)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()
plt.show()

In [23]:
#num_boost_rounds = len(cv_output)
num_boost_rounds = 50
#print(num_boost_rounds)

model = XGBClassifier(**xgb_params, silent=0)

model.fit(X_sc_train, y_sc_train, eval_set=[(X_sc_val, y_sc_val)], early_stopping_rounds=10, verbose=True)

#model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round= num_boost_rounds)



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-logloss:0.66162
[1]	validation_0-logloss:0.63355
[2]	validation_0-logloss:0.60872
[3]	validation_0-logloss:0.58631
[4]	validation_0-logloss:0.56614
[5]	validation_0-logloss:0.54785
[6]	validation_0-logloss:0.53145
[7]	validation_0-logloss:0.51633
[8]	validation_0-logloss:0.50267
[9]	validation_0-logloss:0.49028
[10]	validation_0-logloss:0.47891
[11]	validation_0-logloss:0.46856
[12]	validation_0-logloss:0.45896
[13]	validation_0-logloss:0.45009
[14]	validation_0-logloss:0.44203
[15]	validation_0-logloss:0.43463
[16]	validation_0-logloss:0.42813
[17]	validation_0-logloss:0.42201
[18]	validation_0-logloss:0.41635
[19]	validation_0-logloss:0.41118
[20]	validation_0

In [26]:
y_predicted = model.predict(X_sc_val)
#threshold = 0.5
#y_binary_predicted = np.where(y_predicted >= threshold, 1, 0)
#y_binary_predicted
y_predicted

array([0, 0, 0, ..., 1, 0, 0])

In [27]:
conf_matrix = confusion_matrix(y_sc_val, y_predicted)

accuracy = accuracy_score(y_sc_val, y_predicted)
precision = precision_score(y_sc_val, y_predicted)
recall = recall_score(y_sc_val, y_predicted)
f1 = f1_score(y_sc_val, y_predicted)
roc_auc = roc_auc_score(y_sc_val, y_predicted)

print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("ROC AUC:", roc_auc)

Confusion Matrix:
 [[32302  1817]
 [ 4702  4778]]
Accuracy: 0.8504782219775683
Precision: 0.7244882486732374
Recall: 0.5040084388185654
F1-Score: 0.5944634525660965
ROC AUC: 0.7253768270472556
