## Rain prediction in Australia

#### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE

import sklearn

from sklearn.svm import SVC
from sklearn.svm import SVR

from sklearn.impute import SimpleImputer

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler,OneHotEncoder, LabelEncoder

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score,roc_auc_score,precision_score, recall_score, f1_score,ConfusionMatrixDisplay,classification_report
from sklearn.metrics import mean_squared_error

import optuna

import xgboost as xgb
from xgboost import XGBClassifier

import joblib

<br>
<br>
<br>
<br>
<br>

#### Read dataset

In [2]:
dataframe_clean_wo_outl_wo_corr = pd.read_csv(r"C:\Users\Lucio\Documents\Github\Next-day-rain-prediction\1- Data\2- Processed\dataframe_clean_wo_outl_wo_corr.csv", index_col=0)
dataframe_clean_wo_outl_wo_corr.head()

Unnamed: 0,Location,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,Humidity9am,Humidity3pm,Pressure9am,RainToday,RainTomorrow
0,Albury,0.6,W,44.0,W,WNW,71.0,22.0,1007.7,0.0,0.0
1,Albury,0.0,WNW,44.0,NNW,WSW,44.0,25.0,1010.6,0.0,0.0
2,Albury,0.0,WSW,46.0,W,WSW,38.0,30.0,1007.6,0.0,0.0
3,Albury,0.0,NE,24.0,SE,E,45.0,16.0,1017.6,0.0,0.0
4,Albury,1.0,W,41.0,ENE,NW,82.0,33.0,1010.8,0.0,0.0


dataframe_clean_wo_outl_wo_corr characteristics:
- Removed univariated outliers
- Removed variables with high collinearity

<br>
<br>
<br>
<br>
<br>

#### Encode Categorical Features

In [3]:
dataframe_encoded = pd.get_dummies(dataframe_clean_wo_outl_wo_corr)
dataframe_encoded.head()

Unnamed: 0,Rainfall,WindGustSpeed,Humidity9am,Humidity3pm,Pressure9am,RainToday,RainTomorrow,Location_Adelaide,Location_Albany,Location_Albury,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,0.6,44.0,71.0,22.0,1007.7,0.0,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,0.0,44.0,44.0,25.0,1010.6,0.0,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0.0,46.0,38.0,30.0,1007.6,0.0,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,0.0,24.0,45.0,16.0,1017.6,0.0,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1.0,41.0,82.0,33.0,1010.8,0.0,0.0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


<br>
<br>
<br>
<br>
<br>

### Model Tranining

In [4]:
#Create X and y dataframes
X = dataframe_encoded[[c for c in dataframe_encoded if c != 'RainTomorrow']].values
y = dataframe_encoded[['RainTomorrow']]

In [5]:
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X, y,random_state=0, test_size=0.30)

In [7]:
def objective_xgb(trial):
    param = {
        "verbosity": 1,
        "objective": "binary:logistic",
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        'eval_metric': 'error',
    }

    #param["booster"] == "gbtree"
    param["subsample"] = trial.suggest_float("subsample", 1e-8, 1.0, log=True)
    param["n_estimators"] = trial.suggest_int("n_estimators", 1, 1000)        
    param["max_depth"] = trial.suggest_int("max_depth", 1, 64)
    param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
    param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
    param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    bst = xgb.XGBClassifier(**param)
    bst.fit(X_train, y_train)

    y_pred = bst.predict(X_val)
    accuracy = sklearn.metrics.accuracy_score(y_val, y_pred)

    return -accuracy  #Negative accuracy to maximize it (because 'eval_metric': 'error')

#### Applying StandardScaler

In [8]:
sc_X = MinMaxScaler()
sc_y = MinMaxScaler()
X_sc = sc_X.fit_transform(X)
y_sc = sc_y.fit_transform(y)

In [9]:
X_sc_train, X_sc_val, y_sc_train, y_sc_val = sklearn.model_selection.train_test_split(X_sc, y_sc, random_state=0, test_size=0.30)

#### Hyperparameter optimization with Optuna

In [10]:
study_xgb = optuna.create_study()
study_xgb.optimize(objective_xgb, n_trials=10)
study_xgb.best_params

[I 2023-11-22 20:00:08,143] A new study created in memory with name: no-name-239eaaca-bec9-40d5-95af-5fc353ac8417
[I 2023-11-22 20:01:17,621] Trial 0 finished with value: -0.8242849606642354 and parameters: {'booster': 'gbtree', 'lambda': 7.545295590749144e-08, 'alpha': 0.6000440389410776, 'subsample': 0.8548783932123012, 'n_estimators': 759, 'max_depth': 2, 'eta': 3.097424724587213e-06, 'gamma': 2.1784690775011526e-08, 'grow_policy': 'lossguide'}. Best is trial 0 with value: -0.8242849606642354.
[I 2023-11-22 20:02:53,315] Trial 1 finished with value: -0.8422899607789169 and parameters: {'booster': 'gbtree', 'lambda': 0.342177913176916, 'alpha': 0.005949996343455195, 'subsample': 0.007229586354650772, 'n_estimators': 595, 'max_depth': 9, 'eta': 9.834271555196803e-05, 'gamma': 2.8880918753160445e-05, 'grow_policy': 'depthwise'}. Best is trial 1 with value: -0.8422899607789169.
[I 2023-11-22 20:10:31,839] Trial 2 finished with value: -0.8477258652721392 and parameters: {'booster': 'gbtr

{'booster': 'gbtree',
 'lambda': 0.00012590745625659945,
 'alpha': 0.0027868967368774207,
 'subsample': 0.9232800553773376,
 'n_estimators': 379,
 'max_depth': 21,
 'eta': 0.030538597049319792,
 'gamma': 0.001118029638832503,
 'grow_policy': 'lossguide'}

In [11]:
xgb_params = study_xgb.best_params
xgb_params

{'booster': 'gbtree',
 'lambda': 0.00012590745625659945,
 'alpha': 0.0027868967368774207,
 'subsample': 0.9232800553773376,
 'n_estimators': 379,
 'max_depth': 21,
 'eta': 0.030538597049319792,
 'gamma': 0.001118029638832503,
 'grow_policy': 'lossguide'}

#### Train model using best parameters

In [12]:
model = XGBClassifier(**xgb_params, silent=0)

model.fit(X_sc_train, y_sc_train, eval_set=[(X_sc_val, y_sc_val)], early_stopping_rounds=10, verbose=True)



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-logloss:0.67630
[1]	validation_0-logloss:0.66050
[2]	validation_0-logloss:0.64566
[3]	validation_0-logloss:0.63158
[4]	validation_0-logloss:0.61825
[5]	validation_0-logloss:0.60567
[6]	validation_0-logloss:0.59384
[7]	validation_0-logloss:0.58261
[8]	validation_0-logloss:0.57198
[9]	validation_0-logloss:0.56192
[10]	validation_0-logloss:0.55232
[11]	validation_0-logloss:0.54323
[12]	validation_0-logloss:0.53473
[13]	validation_0-logloss:0.52644
[14]	validation_0-logloss:0.51862
[15]	validation_0-logloss:0.51126
[16]	validation_0-logloss:0.50420
[17]	validation_0-logloss:0.49747
[18]	validation_0-logloss:0.49101
[19]	validation_0-logloss:0.48488
[20]	validation_0

#### Predict using validation dataset

In [13]:
y_predicted = model.predict(X_sc_val)
y_predicted

array([0, 0, 0, ..., 1, 0, 0])

#### Model performance evaluation

In [14]:
conf_matrix = confusion_matrix(y_sc_val, y_predicted)

accuracy = accuracy_score(y_sc_val, y_predicted)
precision = precision_score(y_sc_val, y_predicted)
recall = recall_score(y_sc_val, y_predicted)
f1 = f1_score(y_sc_val, y_predicted)
roc_auc = roc_auc_score(y_sc_val, y_predicted)

print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("ROC AUC:", roc_auc)

Confusion Matrix:
 [[32302  1817]
 [ 4731  4749]]
Accuracy: 0.8498130691070896
Precision: 0.7232713981114834
Recall: 0.5009493670886076
F1-Score: 0.5919232207403714
ROC AUC: 0.7238472911822768


#### Save model

In [16]:
ubi = r'C:\Users\Lucio\Documents\Github\Next-day-rain-prediction\3- Models/XGBClf_rain_pred.joblib'

joblib.dump(model, ubi)

['C:\\Users\\Lucio\\Documents\\Github\\Next-day-rain-prediction\\3- Models/XGBClf_rain_pred.joblib']