In [8]:
### **Basic Libraries**

In [9]:
import numpy as np
import pandas as pd

In [28]:
train = np.load("data/train_matrix.npy")
total_amount = np.load("data/train_labels.npy")

In [11]:
print(train.shape)
print(total_amount.shape)

(2689034, 53)
(2689034,)


In [12]:
test = np.load("data/test_matrix.npy")
test_amount = np.load("data/test_labels.npy")

In [13]:
### **Split Training Data**

In [14]:
from sklearn.model_selection import train_test_split
xtrain, xvalid, ytrain, yvalid = train_test_split(train, total_amount, test_size=0.10, random_state=123 )

In [16]:
### **LightGBM without parameters**

In [17]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

In [18]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
reg = LGBMRegressor()
reg.fit(xtrain, ytrain)

In [19]:
pred = reg.predict(xvalid)
mse = mean_squared_error(yvalid, pred)
rmse = np.sqrt(mse)
print(rmse)

2.694677568159574


In [20]:
pred = reg.predict(test)
mse = mean_squared_error(test_amount, pred)
rmse = np.sqrt(mse)
print(rmse)

5.424498820992102


In [21]:
### **Parameter optimisation using Optuna**

In [22]:
def optimize(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 40, 400),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 10.0),
        "n_jobs": -1
    }
    reg = LGBMRegressor(**params)
    reg.fit(xtrain, ytrain) 
    pred = reg.predict(xvalid)
    mse = mean_squared_error(yvalid, pred)
    rmse = np.sqrt(mse)
    return rmse

In [23]:
import optuna
study = optuna.create_study(direction="minimize")
study.optimize(optimize, n_trials=50)

[32m[I 2022-08-16 18:58:06,515][0m A new study created in memory with name: no-name-3ed28db9-7b6c-4a46-84a0-ee1552a624f2[0m
[32m[I 2022-08-16 18:58:17,308][0m Trial 0 finished with value: 1.4653935140231755e+38 and parameters: {'max_depth': 5, 'min_child_samples': 322, 'subsample': 0.6371820508028108, 'learning_rate': 4.267468357158954}. Best is trial 0 with value: 1.4653935140231755e+38.[0m
[32m[I 2022-08-16 18:58:32,009][0m Trial 1 finished with value: 1454105192172.6335 and parameters: {'max_depth': 14, 'min_child_samples': 54, 'subsample': 0.6536379396771976, 'learning_rate': 2.288808121127226}. Best is trial 1 with value: 1454105192172.6335.[0m
[32m[I 2022-08-16 18:58:45,582][0m Trial 2 finished with value: 5.939386487354532e+17 and parameters: {'max_depth': 12, 'min_child_samples': 232, 'subsample': 0.8585631595099117, 'learning_rate': 2.4665604516647144}. Best is trial 1 with value: 1454105192172.6335.[0m
[32m[I 2022-08-16 18:58:54,631][0m Trial 3 finished with val

In [24]:
del xtrain, xvalid, ytrain, yvalid

In [25]:
### **Final Modal**

In [29]:
reg = LGBMRegressor(max_depth=12, min_child_samples=12, subsample=0.5, learning_rate=0.5, n_jobs=-1)
reg.fit(train, total_amount)
pred = reg.predict(test)
mse = mean_squared_error(test_amount, pred)
rmse = np.sqrt(mse)
print(rmse)

5.3655729742342535


In [None]:
### **Save Model**

In [None]:
### **Predictions**

In [None]:
pred = reg.predict(test)
mse = mean_squared_error(test_amount, pred)
rmse = np.sqrt(mse)
print(rmse)

In [33]:
2.57/2.69

0.9553903345724907

In [34]:
5.36/5.42

0.988929889298893

In [35]:
print(test_amount[1000:1020])      
print(pred[1000:1020])      

[38.35 19.24 39.3  19.8  12.43 51.54 12.3  12.85 14.76 17.16 15.96 13.56
 22.3  45.65 16.8  18.3  16.56 12.36 16.38 10.3 ]
[38.18757043 17.74369212 42.22670592 21.66078636 12.65424084 52.30006214
 11.97186719 11.70528872 14.26221954 16.83989133 15.49606916 13.39244145
 28.05898526 46.7507621  33.32419388 18.22802055 15.17275875 12.2041193
 17.92550622 10.72688762]
