### **Basic Libraries**

In [1]:
import numpy as np
import pandas as pd

In [2]:
train = np.load("data/train_matrix.npy")
total_amount = np.load("data/train_labels.npy")

In [3]:
print(train.shape)
print(total_amount.shape)

(2689006, 53)
(2689006,)


In [4]:
test = np.load("data/test_matrix.npy")
test_amount = np.load("data/test_labels.npy")

### **Split Training Data into validation and training**

In [5]:
from sklearn.model_selection import train_test_split
xtrain, xvalid, ytrain, yvalid = train_test_split(train, total_amount, test_size=0.10, random_state=123)

In [6]:
del train, total_amount

### **LightGBM without parameters**

In [7]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

In [8]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
reg = LGBMRegressor()
reg.fit(xtrain, ytrain)

In [9]:
pred = reg.predict(xvalid)
mse = mean_squared_error(yvalid, pred)
rmse = np.sqrt(mse)
print(rmse)

2.7185601578298515


In [10]:
pred = reg.predict(test)
mse = mean_squared_error(test_amount, pred)
rmse = np.sqrt(mse)
print(rmse)

5.702606731198352


### **Parameter optimisation using Optuna**

In [14]:
def optimize(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 40, 400),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
        "n_jobs": -1
    }
    reg = LGBMRegressor(**params)
    reg.fit(xtrain, ytrain) 
    pred = reg.predict(xvalid)
    mse = mean_squared_error(yvalid, pred)
    rmse = np.sqrt(mse)
    return rmse

In [15]:
import optuna
study = optuna.create_study(direction="minimize")
study.optimize(optimize, n_trials=50)

[32m[I 2022-08-21 13:43:40,108][0m A new study created in memory with name: no-name-ab4f0b23-862e-4eb7-9f6f-2c4ea590e271[0m
[32m[I 2022-08-21 13:43:48,408][0m Trial 0 finished with value: 2.6255510555897037 and parameters: {'max_depth': 15, 'min_child_samples': 337, 'subsample': 0.5012739623603298, 'learning_rate': 0.4222393872703886}. Best is trial 0 with value: 2.6255510555897037.[0m
[32m[I 2022-08-21 13:43:56,797][0m Trial 1 finished with value: 2.6773100940197163 and parameters: {'max_depth': 5, 'min_child_samples': 376, 'subsample': 0.583888054474553, 'learning_rate': 0.6023945798345121}. Best is trial 0 with value: 2.6255510555897037.[0m
[32m[I 2022-08-21 13:44:06,465][0m Trial 2 finished with value: 2.686392565366239 and parameters: {'max_depth': 6, 'min_child_samples': 155, 'subsample': 0.9437615573175333, 'learning_rate': 0.19521353730810825}. Best is trial 0 with value: 2.6255510555897037.[0m
[32m[I 2022-08-21 13:44:16,053][0m Trial 3 finished with value: 2.6597

In [16]:
del xtrain, xvalid, ytrain, yvalid

### **Final Modal**

In [17]:
train = np.load("data/train_matrix.npy")
total_amount = np.load("data/train_labels.npy")

In [19]:
reg = LGBMRegressor(max_depth=10, min_child_samples=13, subsample=0.7, learning_rate=0.25, n_jobs=-1)
reg.fit(train, total_amount)

### **Predictions**

In [20]:
pred = reg.predict(test)
mse = mean_squared_error(test_amount, pred)
rmse = np.sqrt(mse)
print(rmse)

5.658210424618493
