In [137]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import glob
import lightgbm as lgb
from tqdm import tqdm
from loaders.blog_loader import load_data, mono_list
from monotonenorm import SigmaNet, GroupSort, direct_norm
import optuna

In [10]:
Xtr, Ytr, Xts, Yts = load_data(get_categorical_info=False)

In [11]:
monotone_constraints = [1 if i in mono_list else 0 for i in range(Xtr.shape[1])]

In [12]:
clf = lgb.LGBMRegressor(n_estimators=10000, max_depth=5, learning_rate=.1, monotone_constraints=monotone_constraints)
clf.fit(Xtr, Ytr, early_stopping_rounds=200, eval_set=[(Xts, Yts)], eval_metric='rmse', verbose=0)

LGBMRegressor(max_depth=5,
              monotone_constraints=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...],
              n_estimators=10000)

In [13]:
rmse_tr = (((clf.predict(Xtr) - Ytr)**2).mean())**.5
rmse_ts = (((clf.predict(Xts) - Yts)**2).mean())**.5
rmse_tr, rmse_ts

(0.14963566230704012, 0.14832811996053064)

In [135]:
Xtrt = torch.tensor(Xtr, dtype=torch.float32).cuda()
Ytrt = torch.tensor(Ytr, dtype=torch.float32).view(-1, 1).cuda()
Xtst = torch.tensor(Xts, dtype=torch.float32).cuda()
Ytst = torch.tensor(Yts, dtype=torch.float32).view(-1, 1).cuda()
#std = 1
mean = Xtrt.mean(0)
std = Xtrt.std(0)

Xtrt = (Xtrt - mean) / std
Xtst = (Xtst - mean) / std


def run_exp(seed):
  torch.manual_seed(seed)
  dataloader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(Xtrt, Ytrt), batch_size=256, shuffle=True)

  per_layer_lip = .5
  width = 4

  class Model(torch.nn.Module):
    def __init__(self, robust=False, sigma=False):
      super().__init__()
      if robust:
        from monotonenorm import direct_norm
        activation = lambda : GroupSort(1)
      else:
        direct_norm = lambda x, *args, **kwargs: x # make it a normal network
        activation = lambda : GroupSort(1)
        #activation = lambda : torch.nn.ReLU()
        #swish
        #activation = lambda : torch.nn.SiLU()

      self.nn = torch.nn.Sequential(
        direct_norm(torch.nn.Linear(Xtr.shape[1], width), kind="one-inf", alpha=per_layer_lip),
        activation(),
        direct_norm(torch.nn.Linear(width, width), kind="inf", alpha=per_layer_lip),
        activation(),
        direct_norm(torch.nn.Linear(width, width), kind="inf", alpha=per_layer_lip),
        activation(),
        direct_norm(torch.nn.Linear(width, 1), kind="inf", alpha=per_layer_lip),
        #direct_norm(mup.MuReadout(width, 1), kind="inf", alpha=per_layer_lip),
      )
      if sigma:
        self.nn = SigmaNet(self.nn, sigma=per_layer_lip**5, monotone_constraints= monotone_constraints)
    
    def forward(self, x):
      return self.nn(x)

  model = Model(robust=True, sigma=True).cuda()
  print(model)
  optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
  EPOCHS = 200
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=3e-3, steps_per_epoch=len(dataloader), epochs=EPOCHS)
  print('params:', sum(p.numel() for p in model.parameters()))
  bar = tqdm(range(EPOCHS))
  best_rmse = 1
  for i in bar:
    for Xi, yi in dataloader:
      y_pred = model(Xi)
      losstr = torch.nn.functional.mse_loss(y_pred, yi)
      optimizer.zero_grad()
      losstr.backward()
      optimizer.step()
      scheduler.step()
    with torch.no_grad():
      y_predts = model(Xtst)
      lossts = torch.nn.functional.mse_loss(y_predts, Ytst)
      tsrmse = lossts.item()**.5
      trrmse = losstr.item()**.5
      best_rmse = min(best_rmse, tsrmse)
      bar.set_description(f"train rmse: {trrmse:.4f} test rmse: {tsrmse:.4f}, best: {best_rmse:.4f}, lr: {scheduler.get_last_lr()[0]:.4f}")
  return best_rmse



In [136]:
run_exp(1)

Model(
  (nn): SigmaNet(
    (nn): Sequential(
      (0): Linear(in_features=276, out_features=4, bias=True)
      (1): GroupSort(num_groups: {self.n_groups})
      (2): Linear(in_features=4, out_features=4, bias=True)
      (3): GroupSort(num_groups: {self.n_groups})
      (4): Linear(in_features=4, out_features=4, bias=True)
      (5): GroupSort(num_groups: {self.n_groups})
      (6): Linear(in_features=4, out_features=1, bias=True)
    )
  )
)
params: 1153


train rmse: 0.1277 test rmse: 0.1685, best: 0.1677, lr: 0.0000: 100%|██████████| 200/200 [02:02<00:00,  1.63it/s]


0.1677173962770335