In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import glob
import lightgbm as lgb
from tqdm import tqdm
from loaders.blog_loader import load_data, mono_list
from monotonenorm import SigmaNet, GroupSort, direct_norm
import mup

In [2]:
# Load the four arrays used throughout the notebook via the project loader.
# Presumably (train features, train targets, test features, test targets) --
# TODO confirm the order against loaders.blog_loader.load_data.
Xtr, Ytr, Xts, Yts = load_data(get_categorical_info=False)

In [3]:
# One flag per feature column of Xtr: 1 when the column index appears in
# mono_list, 0 otherwise. The same list is passed to LightGBM and to SigmaNet.
monotone_constraints = [int(col in mono_list) for col in range(Xtr.shape[1])]

In [4]:
# Gradient-boosted baseline that receives the per-feature monotone_constraints.
clf = lgb.LGBMRegressor(
    n_estimators=10000,
    max_depth=5,
    learning_rate=.1,
    monotone_constraints=monotone_constraints,
)
# `early_stopping_rounds=` / `verbose=` are no longer accepted by `fit()` in
# LightGBM >= 4.0 (deprecated since 3.3); the callbacks below are the
# supported equivalent: stop after 200 rounds without RMSE improvement and
# emit no per-iteration log lines.
# NOTE(review): early stopping monitors (Xts, Yts), the same split that is
# scored in the next cell, so that score is optimistic if this is the held-out
# test set -- consider a separate validation split.
clf.fit(
    Xtr,
    Ytr,
    eval_set=[(Xts, Yts)],
    eval_metric='rmse',
    callbacks=[
        lgb.early_stopping(stopping_rounds=200, verbose=False),
        lgb.log_evaluation(period=0),
    ],
)

LGBMRegressor(max_depth=5,
              monotone_constraints=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...],
              n_estimators=10000)

In [5]:
def _rmse(y_pred, y_true):
  """Return the root-mean-squared error between predictions and targets."""
  return (((y_pred - y_true)**2).mean())**.5


# Score the LightGBM baseline on both splits; the tuple is the cell output.
rmse_tr = _rmse(clf.predict(Xtr), Ytr)
rmse_ts = _rmse(clf.predict(Xts), Yts)
rmse_tr, rmse_ts

(0.14963566230704012, 0.14832811996053064)

In [44]:

# Seed torch's global RNG before any layer is created so that weight
# initialisation is repeatable.
torch.manual_seed(13)

width = 32           # hidden-layer width handed to Model(...)
per_layer_lip = 0.5  # `alpha` passed to direct_norm for every linear layer

class Model(torch.nn.Module):
  """Four-linear-layer MLP mapping the input features to one scalar.

  The class reads three notebook globals at construction time, so they must
  exist before it is instantiated: ``Xtr`` (its second dimension sets the
  input width), ``per_layer_lip`` and ``monotone_constraints``.

  Args:
    width: number of units in each hidden layer.
    robust: when True, every linear layer is wrapped with
      ``monotonenorm.direct_norm`` (``alpha=per_layer_lip``; kind "one-inf"
      for the first layer, "inf" for the others) and ``GroupSort(1)`` is the
      activation. When False, the linear layers are used as-is with SiLU.
    sigma: when True, the layer stack is wrapped in ``SigmaNet`` with
      ``sigma=per_layer_lip**4`` and the global ``monotone_constraints``.

  NOTE(review): ``per_layer_lip**4`` presumably is the product of the four
  per-layer bounds; with ``robust=False`` the layers are not normalised, so
  confirm that ``sigma=True`` is meaningful in that configuration.
  """

  def __init__(self, width, robust=False, sigma=False):
    super().__init__()

    if robust:
      # Imported locally and aliased so both branches bind the same local name.
      from monotonenorm import direct_norm as constrain
      make_activation = lambda: GroupSort(1)
    else:
      def constrain(layer, *args, **kwargs):
        """Identity stand-in for direct_norm: yields a plain, unnormalised layer."""
        return layer
      # SiLU is also known as swish.
      make_activation = lambda: torch.nn.SiLU()

    # (in_features, out_features, norm kind) for each linear layer, in order.
    layer_specs = (
      (Xtr.shape[1], width, "one-inf"),
      (width, width, "inf"),
      (width, width, "inf"),
      (width, 1, "inf"),
    )

    modules = []
    for n_in, n_out, norm_kind in layer_specs:
      if modules:
        # An activation sits between consecutive linear layers, none after the last.
        modules.append(make_activation())
      modules.append(
        constrain(torch.nn.Linear(n_in, n_out), kind=norm_kind, alpha=per_layer_lip)
      )
    stack = torch.nn.Sequential(*modules)

    if sigma:
      stack = SigmaNet(stack, sigma=per_layer_lip**4, monotone_constraints=monotone_constraints)
    self.nn = stack

  def forward(self, x):
    """Apply the wrapped network to ``x`` and return its output."""
    return self.nn(x)

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA (a bare `.cuda()` raises there).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Normalised layers + GroupSort, wrapped in SigmaNet.
model = Model(width, robust=True, sigma=True).to(device)

# muP experiments are currently disabled: base/delta models of width 1 and 2
# with mup.set_base_shapes, mup.init.* initialisers, mup.MuAdam(lr=3e-3).
# Plain Adam with the default initialisation is used instead; the
# ExponentialLR(gamma=.999) scheduler is disabled as well.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
print('params:', sum(p.numel() for p in model.parameters()))



params: 11009


In [45]:
# Display the module tree (layer types and sizes) as the cell output.
model

Model(
  (nn): SigmaNet(
    (nn): Sequential(
      (0): Linear(in_features=276, out_features=32, bias=True)
      (1): GroupSort(num_groups: {self.n_groups})
      (2): Linear(in_features=32, out_features=32, bias=True)
      (3): GroupSort(num_groups: {self.n_groups})
      (4): Linear(in_features=32, out_features=32, bias=True)
      (5): GroupSort(num_groups: {self.n_groups})
      (6): Linear(in_features=32, out_features=1, bias=True)
    )
  )
)

In [46]:
# Put the data on whatever device the model's parameters already live on
# instead of hard-coding CUDA, so data and model can never end up mismatched.
device = next(model.parameters()).device

Xtrt = torch.tensor(Xtr, dtype=torch.float32).to(device)
Ytrt = torch.tensor(Ytr, dtype=torch.float32).view(-1, 1).to(device)
Xtst = torch.tensor(Xts, dtype=torch.float32).to(device)
Ytst = torch.tensor(Yts, dtype=torch.float32).view(-1, 1).to(device)

# Standardise both splits with statistics computed on the first split only.
mean = Xtrt.mean(0)
std = Xtrt.std(0)
# A constant column has std == 0, which would turn the division below into
# NaN/inf and poison every loss value; leave such columns merely centred.
std = torch.where(std > 0, std, torch.ones_like(std))

Xtrt = (Xtrt - mean) / std
Xtst = (Xtst - mean) / std


dataloader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(Xtrt, Ytrt), batch_size=256, shuffle=True)

bar = tqdm(range(1000))
for i in bar:
  # One pass over the shuffled mini-batches.
  for Xi, yi in dataloader:
    y_pred = model(Xi)
    losstr = torch.nn.functional.mse_loss(y_pred, yi)
    optimizer.zero_grad()
    losstr.backward()
    optimizer.step()

  # Evaluate on the full second split once per epoch, without tracking gradients.
  # NOTE: `losstr` is the loss of the last mini-batch of the epoch, not an
  # epoch average, so the first number of each pair is noisier than the second.
  with torch.no_grad():
    y_predts = model(Xtst)
    lossts = torch.nn.functional.mse_loss(y_predts, Ytst)
    bar.set_description(f'Loss: {losstr.item():.4f} {lossts.item():.4f}, RMSE: {(losstr.item()**.5):.4f} {(lossts.item()**.5):.4f}')

Loss: 0.0286 0.0296, RMSE: 0.1692 0.1721:   5%|▌         | 52/1000 [00:37<11:26,  1.38it/s]


KeyboardInterrupt: 