In [1]:
import os
import types

import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.data.sampler import WeightedRandomSampler

from sklearn.impute import KNNImputer

# from skorch import NeuralNetRegressor

import numpy as np
from tqdm import tqdm

from loss import LossComputer

from transformers import AdamW, BertTokenizer
from datasets import load_dataset

device = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mljyflores_team[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.14 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [2]:
class DRODataset(Dataset):
    def __init__(self, X, y, g):
        self.X = torch.Tensor(X)
        self.y = torch.tensor(y)
        self.group_to_name = {i:name for (i,name) in enumerate(set(g))}
        self.name_to_group = {name:i for (i,name) in enumerate(set(g))}
        self.g = torch.tensor([self.name_to_group[i] for i in g])
        self.n_groups = len(set(g))
        self._group_array = torch.LongTensor(self.g)
        self._group_counts = (torch.arange(self.n_groups).unsqueeze(1)==self._group_array).sum(1).float()

    def __len__(self):
        return len(self.g)

    def __getitem__(self, idx):
        X_row = self.X[idx].float()
        y_row = self.y[idx].float()
        g_row = self.g[idx].float()
        sample = {"X": X_row, "y": y_row, "g":g_row}
        return sample
  
    def group_counts(self):
        return torch.bincount(self.g)

    def get_loader(self, train=False, reweight_groups=False, **kwargs):
        if not train: # Validation or testing
            assert reweight_groups is None
            shuffle = False
            sampler = None
        elif not reweight_groups: # Training but not reweighting
            shuffle = True
            sampler = None
        else: # Training and reweighting
            # When the --robust flag is not set, reweighting changes the loss function
            # from the normal ERM (average loss over each training example)
            # to a reweighted ERM (weighted average where each (y,c) group has equal weight) .
            # When the --robust flag is set, reweighting does not change the loss function
            # since the minibatch is only used for mean gradient estimation for each group separately
            group_weights = len(self)/self._group_counts
            weights = group_weights[self._group_array]

            # Replacement needs to be set to True, otherwise we'll run out of minority samples
            sampler = WeightedRandomSampler(weights, len(self), replacement=True)
            shuffle = False

        loader = DataLoader(
          self,
          shuffle=shuffle,
          sampler=sampler,
          **kwargs)
        return loader
  
    def group_str(self, group_idx):
        self.group_to_name[group_idx]
    

In [3]:
def run_epoch(epoch, model, optimizer, loader, loss_computer, # logger, csv_logger, 
              args, is_training, show_progress=False, 
              log_every=50, scheduler=None):
    """
    scheduler is only used inside this function if model is bert.
    """

    if is_training:
        model.train()
    else:
        model.eval()

    prog_bar_loader = tqdm(loader)

    with torch.set_grad_enabled(is_training):
        for batch_idx, batch in enumerate(prog_bar_loader):
            # Unpack batch, feed through model, compute loss
            if args['d']=='dhs':
                x, y, g = batch['X'].to(f'cuda:{model.device_ids[0]}'), batch['y'].to(f'cuda:{model.device_ids[0]}'), batch['g'].to(f'cuda:{model.device_ids[0]}')
                outputs = model(x)
                loss_main = loss_computer.loss(outputs.view(-1), y, g, is_training)
        
            elif args['d']=='liar':
                batch = {k: v.to(f'cuda:{model.device_ids[0]}') for k, v in batch.items()}
                outputs = model(**batch)
                loss_main = loss_computer.loss(outputs['logits'], 
                                            batch['labels'], 
                                            batch['labels'], 
                                            is_training)
      
      
            # Backpropagate
            if is_training:
                optimizer.zero_grad()
                loss_main.backward()
                optimizer.step()

            if is_training and (batch_idx+1) % log_every==0:
                loss_computer.reset_stats()

            if (not is_training) or loss_computer.batch_count > 0:
                if is_training:
                    loss_computer.reset_stats()


def train(model, criterion, dataset, args, epoch_offset):    
    # model = model.to(device)

    # process generalization adjustment stuff
    adjustments = [float(c) for c in args['generalization_adjustment'].split(',')]
    assert len(adjustments) in (1, dataset['train_data'].n_groups)
    if len(adjustments)==1:
        adjustments = np.array(adjustments* dataset['train_data'].n_groups)
    else:
        adjustments = np.array(adjustments)

    # Define loss, optimizer, and LR scheduler
    train_loss_computer = LossComputer(
                            criterion,
                            is_robust=args['robust'],
                            dataset=dataset['train_data'],
                            alpha=args['alpha'],
                            gamma=args['gamma'],
                            adj=adjustments,
                            step_size=args['robust_step_size'],
                            normalize_loss=args['use_normalized_loss'],
                            btl=args['btl'],
                            min_var_weight=args['minimum_variational_weight'])
    optimizer = torch.optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=args['lr'],
        # momentum=0.9,
        weight_decay=args['weight_decay'])
    if args['scheduler']:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            'min',
            factor=0.1,
            patience=5,
            threshold=0.0001,
            min_lr=0,
            eps=1e-08)
    else:
        scheduler = None

    # Iterate through epochs
    best_val_acc = 0
    for epoch in range(epoch_offset, epoch_offset+args['n_epochs']):
      
        # Training Set
        # logger.write('\nEpoch [%d]:\n' % epoch)
        print(f'Epoch {epoch}')
        print(f'Training')
        run_epoch(
            epoch, model, optimizer, 
            dataset['train_loader'],
            train_loss_computer, 
            args,
            is_training=True,
            show_progress=args['show_progress'],
            log_every=args['log_every'],
            scheduler=scheduler)

        # Validation Set
        print(f'Validation')
        val_loss_computer = LossComputer(
            criterion,
            is_robust=args['robust'],
            dataset=dataset['val_data'],
            step_size=args['robust_step_size'],
            alpha=args['alpha'])
        run_epoch(
            epoch, model, optimizer, 
            dataset['val_loader'],
            val_loss_computer, # logger, val_csv_logger, 
            args,
            is_training=False)

        # Test set; don't print to avoid peeking
        if dataset['test_data'] is not None:
            test_loss_computer = LossComputer(
                criterion,
                is_robust=args['robust'],
                dataset=dataset['test_data'],
                step_size=args['robust_step_size'],
                alpha=args['alpha'])
            run_epoch(
                epoch, model, optimizer, 
                dataset['test_loader'],
                test_loss_computer, # None, test_csv_logger, 
                args,
                is_training=False)

        # Inspect learning rates
        if (epoch+1) % 1 == 0:
            for param_group in optimizer.param_groups:
                curr_lr = param_group['lr']
                print(f'Current lr: {curr_lr}')

        if args['scheduler'] and args['model'] != 'bert':
            if args['robust']:
                val_loss, _ = val_loss_computer.compute_robust_loss_greedy(val_loss_computer.avg_group_loss, val_loss_computer.avg_group_loss)
            else:
                val_loss = val_loss_computer.avg_actual_loss
            scheduler.step(val_loss) #scheduler step to update lr at the end of epoch

        if epoch % args['save_step'] == 0:
            if args['d'] in ['dhs']:
                torch.save(model, os.path.join(args['log_dir'], args['d'], '%d_model.pth' % epoch))
            else:
                model.module.save_pretrained(os.path.join(args['log_dir'], args['d'], '%d_model' % epoch))

        if args['save_last']:
            if args['d'] in ['dhs']:
                torch.save(model, os.path.join(args['log_dir'], args['d'], 'last_model.pth'))
            else:
                model.module.save_pretrained(os.path.join(args['log_dir'], args['d'], 'last_model'))

        if args['save_best']:
            if args['robust'] or args['reweight_groups']:
                curr_val_acc = min(val_loss_computer.avg_group_acc)
            else:
                curr_val_acc = val_loss_computer.avg_acc
            print(f'Current validation accuracy: {curr_val_acc}')
            if curr_val_acc > best_val_acc:
                best_val_acc = curr_val_acc
                if args['d'] in ['dhs']:
                    torch.save(model, os.path.join(args['log_dir'], args['d'], 'best_model'))
                else:
                    model.module.save_pretrained(os.path.join(args['log_dir'], args['d'], 'best_model'))
                print(f'Best model saved at epoch {epoch}')

        if args['automatic_adjustment']:
            gen_gap = val_loss_computer.avg_group_loss - train_loss_computer.exp_avg_loss
            adjustments = gen_gap * torch.sqrt(train_loss_computer.group_counts)
            train_loss_computer.adj = adjustments
            print('Adjustments updated\n')
            for group_idx in range(train_loss_computer.n_groups):
              pass
        #         logger.write(
        #             f'  {train_loss_computer.get_group_name(group_idx)}:\t'
        #             f'adj = {train_loss_computer.adj[group_idx]:.3f}\n')
        # logger.write('\n')
        # wandb.finish()
    
    return {'train_loss': train_loss_computer,
            'val_loss':   val_loss_computer,
            'test_loss':  test_loss_computer}
        

In [4]:
# Args
args = {'s':'confounder',
        't':None,
        'c':None,
        'resume':False,
        'minority_fraction':None,
        'imbalance_ratio':None,
        'fraction':1.0,
        'root_dir':None,
        'reweight_groups':False,
        'augment_data':False,
        'val_fraction':0.1,
        'alpha':0.2,
        'generalization_adjustment':"0.0",
        'automatic_adjustment':False, 
        'robust_step_size':0.01,
        'use_normalized_loss':False,
        'btl':False,
        'hinge':False,
        'train_from_scratch':False,
        'scheduler':True,
        'weight_decay':0.001,
        'gamma':0.1,
        'minimum_variational_weight':0,
        'seed':0,
        'show_progress':False,
        'log_dir':'logs/',
        'log_every':50,
        'save_step':10,
        'save_best':False,
        'save_last':False,
        'clip':0.05}

## Dataset

### DHS Poverty Prediction

In [None]:
def classify_num(x):
  if x<-100000:
    return "High Poverty"
  elif x<0:
    return "Moderate Poverty"
  elif x<100000:
    return "Moderate Wealth"
  else:
    return "High Wealth"

In [None]:
# Read data
dhs = pd.read_csv("/content/drive/MyDrive/S&DS 632/group_DRO-master/dhs.csv")
dhs['Wealth_Class'] = dhs['Wealth Index'].apply(classify_num)

# Get X, y, g
dhs_X = dhs.drop(['Cluster number','DHSCLUST','Wealth Index','Wealth_Class'], axis=1)
dhs_y = dhs['Wealth Index']
dhs_g = dhs['Wealth_Class']

# Impute missing training values
imputer = KNNImputer(n_neighbors=4)
dhs_X = imputer.fit_transform(dhs_X)

In [None]:
dro_dataset  = DRODataset(dhs_X, dhs_y, dhs_g)
dro_dataload = DataLoader(dro_dataset, batch_size=args['batch_size'], shuffle=True)

# Generate data
data = {}
data['train_loader'] = dro_dataload # train_loader
data['val_loader']   = dro_dataload # val_loader
data['test_loader']  = dro_dataload # test_loader
data['train_data']   = dro_dataset  # train_data
data['val_data']     = dro_dataset  # val_data
data['test_data']    = dro_dataset  # test_data

### LIAR

In [5]:
args['d'] = 'liar'

In [6]:
# Get data and splits
dataset = load_dataset('liar')
train_data, test_data, val_data = dataset['train'], dataset['test'], dataset['validation']

# Tokenize
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_dict(d):
    return tokenizer(d['statement'], truncation=True, padding='max_length')

train_data = train_data.map(encode_dict, batched=True)
test_data  = test_data.map(encode_dict, batched=True)
val_data   = val_data.map(encode_dict, batched=True)

# Rename 'label' key to 'labels'
train_data = train_data.map(lambda examples: {'labels': examples['label']}, batched=True)
test_data  = test_data.map(lambda examples: {'labels': examples['label']}, batched=True)
val_data   = val_data.map(lambda examples: {'labels': examples['label']}, batched=True)

# Format the dataset
def format_LIAR_dataset(dataset):
    g = np.array([int(d['label']) for d in dataset])
    dataset.n_groups = len(set(g))
    dataset._group_array = torch.LongTensor(g)
    dataset._group_counts = (torch.arange(dataset.n_groups).unsqueeze(1)==dataset._group_array).sum(1).float()
    dataset.group_counts = dataset._group_counts
    return dataset

train_data = format_LIAR_dataset(train_data)
test_data  = format_LIAR_dataset(test_data)
val_data   = format_LIAR_dataset(val_data)

Using custom data configuration default
Reusing dataset liar (/home/lily/lyf6/.cache/huggingface/datasets/liar/default/1.0.0/479463e757b7991eed50ffa7504d7788d6218631a484442e2098dabbf3b44514)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/lily/lyf6/.cache/huggingface/datasets/liar/default/1.0.0/479463e757b7991eed50ffa7504d7788d6218631a484442e2098dabbf3b44514/cache-be82b1962002fc97.arrow
Loading cached processed dataset at /home/lily/lyf6/.cache/huggingface/datasets/liar/default/1.0.0/479463e757b7991eed50ffa7504d7788d6218631a484442e2098dabbf3b44514/cache-eceadc3eed73f948.arrow
Loading cached processed dataset at /home/lily/lyf6/.cache/huggingface/datasets/liar/default/1.0.0/479463e757b7991eed50ffa7504d7788d6218631a484442e2098dabbf3b44514/cache-3a111b8a2d8cb711.arrow
Loading cached processed dataset at /home/lily/lyf6/.cache/huggingface/datasets/liar/default/1.0.0/479463e757b7991eed50ffa7504d7788d6218631a484442e2098dabbf3b44514/cache-eb8559b88a236780.arrow
Loading cached processed dataset at /home/lily/lyf6/.cache/huggingface/datasets/liar/default/1.0.0/479463e757b7991eed50ffa7504d7788d6218631a484442e2098dabbf3b44514/cache-80adcbda198326de.arrow
Loading cached processed dataset at

In [7]:
args['batch_size'] = 16

In [8]:
# Fix formatting for dataloader
weights = (len(train_data)/train_data._group_counts)[train_data._group_array]
sampler = WeightedRandomSampler(weights, len(train_data), replacement=True)
train_data.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
train_dataloader = torch.utils.data.DataLoader(train_data, 
                                               batch_size=args['batch_size'],
                                               shuffle=False,
                                               sampler=sampler)

test_data.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=args['batch_size'])

val_data.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=args['batch_size'])

# Generate data
data = {}
data['train_data']   = train_data  # train_data
data['val_data']     = val_data    # val_data
data['test_data']    = test_data   # test_data
data['train_loader'] = train_dataloader # train_loader
data['val_loader']   = val_dataloader   # val_loader
data['test_loader']  = test_dataloader  # test_loader

## Model and Criterion

### MLP Regression Model

In [None]:
class MLP_Regression(torch.nn.Module):
  def __init__(self, num_variables):
    super(MLP_Regression, self).__init__()
    self.linear1     = torch.nn.Linear(num_variables, 128)
    self.dropout1    = torch.nn.Dropout(p=0.5)
    self.activation1 = torch.nn.ReLU()
    self.linear2     = torch.nn.Linear(128, 64)
    self.dropout2    = torch.nn.Dropout(p=0.5)
    self.activation2 = torch.nn.ReLU()
    self.linear3     = torch.nn.Linear(64, 32)
    self.dropout3    = torch.nn.Dropout(p=0.5)
    # self.activation3 = torch.nn.ReLU()
    self.linear4     = torch.nn.Linear(32, 1)

  def forward(self, x):
    x = self.activation1(self.dropout1(self.linear1(x)))
    x = self.activation2(self.dropout2(self.linear2(x)))
    # x = self.activation3(self.dropout3(self.linear3(x)))
    x = self.dropout3(self.linear3(x))
    return self.linear4(x).view(-1,1)

In [None]:
# Model
model = MLP_Regression(156)

# Criterion
criterion = torch.nn.MSELoss(reduction='none')

### BERT

In [9]:
from transformers import BertConfig, BertForSequenceClassification

In [10]:
config_class = BertConfig
model_class = BertForSequenceClassification

config = config_class.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
    finetuning_task='liar')
model = model_class.from_pretrained(
    'bert-base-uncased',# '/content/drive/MyDrive/S&DS 632/logs/last_model',
    from_tf=False,
    config=config)

model = torch.nn.DataParallel(model, device_ids=[0,1])
model.to(f'cuda:{model.device_ids[0]}')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [11]:
def hinge_loss(yhat, y):
  # The torch loss takes in three arguments so we need to split yhat
  # It also expects classes in {+1.0, -1.0} whereas by default we give them in {0, 1}
  # Furthermore, if y = 1 it expects the first input to be higher instead of the second,
  # so we need to swap yhat[:, 0] and yhat[:, 1]...
  torch_loss = torch.nn.MarginRankingLoss(margin=1.0, reduction='none')
  y = (y.float() * 2.0) - 1.0
  return torch_loss(yhat[:, 1], yhat[:, 0], y)
# criterion = hinge_loss

criterion = torch.nn.CrossEntropyLoss(reduction='none')

## Training

In [12]:
args['model'] = 'bert'
args['n_epochs'] = 3
args['lr'] = 1e-5

args['robust'] = True

In [None]:
loss_dict = train(model, criterion, data, args, epoch_offset=0)

In [16]:
testloss = loss_dict['test_loss']


In [17]:
testloss.avg_acc

tensor(2.0213, device='cuda:0')

In [18]:
testloss.avg_group_acc

tensor([1.7442, 1.3884, 1.7484, 2.4439, 2.2785, 3.7817], device='cuda:0')

## Plotting

In [None]:
import seaborn as sns
sns.scatterplot(model(torch.Tensor(dhs_X).to(device)).detach().cpu().numpy().reshape(-1),
                dhs_y)

In [None]:
from sklearn.metrics import r2_score
from sklearn.feature_selection import r_regression
from sklearn.decomposition import PCA

In [None]:
print(np.sqrt(np.mean((model(torch.Tensor(dhs_X).to(device)).detach().cpu().numpy().reshape(-1)-dhs_y)**2)))
print(r2_score(model(torch.Tensor(dhs_X).to(device)).detach().cpu().numpy().reshape(-1), dhs_y))

In [None]:
pca = PCA(n_components=3)
pca.fit(dhs_X)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

[0.76696825 0.20481146 0.00755109]
[4192443.87793695 2166485.01359786  415990.37574294]


In [None]:
dhs_X_ = pca.transform(dhs_X)

In [None]:
r_regression(dhs_X_, dhs_y)

array([ 0.46859513, -0.06751622, -0.26220503])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dhs_X, dhs_y, test_size=0.2, random_state=42)
reg = LinearRegression().fit(X_train, y_train)
print(reg.score(X_train, y_train))
print(np.sqrt(np.mean((reg.predict(X_test)-y_test)**2)))
print(r2_score(reg.predict(X_test),y_test))

0.8442211421166622
40964.583554132725
0.6256822564199208


In [None]:
0.28375245731738574
59620.88131970075
-1.203185418697995

# Scratch

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(dhs_X, dhs_y, 
#                                                     test_size=0.2, 
#                                                     random_state=42)

# regr = AdaBoostRegressor(random_state=0, n_estimators=200)
# scores = cross_val_score(regr, dhs_X, dhs_y, cv=5)
# print(scores)

# regr.fit(dhs_X, dhs_y)

# print(r2_score(dhs_y, regr.predict(dhs_X)))
# sns.scatterplot(regr.predict(X_test),
#                 y_test)