In [None]:
import pandas as pd


df_train = pd.read_csv("/content/train.csv", sep=',')
df_test = pd.read_csv("/content/test.csv", sep=',')

print(df_train.shape)

df_train = df_train.sample(n=3000)
print(df_train.head())
df_train[['Title', 'Description', 'Class Index']][:5]

(120000, 3)
        Class Index                                              Title  \
2579              3         Brookstone Posts a Narrower Loss (Reuters)   
443               4                              Technology as Fashion   
106218            4       Microsoft sues firms for software violations   
98174             2  NBA suspends nine players, Artest for rest of ...   
35150             2  Making seamless transition to Paralympics will...   

                                              Description  
2579    Reuters - Brookstone Inc. (BKST.O), a retailer...  
443     Analyzing the success of the iPod mini in Japa...  
106218  The world #39;s largest computer software manu...  
98174   NBA on Sunday suspended nine players for invol...  
35150   VANCOUVER (CP) - How to make a seamless transi...  


Unnamed: 0,Title,Description,Class Index
2579,Brookstone Posts a Narrower Loss (Reuters),"Reuters - Brookstone Inc. (BKST.O), a retailer...",3
443,Technology as Fashion,Analyzing the success of the iPod mini in Japa...,4
106218,Microsoft sues firms for software violations,The world #39;s largest computer software manu...,4
98174,"NBA suspends nine players, Artest for rest of ...",NBA on Sunday suspended nine players for invol...,2
35150,Making seamless transition to Paralympics will...,VANCOUVER (CP) - How to make a seamless transi...,2


In [None]:
df_train['Class Index'].unique()

array([3, 4, 2, 1])

In [None]:
df_train.loc[df_train['Class Index'] == 1, 'Class Index'] = 0
df_train.loc[df_train['Class Index'] == 2, 'Class Index'] = 1
df_train.loc[df_train['Class Index'] == 3, 'Class Index'] = 2
df_train.loc[df_train['Class Index'] == 4, 'Class Index'] = 3


df_test.loc[df_test['Class Index'] == 1, 'Class Index'] = 0
df_test.loc[df_test['Class Index'] == 2, 'Class Index'] = 1
df_test.loc[df_test['Class Index'] == 3, 'Class Index'] = 2
df_test.loc[df_test['Class Index'] == 4, 'Class Index'] = 3

In [None]:
df_train.head()

Unnamed: 0,Class Index,Title,Description
2579,2,Brookstone Posts a Narrower Loss (Reuters),"Reuters - Brookstone Inc. (BKST.O), a retailer..."
443,3,Technology as Fashion,Analyzing the success of the iPod mini in Japa...
106218,3,Microsoft sues firms for software violations,The world #39;s largest computer software manu...
98174,1,"NBA suspends nine players, Artest for rest of ...",NBA on Sunday suspended nine players for invol...
35150,1,Making seamless transition to Paralympics will...,VANCOUVER (CP) - How to make a seamless transi...


In [None]:
df_train['Class Index'].unique()
df_test['Class Index'].unique()

array([2, 3, 1, 0])

In [None]:
!pip install transformers
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

model_name = "bert-base-cased"
config = BertConfig.from_pretrained(
    model_name,
    num_labels=4,
)
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-cased",
    do_lower_case=False,
)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    config=config,
)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
trainable_layers = [model.bert.encoder.layer[-1], model.bert.pooler, model.classifier]
total_params = 0
trainable_params = 0

for p in model.parameters():
        p.requires_grad = False
        total_params += p.numel()

for layer in trainable_layers:
    for p in layer.parameters():
        p.requires_grad = True
        trainable_params += p.numel()

print(f"Total parameters count: {total_params}") # ~108M
print(f"Trainable parameters count: {trainable_params}") # ~7M

Total parameters count: 108313348
Trainable parameters count: 7681540


In [None]:
LABEL_LIST = [0,1,2,3]
MAX_SEQ_LENGHT = 128

import torch
import transformers
from torch.utils.data import TensorDataset
from transformers.data.processors.utils import InputExample
from transformers.data.processors.glue import glue_convert_examples_to_features


def _create_examples(df, set_type):
    """ Convert raw dataframe to a list of InputExample. Filter malformed examples
    """
    examples = []
    for index, row in df.iterrows():
        # if row['gold_label'] not in LABEL_LIST:
        #     continue
        # if not isinstance(row['sentence1'], str) or not isinstance(row['sentence2'], str):
        #     continue
        guid = f"{index}-{set_type}"
        examples.append(
            InputExample(guid=guid, text_a=row['Title'], text_b=row['Description'], label=row['Class Index']))
    return examples

def _df_to_features(df, set_type):
    """ Pre-process text. This method will:
    1) tokenize inputs
    2) cut or pad each sequence to MAX_SEQ_LENGHT
    3) convert tokens into ids
    
    The output will contain:
    `input_ids` - padded token ids sequence
    `attention mask` - mask indicating padded tokens
    `token_type_ids` - mask indicating the split between premise and hypothesis
    `label` - label
    """
    examples = _create_examples(df, set_type)
    
    #backward compatibility with older transformers versions
    legacy_kwards = {}
    from packaging import version
    if version.parse(transformers.__version__) < version.parse("2.9.0"):
        legacy_kwards = {
            "pad_on_left": False,
            "pad_token": tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            "pad_token_segment_id": 0,
        }
    
    return glue_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        label_list=LABEL_LIST,
        max_length=MAX_SEQ_LENGHT,
        output_mode="classification",
        **legacy_kwards,
    )

def _features_to_dataset(features):
    """ Convert features from `_df_to_features` into a single dataset
    """
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor(
        [f.attention_mask for f in features], dtype=torch.long
    )
    all_token_type_ids = torch.tensor(
        [f.token_type_ids for f in features], dtype=torch.long
    )
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(
        all_input_ids, all_attention_mask, all_token_type_ids, all_labels
    )

    return dataset

train_features = _df_to_features(df_train, "train")
test_features = _df_to_features(df_test, "test")

train_dataset = _features_to_dataset(train_features)
test_dataset = _features_to_dataset(test_features)

In [None]:
BATCH_SIZE = 32
MAX_PHYSICAL_BATCH_SIZE = 8

In [None]:
!pip install opacus
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from opacus.utils.uniform_sampler import UniformWithReplacementSampler


train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=BATCH_SIZE)

In [None]:
import torch

# Move the model to appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Set the model to train mode (HuggingFace models load in eval mode)
model = model.train()
# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, eps=1e-8)

In [None]:
EPOCHS = 3 #EPOCHS VALUE CAN BE CHANGED
LOGGING_INTERVAL = 300 # once every how many steps we run evaluation cycle and report metrics
EPSILON = 7.5
DELTA = 1 / len(train_dataloader) # Parameter for privacy accounting. Probability of not achieving privacy guarantees

In [None]:
import numpy as np
from tqdm.notebook import tqdm

def accuracy(preds, labels):
    return (preds == labels).mean()

# define evaluation cycle
def evaluate(model):    
    model.eval()

    loss_arr = []
    accuracy_arr = []
    
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':         batch[3]}

            outputs = model(**inputs)
            loss, logits = outputs[:2]
            
            preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
            labels = inputs['labels'].detach().cpu().numpy()
            
            loss_arr.append(loss.item())
            accuracy_arr.append(accuracy(preds, labels))
    
    model.train()
    return np.mean(loss_arr), np.mean(accuracy_arr)

In [None]:
from opacus import PrivacyEngine

MAX_GRAD_NORM = 0.1

privacy_engine = PrivacyEngine()

model, optimizer, train_dataloader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=train_dataloader,
    target_delta=DELTA,
    target_epsilon=EPSILON, 
    epochs=EPOCHS,
    max_grad_norm=MAX_GRAD_NORM,
)

  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "
  f"Optimal order is the {extreme} alpha. Please consider expanding the range of alphas to get a tighter privacy bound."
  z = np.log((np.exp(t) + q - 1) / q)


In [None]:
from opacus.utils.batch_memory_manager import BatchMemoryManager

for epoch in range(1, EPOCHS+1):
    losses = []

    with BatchMemoryManager(
        data_loader=train_dataloader, 
        max_physical_batch_size=MAX_PHYSICAL_BATCH_SIZE, 
        optimizer=optimizer
    ) as memory_safe_data_loader:
        for step, batch in enumerate(tqdm(memory_safe_data_loader)):
            optimizer.zero_grad()

            batch = tuple(t.to(device) for t in batch)

            inputs = {'input_ids':      batch[0],
                    'token_type_ids': batch[2],
                    'labels':         batch[3]}

            outputs = model(**inputs) # output = loss, logits, hidden_states, attentions
            
            loss = outputs[0]
            loss.backward()
            losses.append(loss.item())

            optimizer.step()
            
            if step > 0 and step % LOGGING_INTERVAL == 0:
              print("HERE 2")
              train_loss = np.mean(losses)
              print("3.o")
              eps = privacy_engine.get_epsilon(DELTA)
              print("3")
              eval_loss, eval_accuracy = evaluate(model)
              print("4")
              print(
                f"Epoch: {epoch} | "
                f"Step: {step} | "
                f"Train loss: {train_loss:.3f} | "
                f"Eval loss: {eval_loss:.3f} | "
                f"Eval accuracy: {eval_accuracy:.3f} | "
                f"ɛ: {eps:.2f}"
              )

  0%|          | 0/375 [00:00<?, ?it/s]

  self._backward_hooks[handle.id] = hook
  The hook will be called every time the gradients with respect to module


HERE 2
3.o
3
4
Epoch: 1 | Step: 300 | Train loss: 1.395 | Eval loss: 1.428 | Eval accuracy: 0.273 | ɛ: 3.82


  0%|          | 0/375 [00:00<?, ?it/s]