In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [2]:
!pip install -q transformers[sentencepiece]
!pip install -q colorama

In [3]:
# import os
# os.chdir('/content/drive/MyDrive/LLM - Detect AI Generated Text')

In [4]:
import os
import gc
import copy
import time
import random
import string

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold

# For Transformer Models
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

# For colored terminal text
from colorama import Fore, Back, Style
# Short aliases for the colorama codes; b_ and sr_ wrap the
# "improved / model saved" messages printed by run_training.
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
# NOTE(review): this silences every Python warning for the whole notebook,
# including deprecation notices from the libraries imported above.
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
# NOTE(review): presumably a debugging aid (synchronous CUDA launches) — confirm
# it is still wanted for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"

2024-01-19 20:21:14.695321: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-19 20:21:14.695397: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-19 20:21:14.695437: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-19 20:21:14.705183: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Central run configuration: every training/validation cell below reads its
# hyper-parameters from this dictionary.
CONFIG = dict(
    seed=2023,
    epochs=2,
    # Backbone checkpoint (previously tried: "microsoft/deberta-v3-base").
    model_name="distilroberta-base",
    # Batch sizes (an earlier run used a train batch size of 80).
    train_batch_size=16,
    valid_batch_size=16,
    # Token budget per essay (an earlier run used 128).
    max_length=512,
    learning_rate=5e-5,
    # Scheduler name resolved by fetch_scheduler(); T_max is counted in
    # scheduler steps, and the training loop steps the scheduler once per
    # optimizer step.
    scheduler='CosineAnnealingLR',
    min_lr=1e-6,
    T_max=500,
    weight_decay=1e-6,
    n_fold=5,
    n_accumulate=1,
    num_classes=1,
    # Margin of the pairwise ranking term in criterion().
    margin=0.5,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# The tokenizer is stored alongside the hyper-parameters so the dataset
# builders can pick it up from CONFIG.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["model_name"])
CONFIG["group"] = "Baseline"

In [6]:
def set_seed(seed=42):
    '''Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to deterministic mode, so that repeated runs
    with the same seed behave the same.

    Args:
        seed: integer seed applied to all generators.
    '''
    # The stdlib generator is imported by the notebook and must be seeded too.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (the model is wrapped in
    # DataParallel). This call is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. Python reads this at interpreter
    # start-up, so it only affects processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Seed all generators once, up front, with the seed stored in CONFIG.
set_seed(CONFIG['seed'])

In [7]:
# Parquet file holding the pair-wise training set; the training cells below
# read its `title`, `ai` and `human` columns.
PAIRWISE_TRAIN_PATH = "/home/peng_sun2/s3shared/kaggle/llm-2023/external_data/pair_wise_train_ds.parquet"

df = pd.read_parquet(PAIRWISE_TRAIN_PATH)
df.head()

Unnamed: 0,title,ai,human
0,Phones and driving,"As a grade 10 student, I strongly argue agains...",Phones\n\nModern humans today are always on th...
1,Phones and driving,While cell phones have become ubiquitous in mo...,Phones\n\nModern humans today are always on th...
2,Phones and driving,Social media platforms have taken over the wor...,Phones\n\nModern humans today are always on th...
3,Phones and driving,I think there should be stronger privacy prote...,Phones\n\nModern humans today are always on th...
4,Phones and driving,Drivers should not be able to use cell phones ...,Phones\n\nModern humans today are always on th...


In [8]:
class PairWiseDataset(Dataset):
    """Training dataset that yields one (AI essay, human essay) pair per row.

    Each item is a dict of ``torch.long`` tensors: the tokenized AI text, the
    tokenized human text, and a constant target of 1 (used by the ranking loss
    to say "the first input should score higher than the second").

    Args:
        df: frame with an ``ai`` and a ``human`` text column.
        tokenizer: callable returning ``input_ids`` and ``attention_mask``.
        max_length: length every sequence is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Keep the raw text columns as arrays for positional lookup.
        self.ai = df['ai'].values
        self.human = df['human'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        ai_encoding = self.tokenizer(
            self.ai[index],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
        )
        human_encoding = self.tokenizer(
            self.human[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        # NOTE: the 'huamn_attention_mask' spelling is part of the batch
        # schema consumed by train_one_epoch, so it must stay as is.
        return {
            'ai_input_ids': as_long(ai_encoding['input_ids']),
            'ai_attention_mask': as_long(ai_encoding['attention_mask']),
            'human_input_ids': as_long(human_encoding['input_ids']),
            'huamn_attention_mask': as_long(human_encoding['attention_mask']),
            'target': as_long(1),
        }

In [9]:
class TestDataset(Dataset):
    """Single-text dataset used for validation.

    Each item is a dict of ``torch.long`` tensors holding the tokenized text
    and its label.

    Args:
        df: frame with a ``text`` column and a ``label`` column.
        tokenizer: callable returning ``input_ids`` and ``attention_mask``.
        max_length: length every sequence is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Keep the columns as arrays for positional lookup.
        self.text = df['text'].values
        self.target = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        encoding = self.tokenizer(
            self.text[index],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
        )
        label = self.target[index]

        item = {
            name: torch.tensor(encoding[name], dtype=torch.long)
            for name in ('input_ids', 'attention_mask')
        }
        item['target'] = torch.tensor(label, dtype=torch.long)
        return item

In [10]:
class AiDectModel(nn.Module):
    """Sequence classifier that emits one logit per input text.

    Wraps ``AutoModelForSequenceClassification`` configured with a single
    label and flattens its logits to a 1-D tensor.

    Args:
        model_name: checkpoint name passed to ``from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
        # `drop` and `fc` are not used by forward() (the custom head is
        # disabled); they stay registered so the module's state_dict keeps the
        # same set of keys.
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Return the backbone's logits for ``ids``/``mask`` flattened with view(-1)."""
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        logits = backbone_out.logits
        return logits.view(-1)

In [11]:
def criterion(outputs1, outputs2, targets):
    """Combined classification + pairwise ranking loss.

    ``outputs1`` are logits for texts that should be classified as positive
    (label 1) and ``outputs2`` are logits for their negative counterparts
    (label 0). The loss is ``0.5 * (BCE(outputs1, 1) + BCE(outputs2, 0))`` plus
    a margin ranking loss with margin ``CONFIG['margin']``.

    Args:
        outputs1: 1-D tensor of logits for the positive texts.
        outputs2: 1-D tensor of logits for the negative texts (same shape).
        targets: tensor passed to ``MarginRankingLoss`` as its target
            (the training dataset supplies a constant 1).

    Returns:
        Scalar loss tensor.
    """
    # Build the BCE labels from the logits themselves so they always match the
    # actual batch size, dtype and device (the configured train batch size is
    # wrong for a short final batch or any differently-sized loader).
    pos_labels = torch.ones_like(outputs1)
    neg_labels = torch.zeros_like(outputs2)

    bce = nn.BCEWithLogitsLoss()
    ce_loss = bce(outputs1, pos_labels) + bce(outputs2, neg_labels)

    # Cast the ranking target to the logits' dtype; the loaders deliver it as long.
    rank_targets = targets.to(outputs1.dtype)
    rank_loss = nn.MarginRankingLoss(margin=CONFIG['margin'])(outputs1, outputs2, rank_targets)
    return 0.5 * ce_loss + rank_loss

In [12]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Run one training epoch over pair-wise batches.

    For every batch the model scores the AI text and the human text
    separately, and ``criterion`` combines both scores into the loss.
    Gradients are accumulated over ``CONFIG['n_accumulate']`` batches before
    each optimizer (and scheduler) step.

    Args:
        model: callable taking ``(input_ids, attention_mask)`` and returning logits.
        optimizer: optimizer updating ``model``'s parameters.
        scheduler: learning-rate scheduler stepped after every optimizer
            step, or ``None``.
        dataloader: loader yielding the dict batches produced by
            ``PairWiseDataset``.
        device: device the batch tensors are moved to.
        epoch: epoch number, only shown in the progress bar.

    Returns:
        Sample-weighted mean training loss of the epoch (0.0 if the loader
        yields no batches).
    """
    model.train()

    n_accumulate = CONFIG['n_accumulate']
    num_steps = len(dataloader)

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty loader returns 0.0 instead of raising.
    epoch_loss = 0.0

    bar = tqdm(enumerate(dataloader), total=num_steps)
    for step, data in bar:
        ai_input_ids = data['ai_input_ids'].to(device, dtype=torch.long)
        ai_attention_mask = data['ai_attention_mask'].to(device, dtype=torch.long)
        human_input_ids = data['human_input_ids'].to(device, dtype=torch.long)
        # 'huamn_attention_mask' (sic) is the key emitted by PairWiseDataset.
        huamn_attention_mask = data['huamn_attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ai_input_ids.size(0)
        ai_outputs = model(ai_input_ids, ai_attention_mask)
        human_outputs = model(human_input_ids, huamn_attention_mask)

        loss = criterion(ai_outputs, human_outputs, targets)
        # Only the back-propagated loss is scaled for accumulation; the
        # reported loss below stays the true per-batch value.
        (loss / n_accumulate).backward()

        # Also step on the last batch so a partial accumulation window is
        # applied instead of leaking stale gradients into the next epoch.
        is_last_step = (step + 1) == num_steps
        if (step + 1) % n_accumulate == 0 or is_last_step:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

In [13]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Score the validation set and return its ROC-AUC.

    Args:
        model: callable taking ``(input_ids, attention_mask)`` and returning logits.
        dataloader: loader yielding the dict batches produced by ``TestDataset``.
        device: device the input tensors are moved to.
        epoch: unused; kept so the signature mirrors ``train_one_epoch``.

    Returns:
        ROC-AUC of the sigmoid-activated logits against the batch targets.
    """
    model.eval()

    preds = []
    gts = []
    for data in tqdm(dataloader, total=len(dataloader)):
        input_ids = data['input_ids'].to(device, dtype=torch.long)
        attention_mask = data['attention_mask'].to(device, dtype=torch.long)

        outputs = model(input_ids, attention_mask)
        preds.append(outputs.sigmoid().cpu().numpy())
        # Targets are never moved to the device; they are only needed for the metric.
        gts.append(data['target'].numpy())

    preds = np.concatenate(preds)
    gts = np.concatenate(gts)

    auc_score = roc_auc_score(gts, preds)
    gc.collect()

    return auc_score

In [14]:
def run_training(model, optimizer, scheduler, device, num_epochs, fold):
    """Train for ``num_epochs`` epochs, keeping the weights with the best validation AUC.

    The data comes from the module-level ``train_loader`` and ``valid_loader``
    (they are not parameters), so both must be defined before calling this.
    Whenever the validation score improves, the model's state dict is written
    to a fold-specific checkpoint file.

    Args:
        model: model to train (modified in place).
        optimizer: optimizer for ``model``.
        scheduler: learning-rate scheduler or ``None``.
        device: device used for the training and validation batches.
        num_epochs: number of epochs to run.
        fold: fold index, used in the checkpoint file name.

    Returns:
        ``(model, history)`` where ``model`` carries the best weights and
        ``history`` maps ``'Train Loss'`` / ``'Valid score'`` to per-epoch lists.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_score = -1
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # The real epoch loss is recorded (a debug leftover used to overwrite
        # it with the constant 1).
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)
        val_epoch_score = valid_one_epoch(model, valid_loader, device=device,
                                          epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid score'].append(val_epoch_score)

        # deep copy the model
        # The tracked metric is the validation AUC (higher is better).
        if val_epoch_score > best_epoch_score:
            print(f"{b_}Validation AUC Improved ({best_epoch_score} ---> {val_epoch_score})")
            best_epoch_score = val_epoch_score
            best_model_wts = copy.deepcopy(model.state_dict())
            # NOTE(review): the state dict is saved from `model` exactly as passed
            # in; if it is wrapped (e.g. DataParallel) the keys carry the wrapper's
            # prefix — confirm the inference code loads it the same way.
            PATH = f"/home/peng_sun2/s3shared/kaggle/llm-2023/model/distilroberta/Loss-Fold-{fold}.bin"
            torch.save(model.state_dict(), PATH)
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Score: {:.4f}".format(best_epoch_score))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [15]:
def prepare_loaders(df_train, df_valid):
    """Build the pair-wise training loader and the single-text validation loader.

    The validation frame is flattened: its unique ``ai`` texts become label-1
    rows and its unique ``human`` texts become label-0 rows.

    Args:
        df_train: frame with ``ai`` and ``human`` columns used for training.
        df_valid: frame with ``ai`` and ``human`` columns used for validation.

    Returns:
        ``(train_loader, valid_loader)``.
    """
    tokenizer = CONFIG['tokenizer']
    max_length = CONFIG['max_length']

    # Flatten the validation pairs into (text, label) rows.
    ai_texts = df_valid.ai.unique().tolist()
    human_texts = df_valid.human.unique().tolist()
    valid_pos = pd.DataFrame({'text': ai_texts})
    valid_pos['label'] = 1
    valid_neg = pd.DataFrame({'text': human_texts})
    valid_neg['label'] = 0
    valid_flat = pd.concat([valid_pos, valid_neg])

    # Training batches are shuffled and the incomplete last batch is dropped.
    train_loader = DataLoader(
        PairWiseDataset(df_train, tokenizer=tokenizer, max_length=max_length),
        batch_size=CONFIG['train_batch_size'],
        num_workers=2,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
    )

    # Validation keeps every row, in order.
    valid_loader = DataLoader(
        TestDataset(valid_flat, tokenizer=tokenizer, max_length=max_length),
        batch_size=CONFIG['valid_batch_size'],
        num_workers=2,
        shuffle=False,
        pin_memory=True,
    )

    return train_loader, valid_loader

In [16]:
def fetch_scheduler(optimizer):
    """Create the learning-rate scheduler selected by ``CONFIG['scheduler']``.

    Supported values:
        * ``'CosineAnnealingLR'`` — uses ``CONFIG['T_max']`` and ``CONFIG['min_lr']``.
        * ``'CosineAnnealingWarmRestarts'`` — uses ``CONFIG['T_0']`` and
          ``CONFIG['min_lr']``.
        * ``None`` — no scheduler.

    Args:
        optimizer: optimizer the scheduler is attached to.

    Returns:
        The scheduler instance, or ``None`` when scheduling is disabled.

    Raises:
        ValueError: if ``CONFIG['scheduler']`` is not one of the supported values.
    """
    name = CONFIG['scheduler']

    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])
    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])

    # Fail with a clear message instead of an UnboundLocalError further down.
    raise ValueError(f"Unsupported scheduler: {name!r}")

<span style="color: #000508; font-family: Segoe UI; font-size: 1.5em; font-weight: 300;">Start Training</span>

In [17]:
from sklearn.model_selection import GroupKFold

# Group by essay title so that no title appears in both the training and the
# validation split of a fold.
gkf = GroupKFold(CONFIG['n_fold'])

for fold, (train_idx, val_idx) in enumerate(gkf.split(df, groups=df['title'])):
    # split() yields positional indices, so select rows by position (iloc)
    # rather than by index label.
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)

    # Create Dataloader (run_training reads these two module-level names)
    train_loader, valid_loader = prepare_loaders(train_df, val_df)

    model = AiDectModel(CONFIG['model_name'])

    # Wrap for multi-GPU training, then move to the configured device instead
    # of a hard-coded 'cuda' so the cell also runs where no GPU is available.
    model = torch.nn.DataParallel(model)
    model = model.to(CONFIG['device'])

    # Define Optimizer and Scheduler
    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
    scheduler = fetch_scheduler(optimizer)

    model, history = run_training(model, optimizer, scheduler,
                                  device=CONFIG['device'],
                                  num_epochs=CONFIG['epochs'],
                                  fold=fold)

    # Drop every reference to this fold's model (the optimizer and scheduler
    # hold its parameters too) before the next fold is built.
    del model, history, train_loader, valid_loader, optimizer, scheduler
    _ = gc.collect()
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[INFO] Using GPU: Tesla V100-SXM2-16GB



100%|██████████| 13664/13664 [1:59:20<00:00,  1.91it/s, Epoch=1, LR=1.29e-5, Train_Loss=0.00136]
100%|██████████| 347/347 [00:29<00:00, 11.95it/s]


[34mValidation Loss Improved (-1 ---> 0.999800290486565)
Model Saved[0m



100%|██████████| 13664/13664 [1:59:05<00:00,  1.91it/s, Epoch=2, LR=1.4e-5, Train_Loss=3.17e-8] 
100%|██████████| 347/347 [00:29<00:00, 11.57it/s]



Training complete in 3h 59m 37s
Best Score: 0.9998



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[INFO] Using GPU: Tesla V100-SXM2-16GB



100%|██████████| 13548/13548 [1:58:22<00:00,  1.91it/s, Epoch=1, LR=2.11e-6, Train_Loss=0.00196] 
100%|██████████| 358/358 [00:30<00:00, 11.90it/s]


[34mValidation Loss Improved (-1 ---> 0.9966397377356282)
Model Saved[0m



100%|██████████| 13548/13548 [1:57:11<00:00,  1.93it/s, Epoch=2, LR=4.57e-5, Train_Loss=8.17e-8]
100%|██████████| 358/358 [00:29<00:00, 12.15it/s]


[34mValidation Loss Improved (0.9966397377356282 ---> 0.9966631541974007)
Model Saved[0m

Training complete in 3h 56m 55s
Best Score: 0.9967



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[INFO] Using GPU: Tesla V100-SXM2-16GB



 85%|████████▌ | 11739/13738 [1:41:18<17:08,  1.94it/s, Epoch=1, LR=2.38e-5, Train_Loss=0.00219]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 13738/13738 [1:59:04<00:00,  1.92it/s, Epoch=2, LR=1.28e-6, Train_Loss=8.68e-8]
100%|██████████| 339/339 [00:28<00:00, 11.92it/s]



Training complete in 3h 58m 46s
Best Score: 0.9993



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[INFO] Using GPU: Tesla V100-SXM2-16GB



100%|██████████| 13701/13701 [1:58:28<00:00,  1.93it/s, Epoch=1, LR=1.81e-5, Train_Loss=0.00267]
100%|██████████| 343/343 [00:28<00:00, 12.19it/s]


[34mValidation Loss Improved (-1 ---> 0.9966874655900165)
Model Saved[0m



 17%|█▋        | 2272/13701 [19:39<1:38:55,  1.93it/s, Epoch=2, LR=4.96e-5, Train_Loss=0.0022] 


KeyboardInterrupt: 