In [1]:
from transformers import LlamaTokenizer, LlamaForSequenceClassification, LlamaConfig

In [2]:
import warnings
import logging
import os
import torch
warnings.simplefilter('ignore')
logging.disable(logging.ERROR)
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['WANDB_PROJECT'] = 'kaggle-llm-2023-llama-2-7b-test-train'
os.environ['CUDA_VISIBLE_DEVICE'] = "0,1"

In [3]:
config_dir = '/root/autodl-tmp/llama2'
tokenizer_dir = '/root/autodl-tmp/llama2'
model_dir = '/root/autodl-tmp/llama2'
config = LlamaConfig.from_pretrained(config_dir)
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir)

## set up the pad token
tokenizer.pad_token = '[PAD]'
tokenizer.padding_side = "left"
config.pad_token_id = tokenizer.pad_token_id

#set up the numnber of labels
config.num_labels = 2

In [4]:
model = LlamaForSequenceClassification.from_pretrained(model_dir, config = config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def tokenize(text, tokenizer, config):
    sep = tokenizer.sep_token
    tokenized = tokenizer(
        text,
        padding = True,
        truncation = True,
        max_length = 256
    )
    return {
        ** tokenized
    }

## Read in the training data

In [6]:
import pandas as pd
test = pd.read_csv('/root/autodl-tmp/llm-2023/test_essays.csv')
sub = pd.read_csv('/root/autodl-tmp/llm-2023/sample_submission.csv')
org_train = pd.read_csv('/root/autodl-tmp/llm-2023/train_essays.csv')

train = pd.read_csv('/root/autodl-tmp/llm-2023/train_v2_drcat_02.csv', sep = ',')
display(train.shape)

(44868, 5)

In [7]:
train_gen_data = pd.read_parquet('/root/autodl-tmp/llm-2023/gen_data_21122023.parquet')
display(train.shape)

(44868, 5)

## Define the device

In [8]:
device = 'cuda'

## The util functions

In [9]:
def tokenize_train(df, tokenizer):
    tokenized = tokenizer(
        df['text'],
        padding = True,
        truncation = True,
        max_length = 256
    )
    
    labels = df['label']
    return {
        **tokenized,
        'labels': labels
    }

## Split the training data using groupkfold

In [10]:
from sklearn.model_selection import GroupKFold

group_kfold = GroupKFold(n_splits= 5)
groups = train.prompt_name
group_kfold.get_n_splits(train['text'], train['label'], groups)

5

In [11]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(
    tokenizer = tokenizer,
    pad_to_multiple_of=16,
)

In [12]:
config.update({
    'hidden_dropout_prob': 0,
    'attention_probs_dropout_prob': 0,
    'num_labels': 2,
    'problem_type': 'single_label_classification',
    'max_position_embeddings': 256,
    'num_proc': 4
})

## Loss func

In [13]:
from sklearn import metrics
def compute_auc(eval_pred):
    preds, labels = eval_pred;
    fpr, tpr, thresholds = metrics.roc_curve(labels, preds, pos_label = 1)
    auc = metrics.auc(fpr, tpr);
    return {
        'auc': auc
    }

## Define the Dataset Class

In [14]:
import torch
class TrainDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df;
        self.tokenizer = tokenizer;
        self.max_len = config.max_position_embeddings
    def __len__(self):
        return len(self.df);
    
    def __getitem__(self, index):
        data_row = self.df.iloc[index, :]
        labels = data_row['label']
        
        inputs = self.tokenizer(
            data_row['text'],
            padding = 'max_length',
            truncation = True,
            max_length = self.max_len
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        
        return {
            'input_ids': torch.tensor(ids, dtype = torch.long),
            'attention_mask': torch.tensor(mask, dtype = torch.long),
            'labels': labels
        }
    

In [15]:
train.head(3)

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False


## Test training function coding

In [16]:
test_train = train.iloc[:100, :].reset_index(drop= True)
test_val = train.iloc[100:200, :].reset_index(drop = True)



In [17]:
from torch.utils.data import DataLoader

train_dataset = TrainDataset(test_train, tokenizer)
valid_dataset = TrainDataset(test_val, tokenizer)


train_loader = DataLoader(
    train_dataset,
    batch_size = 1,
    shuffle = True,
    num_workers = 4,
    pin_memory = True,
    drop_last = True
)

val_loader = DataLoader(
    valid_dataset,
    batch_size = 1,
    shuffle= False,
    num_workers = 4,
    pin_memory = True,
    drop_last = True
)   

In [18]:
def get_optimizer_params(model, lr, weight_decay = 0.0):
    param_optimizer = list(model.named_parameters());
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'lr': lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': lr, 'weight_decay': 0.0},
    ]
    
    return optimizer_parameters


In [19]:
import torch.nn as nn

model = model.to(device)
model = nn.DataParallel(model, device_ids=[0, 1])

In [20]:
lr = 1e-5
optimizer_params = get_optimizer_params(model, lr)

In [21]:
for step, batch in enumerate(train_loader):
    for key,value in batch.items():
        batch[key] = value.to('cuda:1')
    batch_size = batch['labels'].size(0)
    break;

In [22]:
torch.cuda.device_count()

2

In [23]:
res = model(**batch)

## Training function

In [None]:
def train_fn(train_loader, model, optimizer, epoch, scheduler, device,
             valid_loader, start_time, best_sore):
    model.train()
    losses = []
    for step, batch in enumerate(train_loader):
        for key, batch in batch.items():
            batch[key] = value.to(device)
        batch_size = batch['labels'].size(0)
    loss = model(**batch).loss
    losses.

## Test train-val split

In [13]:
from datasets import Dataset, disable_progress_bar
train_ds = Dataset.from_pandas(train.iloc[:30000, :])
val_ds = Dataset.from_pandas(train.iloc[30000:, :])

In [14]:
train_ds = train_ds.map(
    tokenize_train,
    batched = False,
    num_proc = config.num_proc,
    fn_kwargs = {'tokenizer': tokenizer}
)

val_ds = val_ds.map(
    tokenize_train,
    batched = False,
    num_proc = config.num_proc,
    fn_kwargs = {'tokenizer': tokenizer}
)

Map (num_proc=4):   0%|          | 0/30000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/14868 [00:00<?, ? examples/s]

In [15]:
data_collator = DataCollatorWithPadding(
    tokenizer = tokenizer,
    pad_to_multiple_of=16
)

In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=6e-6,
    weight_decay=1e-4,
    warmup_ratio=0,
    lr_scheduler_type='cosine',
    optim='adamw_torch',
    logging_dir='./logs',
    logging_strategy='steps',
    logging_steps=150,
    evaluation_strategy='steps',
    eval_steps=150,
    save_strategy='epoch',
    save_total_limit=1,
    save_steps=150,
    report_to='wandb',
    run_name='test-llm-2023-llama2-run',
    metric_for_best_model='auc'
)

In [19]:
from transformers import Trainer

os.environ['CUDA_VISIBLE_DEVICE'] = "0,1"

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_ds,
    eval_dataset = val_ds,
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_auc
)

In [20]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mpeng_sun[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112955336769422, max=1.0…

OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 1346, in forward
    transformer_outputs = self.model(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 1068, in forward
    layer_outputs = decoder_layer(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 796, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 413, in forward
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 442.00 MiB (GPU 0; 31.74 GiB total capacity; 30.86 GiB already allocated; 363.12 MiB free; 30.96 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [28]:
type(val_ds)

datasets.arrow_dataset.Dataset