In [1]:
from transformers import LlamaTokenizer, LlamaForSequenceClassification, LlamaConfig

In [2]:
import warnings
import logging
import os
warnings.simplefilter('ignore')
logging.disable(logging.ERROR)
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['WANDB_PROJECT'] = 'kaggle-llm-2023-llama-2-7b-test-train'

In [3]:
config_dir = '/root/autodl-tmp/llama2'
tokenizer_dir = '/root/autodl-tmp/llama2'
model_dir = '/root/autodl-tmp/llama2'
config = LlamaConfig.from_pretrained(config_dir)
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir)

## set up the pad token
tokenizer.pad_token = '[PAD]'
tokenizer.padding_side = "left"
config.pad_token_id = tokenizer.pad_token_id

#set up the numnber of labels
config.num_labels = 2

In [4]:
model = LlamaForSequenceClassification.from_pretrained(model_dir, config = config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# def tokenize(text, tokenizer, config):
#     sep = tokenizer.sep_token
#     tokenized = tokenizer(
#         text,
#         padding = False,
#         truncation = True,
#         max_length = 1600
#     )
#     return {
#         ** tokenized
#     }

## Read in the training data

In [6]:
import pandas as pd
test = pd.read_csv('/root/autodl-tmp/llm-2023/test_essays.csv')
sub = pd.read_csv('/root/autodl-tmp/llm-2023/sample_submission.csv')
org_train = pd.read_csv('/root/autodl-tmp/llm-2023/train_essays.csv')

train = pd.read_csv('/root/autodl-tmp/llm-2023/train_v2_drcat_02.csv', sep = ',')
display(train.shape)

(44868, 5)

In [7]:
train_gen_data = pd.read_parquet('/root/autodl-tmp/llm-2023/gen_data_21122023.parquet')
display(train.shape)

(44868, 5)

## The util functions

In [8]:
def tokenize_train(df, tokenizer):
    tokenized = tokenizer(
        df['text'],
        padding = True,
        truncation = True,
        max_length = 1600
    )
    
    labels = df['class']
    return {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': labels
    }

## Split the training data using groupkfold

In [9]:
from sklearn.model_selection import GroupKFold

group_kfold = GroupKFold(n_splits= 5)
groups = train.prompt_name
group_kfold.get_n_splits(train['text'], train['label'], groups)

5

In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(
    tokenizer = tokenizer,
    pad_to_multiple_of=16,
)

In [11]:
config.update({
    'hidden_dropout_prob': 0,
    'attention_probs_dropout_prob': 0,
    'num_labels': 2,
    'problem_type': 'single_label_classification',
    'max_position_embeddings': 1600,
    'num_proc': 4
})

## Loss func

In [12]:
from sklearn import metrics
def compute_auc(eval_pred):
    preds, labels = eval_pred;
    fpr, tpr, thresholds = metrics.roc_curve(labels, preds, pos_label = 1)
    auc = metrics.auc(fpr, tpr);
    return {
        'auc': auc
    }

## Test train-val split

In [13]:
train = train.rename({'label': 'class'}, axis =1)

In [14]:
from datasets import Dataset, disable_progress_bar
col_list = ['text', 'class']
train_ds = Dataset.from_pandas(train.loc[:30000, col_list])
val_ds = Dataset.from_pandas(train.loc[30000:, col_list])

In [15]:
train_ds

Dataset({
    features: ['text', 'class'],
    num_rows: 30001
})

In [16]:
train_ds = train_ds.map(
    tokenize_train,
    batched = False,
    num_proc = config.num_proc,
    fn_kwargs = {'tokenizer': tokenizer}
)

val_ds = val_ds.map(
    tokenize_train,
    batched = False,
    num_proc = config.num_proc,
    fn_kwargs = {'tokenizer': tokenizer}
)

Map (num_proc=4):   0%|          | 0/30001 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/14868 [00:00<?, ? examples/s]

In [17]:
type(train_ds)

datasets.arrow_dataset.Dataset

In [18]:
data_collator = DataCollatorWithPadding(
    tokenizer = tokenizer,
    pad_to_multiple_of=16
)

In [31]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=6e-6,
    weight_decay=1e-4,
    warmup_ratio=0,
    lr_scheduler_type='cosine',
    optim='adamw_torch',
    logging_dir='./logs',
    logging_strategy='steps',
    logging_steps=150,
    evaluation_strategy='steps',
    eval_steps=150,
    save_strategy='steps',
    save_total_limit=1,
    save_steps=150,
    report_to='wandb',
    run_name='test-llm-2023-llama2-run',
    metric_for_best_model='auc',
    fp16= True,
    gradient_accumulation_steps = 1,
    load_best_model_at_end = True,
    greater_is_better = False,
)

In [32]:
import torch
import torch.nn as nn
torch.cuda.device_count()

2

In [33]:
model = nn.DataParallel(model)

In [34]:
from transformers import Trainer

os.environ['CUDA_VISIBLE_DEVICE'] = "0,1"

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_ds,
    eval_dataset = val_ds,
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_auc
)

In [36]:
trainer.train()

IndexError: Invalid key: 28255 is out of bounds for size 0

In [28]:
type(val_ds)

datasets.arrow_dataset.Dataset