In [1]:
# imports

import os
import numpy as np
import pandas as pd

from hydra import compose, initialize
from omegaconf import OmegaConf
from sagemaker import hyperparameters
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
from pathlib import Path
from peft import LoraConfig, PrefixTuningConfig, AdaLoraConfig, LoKrConfig

from utils.utils import set_random_seed, get_root_dir
from model.data_preprocess.football_torch_dataset import FootballTorchDataset

set_random_seed()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


2024-05-28 15:30:21.054022: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# load config params

root_dir = get_root_dir()
with initialize(config_path='config'):
    cfg = compose(config_name='conf')
    print(OmegaConf.to_yaml(cfg))

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with initialize(config_path='config'):


data:
  datasets:
  - player_valuations
  bucket: deep-learning-projects
  data_folder: sagemaker/FootballGPT_Data
model:
  model_name: bigscience/bloomz-560m
  load_in_4bit: true
  quant_type: nf4
  quant_compute_dtype: float16
  use_double_quant: false
peft:
  method: lora
  r: 64
  lora_a: 16
  target_modules:
  - query_key_value
  dropout: 0.1
train:
  num_epochs: 3
  batch_size: 4
  gradient_accumulation_steps: 1
  optim: paged_adamw_32bit
  save_steps: 1000
  logging_steps: 1000
  learning_rate: 0.0002
  weight_decay: 0.001
  checkpoint_name: results



In [3]:
# set up quantization config only if LoRA variation is the specified PEFT method
if 'abc' in cfg.peft.method:
    # config quantization for QLoRA
    print('configure quantization parameters')
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=cfg.model.load_in_4bit,
        bnb_4bit_quant_type=cfg.model.quant_type,
        bnb_4bit_compute_dtype=cfg.model.quant_compute_dtype,
        bnb_4bit_use_double_quant=cfg.model.use_double_quant,
    )
    device_map = {"": 0}

else:
    bnb_config = None
    device_map = None

In [None]:
# load base model
base_model_name = cfg.model.model_name
print(f'load model {base_model_name}')
foundation_model = AutoModelForCausalLM.from_pretrained(base_model_name,
                                                        quantization_config=bnb_config,
                                                        device_map=device_map)
foundation_model.config.use_cache = False
foundation_model.config.pretraining_tp = 1

load model bigscience/bloomz-560m


In [None]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# load data from s3 bucket
data_location = f's3://{cfg.data.bucket}/{cfg.data.folder}'
if cfg.data.datasets is not None:  # read only specified datasets
    dfs = [pd.read_csv(f'{data_location}/{ds}.csv') for ds in cfg.data.datasets]
else:  # read all available datasets
    dfs = [pd.read_csv(f'{data_location}/{ds}') for ds in os.listdir(data_location)
           if Path(f'{data_location}/{ds}.csv').exists() and Path(f'{data_location}/{ds}').suffix == '.csv']

if len(dfs) > 1:
    combined_df = pd.concat([df for df in dfs], ignore_index=True)
combined_df = combined_df.dropna()  # Drop rows with missing values

# tokenize all data
inputs = tokenizer(combined_df['text'].tolist(), max_length=512, truncation=True,
                   padding='max_length', return_tensors='pt')

# transform datasets to pytorch Dataset instance
train_ds = FootballTorchDataset(inputs)

In [None]:
# config peft method and parameters
method_name = cfg.peft.method

# define peft methods and configurations
if method_name == 'lora':
    peft_config = LoraConfig(r=cfg.peft.r, lora_alpha=cfg.peft.lora_a,
                             target_modules=cfg.peft.target_modules,
                             lora_dropout=cfg.peft.dropout, bias='none')
elif method_name == 'adalora':
    peft_config = AdaLoraConfig(r=cfg.peft.r, lora_alpha=cfg.peft.lora_a,
                                target_modules=cfg.peft.target_modules,
                                lora_dropout=cfg.peft.dropout)
elif method_name == 'lokr':
    peft_config = LoKrConfig(r=cfg.peft.r, lora_alpha=cfg.peft.lora_a,
                             target_modules=cfg.peft.target_modules)
else:
    peft_config = PrefixTuningConfig(num_virtual_tokens=20, token_dim=768,
                                     num_transformer_submodules=1,
                                     num_attention_heads=12, num_layers=12,
                                     encoder_hidden_size=768)


# define peft methods and configurations
print(f'configure {cfg.peft.method} for PEFT')

In [None]:
# define training arguments
checkpoint_name = cfg.train.checkpoint_name
out_dir = os.path.join(root_dir, checkpoint_name)
Path(out_dir).mkdir(parents=True, exists_ok=True)

training_args = TrainingArguments(
    output_dir=out_dir,
    num_train_epochs=cfg.train.num_epochs,
    per_device_train_batch_size=cfg.train.batch_size,
    gradient_accumulation_steps=cfg.train.gradient_accumulation_steps,
    optim=cfg.train.optim,
    save_steps=cfg.train.save_steps,
    logging_steps=cfg.train.logging_steps,
    learning_rate=cfg.train.learning_rate,
    weight_decay=cfg.train.weight_decay,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type='constant',
    report_to="tensorboard"
)

In [None]:
# train
print('start training')
print("=" * 80)
trainer = SFTTrainer(
    model=foundation_model,
    train_dataset=train_ds,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_args,
    packing=False
)
trainer.train()
print("=" * 80)
print('save fine-tuned model')
trainer.model.save_pretrained(f'{base_model_name}-football-{cfg.peft.method}-ft')