# Model fine-tuning (Llama-2-7b-chat)
for predicting ESG risk level

Reference: https://www.datacamp.com/tutorial/fine-tuning-llama-2

In [4]:
import os
import re
import torch
import pandas as pd
import warnings
from datasets import Dataset
from datetime import datetime
from peft import LoraConfig
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer
from sklearn.metrics import accuracy_score

# Move the working directory two levels up so the relative paths used below
# (e.g. "data/processed/...") resolve.
# NOTE(review): not idempotent — re-running this cell climbs two more levels.
# Presumably the notebook lives two directories below the project root; confirm.
os.chdir("../../")

### Load and preprocess training data

In [5]:
# Read the training CSV (path is relative to the working directory set above)
# and print its (rows, columns) shape as a quick sanity check.
train_path = "data/processed/train_esg_shortened.csv"
df_train = pd.read_csv(train_path)
print(df_train.shape)

(547, 6)


In [7]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,symbol,year,quarter,transcript_esg,esg_score,esg_risk_level
0,A,2022,1,thank emily welcome everyone agilents conferen...,15.0,Low
1,A,2022,3,thank hannah welcome everyone agilents confere...,15.0,Low
2,AAPL,2022,1,good day welcome apple q fy earnings conferenc...,17.0,Low
3,AAPL,2022,2,good day welcome apple q fy earnings conferenc...,17.0,Low
4,AAPL,2022,3,good day welcome apple q fy earnings conferenc...,17.0,Low


In [8]:
# List the distinct ESG risk-level labels that occur in the training data.
unique_labels = df_train['esg_risk_level'].unique()
print(unique_labels)

['Low' 'Medium' 'Severe' 'High' 'Negligible']


In [9]:
def combine_question_answer(transcript_esg, esg_risk_level):
    """Build one complete training example in the Llama-2 chat layout.

    The result is ``<s>[INST] <<SYS>>...<</SYS>>`` followed by the transcript,
    the ``###Risk Level:`` cue, ``[/INST]`` and the expected answer wrapped in
    backticks, terminated by ``</s>``.

    Args:
        transcript_esg: Transcript text inserted after ``###Transcript: ``.
        esg_risk_level: Target label inserted between backticks as the answer.

    Returns:
        The full prompt-plus-answer string.
    """
    # NOTE(review): the sentences below are joined without separating spaces and
    # contain typos ("an rating agency", "the a company's"). They are kept
    # byte-identical because get_question_prompt must produce the same prefix.
    system_parts = (
        "<<SYS>>\n",
        "You are an rating agency. Your task is to predict the a company's ESG Risk Level from a meeting transcript.",
        "You should evaluate the company's performance on Environmental, Social and Governance issues.",
        "The possible Risk Levels, from low to high, are `Negligible`, `Low`, `Medium`, `High`, `Severe`.\n",
        "<</SYS>>\n\n",
    )
    instruction = "".join(system_parts)
    question = f"###Transcript: {transcript_esg}###Risk Level: "
    answer = f"`{esg_risk_level}`"
    return f"<s>[INST] {instruction}{question}[/INST] {answer}</s>"

# Build the full training text (prompt + expected answer) for every row and
# store it in a new "text" column of df_train.
df_train["text"] = df_train.apply(
    lambda row: combine_question_answer(row.transcript_esg, row.esg_risk_level), axis=1
)
# Show the source columns next to the generated text for the first rows.
df_train[["transcript_esg", "esg_risk_level", "text"]].head()

Unnamed: 0,transcript_esg,esg_risk_level,text
0,thank emily welcome everyone agilents conferen...,Low,<s>[INST] <<SYS>>\nYou are an rating agency. Y...
1,thank hannah welcome everyone agilents confere...,Low,<s>[INST] <<SYS>>\nYou are an rating agency. Y...
2,good day welcome apple q fy earnings conferenc...,Low,<s>[INST] <<SYS>>\nYou are an rating agency. Y...
3,good day welcome apple q fy earnings conferenc...,Low,<s>[INST] <<SYS>>\nYou are an rating agency. Y...
4,good day welcome apple q fy earnings conferenc...,Low,<s>[INST] <<SYS>>\nYou are an rating agency. Y...


In [10]:
# Display the complete training text of the row with index label 0.
df_train.text[0]

"<s>[INST] <<SYS>>\nYou are an rating agency. Your task is to predict the a company's ESG Risk Level from a meeting transcript.You should evaluate the company's performance on Environmental, Social and Governance issues.The possible Risk Levels, from low to high, are `Negligible`, `Low`, `Medium`, `High`, `Severe`.\n<</SYS>>\n\n###Transcript: thank emily welcome everyone agilents conference call first quarter fiscal year mike mcmullen agilent president ceo bob mcmahon agilent senior vice president cfo joining qa mike bob comment jacob thaysen president agilent life science applied market group sam raha president agilent diagnostics genomics group padraig mcdonnell president agilent crosslab group presentation webcast live change impact company consolidated financial statement also make forwardlooking statement financial performance company statement subject risk uncertainty valid today company assumes obligation update thanks parmeet thanks everyone joining call today momentum continue

### Load tokenizer and base model

In [13]:
# Identifier of the base checkpoint; passed to from_pretrained for both the
# tokenizer and the model below.
base_model = "meta-llama/Llama-2-7b-chat-hf"

In [14]:
# Load the tokenizer belonging to the base checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to run locally; presumably unnecessary for this model — confirm
# and drop if so.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [15]:
def get_seq_len(text):
    """Return how many tokens the module-level ``tokenizer`` splits ``text`` into."""
    tokens = tokenizer.tokenize(text)
    return len(tokens)

# Token count of the prompt template alone (empty transcript and empty label),
# i.e. the fixed overhead every example carries.
get_seq_len(combine_question_answer("", ""))

114

In [16]:
# Keep only examples whose full training text is shorter than 1024 tokens (the
# max_seq_length given to the trainer below), then shuffle and split 80/20 into
# train and eval sets.
# The shuffle is seeded so that Restart & Run All reproduces the same split;
# the original unseeded sample() produced a different split on every run.
# NOTE(review): rows of the same company (different quarters) can end up on
# both sides of the split — confirm whether that is acceptable for evaluation.
fits_context = df_train.text.apply(get_seq_len) < 1024
df_train_cut = df_train[fits_context].sample(frac=1, random_state=42)

# Compute the split position once so both slices are guaranteed to be disjoint
# and together cover every row.
n_train = int(df_train_cut.shape[0] * 0.8)
dataset_train = Dataset.from_pandas(df_train_cut[["text"]].iloc[:n_train])
dataset_eval = Dataset.from_pandas(df_train_cut[["text"]].iloc[n_train:])
print(dataset_train.shape, dataset_eval.shape)

(164, 2) (42, 2)


In [17]:
# Data type used for computation on the 4-bit quantized weights.
compute_dtype = torch.float16

# 4-bit NF4 quantization settings handed to from_pretrained when loading the
# base model; nested (double) quantization is switched off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

In [18]:
# Load the base causal-LM checkpoint with the 4-bit quantization config;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="auto"
)
# Turn off the generation cache on the model config before training.
model.config.use_cache = False
# NOTE(review): presumably 1 selects the standard (non tensor-parallel) code
# path for the linear layers — confirm against the model documentation.
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
# Print the module tree of the loaded (quantized) model.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

### Set params for PEFT fine-tuning

In [20]:
# LoRA adapter configuration passed to the trainer: rank 64, alpha 16,
# dropout 0.1, no bias parameters, causal language-modelling task.
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [21]:
# Run identifier (yymmdd_HHMM); reused below for the results directory, the
# saved model directory and the evaluation CSV file names.
timestamp = datetime.now().strftime("%y%m%d_%H%M")

# Training hyper-parameters. With batch size 1 and 2 accumulation steps, each
# optimizer step covers 2 examples.
training_params = TrainingArguments(
    output_dir=f"./results/results_{timestamp}",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    # Evaluate on a step schedule; checkpoint and log every 10 steps and keep
    # at most 5 checkpoints on disk.
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=5,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    bf16=False,
    max_grad_norm=0.3,
    # NOTE(review): presumably -1 means "derive the step count from
    # num_train_epochs" — confirm.
    max_steps=-1,
    # NOTE(review): warmup_ratio is set together with a "constant" scheduler;
    # the warmup likely has no effect unless "constant_with_warmup" is used —
    # confirm the intended schedule.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    load_best_model_at_end=True,
)

In [22]:
# Supervised fine-tuning trainer: trains LoRA adapters (peft_config) on the
# "text" column of the train split and evaluates on the eval split. Sequences
# are limited to 1024 tokens and examples are not packed together.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

### Model fine-tuning

In [23]:
# Run fine-tuning with the configuration above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
10,4.0681,3.912088
20,3.5943,3.641784
30,3.5621,3.436919
40,2.8425,3.300761
50,3.3578,3.202756
60,2.8156,3.167489
70,3.3827,3.141281
80,2.7241,3.12068
90,3.171,3.11201
100,2.8722,3.096177


TrainOutput(global_step=164, training_loss=3.0459093233434165, metrics={'train_runtime': 1779.508, 'train_samples_per_second': 0.184, 'train_steps_per_second': 0.092, 'total_flos': 9622103436115968.0, 'train_loss': 3.0459093233434165, 'epoch': 2.0})

In [24]:
# Save the trained model and its tokenizer into a run-specific directory named
# after the timestamp. The "llama_" prefix of the parent directory
# ("llama_models/") was added in a later edit.

trainer.model.save_pretrained(f"llama_models/model_{timestamp}")
trainer.tokenizer.save_pretrained(f"llama_models/model_{timestamp}")

('llama_models/model_240425_1745/tokenizer_config.json',
 'llama_models/model_240425_1745/special_tokens_map.json',
 'llama_models/model_240425_1745/tokenizer.model',
 'llama_models/model_240425_1745/added_tokens.json',
 'llama_models/model_240425_1745/tokenizer.json')

### Model Inferencing & Evaluation

In [25]:
# Read the held-out test CSV and print its (rows, columns) shape.
test_path = "data/processed/test_esg_shortened.csv"
df_test = pd.read_csv(test_path)
print(df_test.shape)

(138, 6)


In [26]:
# Preview the first rows of the test data.
df_test.head()

Unnamed: 0,symbol,year,quarter,transcript_esg,esg_score,esg_risk_level
0,ADP,2022,1,call reference nongaap financial measures beli...,14.0,Low
1,ADP,2022,1,call reference nongaap financial measures beli...,14.0,Low
2,ADSK,2022,1,addition replay call available autodeskcominve...,16.0,Negligible
3,ADSK,2022,3,addition replay call available autodeskcominve...,16.0,Negligible
4,ADSK,2022,4,addition replay call available autodeskcominve...,16.0,Negligible


In [27]:
def get_question_prompt(transcript_esg):
    """Build the inference prompt: the training layout without the answer.

    The string ends right after ``[/INST]`` so the model generates the risk
    level itself. The text before that point is identical to what
    ``combine_question_answer`` produces for the same transcript.

    Args:
        transcript_esg: Transcript text inserted after ``###Transcript: ``.

    Returns:
        The prompt string, ending in ``[/INST]``.
    """
    # NOTE(review): wording (including typos and missing sentence spaces) is
    # kept byte-identical to the training prompt on purpose.
    header = "<<SYS>>\n"
    role = "You are an rating agency. Your task is to predict the a company's ESG Risk Level from a meeting transcript."
    criteria = "You should evaluate the company's performance on Environmental, Social and Governance issues."
    scale = "The possible Risk Levels, from low to high, are `Negligible`, `Low`, `Medium`, `High`, `Severe`.\n"
    footer = "<</SYS>>\n\n"
    system_block = header + role + criteria + scale + footer
    return "<s>[INST] " + system_block + f"###Transcript: {transcript_esg}###Risk Level: [/INST]"

In [34]:
# Add answer-free inference prompts to the filtered training frame so the
# fine-tuned model can also be scored on the data it was trained on.
# Note: this adds a "prompt" column to df_train_cut in place.
df_train_cut["prompt"] = df_train_cut["transcript_esg"].apply(get_question_prompt)

In [28]:
# Build inference prompts for the test set and keep only those shorter than
# 1000 tokens.
# NOTE(review): the training filter uses 1024 tokens on the full text while
# this one uses 1000 on the prompt only — presumably headroom for the
# generated answer; confirm the intended limit.
df_test["prompt"] = df_test["transcript_esg"].apply(get_question_prompt)
df_test_cut = df_test[df_test.prompt.apply(lambda p: get_seq_len(p) < 1000)]
df_test_cut.shape

(61, 7)

In [29]:
# Text-generation pipeline around the in-memory model and tokenizer. At most
# 8 new tokens are generated per prompt.
pipe = pipeline(
    task="text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    max_new_tokens=8,
    device_map="auto"
)

def get_generated_risk_level(generated_text):
    """Extract the predicted risk level from the text returned by the pipeline.

    The answer is expected after the first ``[/INST]`` marker, enclosed in a
    pair of backticks (e.g. ``... [/INST] `Medium` ``).

    Args:
        generated_text: Prompt plus model continuation as a single string.

    Returns:
        The text between the first pair of backticks that follows the first
        ``[/INST]`` marker, or ``None`` when the input is not a string, the
        marker is missing, or no complete backtick pair follows it.
    """
    if not isinstance(generated_text, str):
        return None

    parts = re.split(r"\[\/INST\]\s*", generated_text, maxsplit=1)
    if len(parts) < 2:
        # No instruction terminator: there is no answer section to parse.
        # (Indexing [1] unconditionally used to raise IndexError here.)
        return None

    # An opening and a closing backtick yield at least three segments; the
    # label is the segment between the first two backticks.
    segments = parts[1].split("`")
    if len(segments) >= 3:
        return segments[1]
    return None

def evaluate_and_save_results(pipeline, df, filename, chunk_size=4):
    """Generate predictions for every prompt in ``df``, score and save them.

    Args:
        pipeline: Callable text-generation pipeline. It is called with a list
            of prompts and must return, per prompt, a list whose first item has
            a ``"generated_text"`` entry. (The parameter name shadows the
            imported ``pipeline`` factory inside this function only.)
        df: DataFrame with ``prompt`` and ``esg_risk_level`` columns. It is
            modified in place: ``generatedText`` and
            ``predicted_esg_risk_level`` columns are added, so pass a copy if
            the original must stay untouched.
        filename: Path of the CSV file the augmented frame is written to.
        chunk_size: Number of prompts sent to the pipeline per call.

    Returns:
        The accuracy of the predicted labels against ``esg_risk_level``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Materialise the prompts once instead of rebuilding the list per chunk.
    prompts = df["prompt"].to_list()
    results = []
    # Silence UserWarnings only while generating. The filter is scoped to this
    # block and restored afterwards. (catch_warnings(category=...) is only
    # accepted from Python 3.11 on and, without an action, does not restrict
    # the filter to that category.)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        for start in tqdm(range(0, len(prompts), chunk_size)):
            results.extend(pipeline(prompts[start:start + chunk_size]))

    # Extract the answers from the generated text.
    df["generatedText"] = [res[0]["generated_text"] for res in results]
    df["predicted_esg_risk_level"] = df["generatedText"].apply(get_generated_risk_level)

    # Unparseable generations come back as None. Score them as a placeholder
    # label (always counted as wrong) so the metric receives only strings; the
    # saved column keeps the original None values.
    scored_predictions = [
        "<unparsed>" if label is None else label
        for label in df["predicted_esg_risk_level"]
    ]
    accuracy = accuracy_score(df['esg_risk_level'], scored_predictions)
    print(f"Accuracy for {filename}: {accuracy}")

    # Save to CSV
    df.to_csv(filename, index=False)
    print(f"Results saved to {filename}")
    return accuracy

In [30]:
# Demo: generate for the first remaining test prompt and parse the answer.
# Positional .iloc[0] is used because df_test_cut is a filtered frame: the
# index label 0 is only present if the first test row survived the length
# filter, so a label lookup (prompt[0]) could raise KeyError.
generated_text = pipe(df_test_cut.prompt.iloc[0])[0]["generated_text"]
get_generated_risk_level(generated_text)

'Medium'

In [35]:
# Score the fine-tuned model on the filtered training examples (a copy is
# passed because the helper adds columns to the frame it receives) and save
# the per-row results under the run timestamp.
train_accuracy = evaluate_and_save_results(pipe, df_train_cut.copy(), f"data/processed/train_results_{timestamp}.csv")

100%|██████████| 52/52 [04:54<00:00,  5.66s/it]

Accuracy for data/processed/train_results_240425_1745.csv: 0.4029126213592233
Results saved to data/processed/train_results_240425_1745.csv





In [31]:
# Score the fine-tuned model on the filtered test examples (again on a copy)
# and save the per-row results under the run timestamp.
test_accuracy = evaluate_and_save_results(pipe, df_test_cut.copy(), f"data/processed/test_results_{timestamp}.csv")

 56%|█████▋    | 9/16 [00:51<00:40,  5.84s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 16/16 [01:26<00:00,  5.38s/it]

Accuracy for data/processed/test_results_240425_1745.csv: 0.29508196721311475
Results saved to data/processed/test_results_240425_1745.csv



