In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from datasets import Dataset
from datasets.dataset_dict import DatasetDict
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
import copy
import json
import os

%matplotlib inline 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.device_count()

1

### **1.** Offensiveness rating

**Load data and profiles**

In [3]:
path_to_data_dirs = '../../../data/Potato-Prolific-Dataset/dataset'
data_file_name = 'raw_data.csv'
task = 'offensiveness'

path_to_data = f'{path_to_data_dirs}/{task}/{data_file_name}'
df_data = pd.read_csv(path_to_data)
df_data

Unnamed: 0,instance_id,user_id,text,offensiveness,gender,race,age,occupation,education
0,530,0,"I think a lot of Dethklok songs use drop C, wo...",1.0,Man,White,35-39,Unemployed,High school diploma or equivalent
1,1280,0,There are relatively simple ways around all of...,1.0,Man,White,35-39,Unemployed,High school diploma or equivalent
2,621,0,Tell the british soldier in WW1 to shoot that ...,1.0,Man,White,35-39,Unemployed,High school diploma or equivalent
3,676,0,Top comment pretty much. I have gay friends an...,1.0,Man,White,35-39,Unemployed,High school diploma or equivalent
4,635,0,Don't tell them just let them and their liniag...,3.0,Man,White,35-39,Unemployed,High school diploma or equivalent
...,...,...,...,...,...,...,...,...,...
13031,471,262,They’re closed anti-vaxx Facebook groups. When...,1.0,Woman,Asian,54-59,Self-employed,College degree
13032,1033,262,Bioethics; an interesting field in which the w...,1.0,Woman,Asian,54-59,Self-employed,College degree
13033,740,262,Or they are really secure but hang around inse...,5.0,Woman,Asian,54-59,Self-employed,College degree
13034,894,262,Don't have to worry about being too big to fit...,4.0,Woman,Asian,54-59,Self-employed,College degree


In [4]:
path_to_profiles_files = '../../data_analysis/popquorn/extracted_profiles'
profiles_file_name = 'offensiveness_profiles.csv'

path_to_profiles = f'{path_to_profiles_files}/{profiles_file_name}'
df_profiles = pd.read_csv(path_to_profiles)

In [5]:
stages_of_adulthood = {
    '18-24': 'Early adulthood',
    '25-29': 'Early adulthood',
    '30-34': 'Early adulthood',
    '35-39': 'Middle adulthood',
    '40-44': 'Middle adulthood',
    '45-49': 'Middle adulthood',
    '50-54': 'Middle adulthood',
    '54-59': 'Middle adulthood',
    '60-64': 'Middle adulthood',
    '>65': 'Late adulthood'
}

In [6]:
def select_profile(idx:int):
    return df_profiles.iloc[20].iloc[:-1]

def print_profile(profile:pd.Series):
    width = len(profile.to_string().split('\n')[0])
    print(f"{'='*width}\n{profile.to_string()}\n{'='*width}") 

**Extract POPQUORN annotation data fitting specific annotator profile**

*Choose a profile by index*

In [7]:
profile = select_profile(idx=20)
print_profile(profile)

gender                  Woman
race                    White
age          Middle adulthood
education      College degree


*Extract instances given by annotators that fit the profile*

In [8]:
stage_of_adulthood = profile['age']
ages = [k for k, v in stages_of_adulthood.items() if v == stage_of_adulthood]
age_mapping = {stage_of_adulthood: ages}

In [9]:
conditions = []
for attr in profile.index:
    if attr == 'age':
        conditions.append(df_data[attr].isin(age_mapping[profile[attr]]))
    elif attr == 'education' and profile[attr] == 'College degree': 
        conditions.append(df_data[attr].isin(['College degree', 'Graduate degree']))
    else:
        conditions.append(df_data[attr] == profile[attr])

df_filtered = df_data[pd.concat(conditions, axis=1).all(axis=1)]

In [10]:
df_data_profiled = df_filtered.groupby(['instance_id', 'text'], as_index=False)['offensiveness'].mean()

# round the means to the nearest integer value, subtract 1 from all afterwards to avoid the following assertion error during training:
# ../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
df_data_profiled.offensiveness = df_data_profiled.offensiveness.round().astype(int).map(lambda offn: offn-1)
df_data_profiled

Unnamed: 0,instance_id,text,offensiveness
0,0,That’s a pretty slippery slope you’ve got ther...,0
1,1,I don't know. Sitting here now answering this ...,0
2,2,Pretty much the same as when straight people p...,3
3,3,"If you dont see it in your post, youre probabl...",3
4,4,You believe most priests are gay?,0
...,...,...,...
1180,1494,This behaviour is misleading and deceptive in ...,4
1181,1495,Offer myself as their slave and informant.,0
1182,1496,"hey, they don't call it the devil's lettuce fo...",1
1183,1497,If you want to take away my rights because of ...,0


In [16]:
df_data_profiled.to_csv('profiled_data_sample.csv', index=False)

**Prepare dataset**

*Perform stratified train-eval split*

In [11]:
train_df, eval_df = train_test_split(df_data_profiled, test_size=0.15, stratify=df_data_profiled.offensiveness, random_state=0)

In [12]:
id2label = {
    0: 'not offensive',
    1: 'slightly offensive',
    2: 'moderately offensive',
    3: 'very offensive',
    4: 'extremely offensive'
}

label2id = {
    'not offensive': 0,
    'slightly offensive': 1,
    'moderately offensive': 2,
    'very offensive': 3,
    'extremely offensive': 4
}

print(id2label)
print(label2id)

{0: 'not offensive', 1: 'slightly offensive', 2: 'moderately offensive', 3: 'very offensive', 4: 'extremely offensive'}
{'not offensive': 0, 'slightly offensive': 1, 'moderately offensive': 2, 'very offensive': 3, 'extremely offensive': 4}


*Create dataset object, perform oversampling by duplication*

In [14]:
dataset_dict = {'train':None, 'val':None}

for split, split_df in zip(dataset_dict.keys(), [train_df, eval_df]):
    
    majority_labels = split_df.offensiveness.value_counts().nlargest(2).index
    # dataframe to duplicate the entries in
    dupl_df = split_df[split_df.offensiveness != majority_labels[0]]
    dupl_df = dupl_df[dupl_df.offensiveness != majority_labels[1]]
    # upsampled dataframe
    ups_df = pd.concat([split_df, dupl_df], ignore_index=True)
    
    print(split)
    print(split_df.offensiveness.value_counts())
    print(ups_df.offensiveness.value_counts())
    
    split_dict = {
        'label': split_df.offensiveness.tolist(),
        'text': split_df.text.tolist()
    }
    dataset_dict[split] = Dataset.from_dict(split_dict)
    
dataset = DatasetDict(dataset_dict)
dataset

train
offensiveness
0    506
1    282
2    109
3     70
4     40
Name: count, dtype: int64
offensiveness
0    506
1    282
2    218
3    140
4     80
Name: count, dtype: int64
val
offensiveness
0    89
1    50
2    20
3    12
4     7
Name: count, dtype: int64
offensiveness
0    89
1    50
2    40
3    24
4    14
Name: count, dtype: int64


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1007
    })
    val: Dataset({
        features: ['label', 'text'],
        num_rows: 178
    })
})

In [55]:
models_dir = '/bigwork/nhwpnagm/hf_models/'
model_id = 'Mistral-7B-Instruct-v0.3'

model = AutoModelForSequenceClassification.from_pretrained(f'{models_dir}/{model_id}', num_labels=len(id2label), id2label=id2label, label2id=label2id, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(f'{models_dir}/{model_id}') # possibly set model_max_length (default 512)

Loading checkpoint shards: 100%|██████████| 3/3 [08:49<00:00, 176.43s/it]
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at /bigwork/nhwpnagm/hf_models//Mistral-7B-Instruct-v0.3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.05,
    r=32,
    bias='none',
    task_type='SEQ_CLS'
)
# model.add_adapter(peft_config)

In [61]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [62]:
#model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [63]:
model.print_trainable_parameters()

trainable params: 13,651,968 || all params: 7,127,478,272 || trainable%: 0.1915


In [64]:
def preprocess_function(examples):
    return tokenizer(examples['text'], padding=True, truncation=True, return_tensors='pt')#.to('cuda')

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1007 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1007/1007 [00:02<00:00, 354.89 examples/s]
Map: 100%|██████████| 178/178 [00:00<00:00, 12198.32 examples/s]


In [65]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)#, pad_to_multiple_of=8)

In [66]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # f1_weighted = f1_score(labels, predictions, average = 'weighted')
    acc = accuracy_score(labels, predictions)
    return {'acc': acc}

In [67]:
training_args = TrainingArguments(
    output_dir=f'{models_dir}/{model_id}_seq_cls', # checkpoints location
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    eval_strategy='steps',
    eval_steps=100,
    logging_steps=100,
    save_steps=200,
    save_strategy='steps',
    load_best_model_at_end=True,
    metric_for_best_model='acc',
    save_total_limit=1,
    warmup_steps=50,
    report_to='none',
    lr_scheduler_type='cosine'
)

early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01 # small improvement to reset patience
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [early_stopping]
)

  trainer = Trainer(


In [68]:
torch.cuda.empty_cache()

In [69]:
trainer.train()

Step,Training Loss,Validation Loss,Acc
100,4.445,1.567787,0.488764
200,1.4419,1.341872,0.426966
300,1.3979,1.308064,0.432584
400,1.2486,1.282445,0.477528
500,1.0852,1.283193,0.477528


TrainOutput(global_step=504, training_loss=1.9191467374090165, metrics={'train_runtime': 640.9579, 'train_samples_per_second': 3.142, 'train_steps_per_second': 0.786, 'total_flos': 1.2337957820399616e+16, 'train_loss': 1.9191467374090165, 'epoch': 2.0})

In [None]:
trainer.save_model(models_dir)