### creating the datasets

In [1]:
import shutil
import os

# Define your cache directory path
cache_dir = "/root/.cache/huggingface/datasets/contextual_abuse_dataset4/default/1.0.0"

# Remove the directory if it exists
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)

### loading the dataset for training

In [3]:
from datasets import load_dataset
import contextual_abuse_dataset4
from contextual_abuse_dataset4 import ContextualAbuseRedditDataset
import csv
import pandas as pd
import re
import datasets



In [4]:
from datasets import concatenate_datasets
import pandas as pd

def prepare_contextual_abuse_datasets(level):
    # Instantiate the dataset builder for the specified level
    dataset_builder = ContextualAbuseRedditDataset(level=level)
    dataset_builder.download_and_prepare()
    dataset = dataset_builder.as_dataset()

    # Split the dataset into train, test, and validation parts
    test_dataset = dataset["test"]
    train_dataset = dataset["train"]
    validation_dataset = dataset["validation"]

    # Concatenate the train and validation datasets for all levels
    #total_train_dataset = concatenate_datasets([train_dataset, validation_dataset])

    # Convert the datasets to pandas DataFrames
    df_train = pd.DataFrame(train_dataset)
    df_test = pd.DataFrame(test_dataset)
    df_validation = pd.DataFrame(validation_dataset)

    return df_train, df_validation, df_test

# To call the function for a specific level, e.g., level 3:
df_train, df_validation, df_test = prepare_contextual_abuse_datasets(level=3)

Using custom data configuration default


Downloading and preparing dataset contextual_abuse_dataset4/default to /root/.cache/huggingface/datasets/contextual_abuse_dataset4/default/1.0.0...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset contextual_abuse_dataset4 downloaded and prepared to /root/.cache/huggingface/datasets/contextual_abuse_dataset4/default/1.0.0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
sample = df_train.iloc[800]
parent_text_formatted = sample['parent_text']

formatted_text = f"""
Text: {sample['text']}
Parent Text: 
{parent_text_formatted}
ID: {sample['id']}
Labels Info: {sample['labels_info']}
"""
print(formatted_text)


Text: Speaker1: It always blows my mind that Ben Carson is like the most accomplished brain surgeon to ever live [SEP]
Parent Text: 
Speaker2: He got asked about it in one of the primary debates and he said something about how he thinks that doing them all at once somehow causes autism, that he knows medicine better than anyone, and that if we just spread it out over time or something everything will be fine. [linebreak]  [linebreak] They then asked Ben Carson to respond and has was all slow and mopey like "yeah, everything he just said is bullshit". [SEP] Speaker3: I wish someone would ask Daddy about his opinions on vaccines, I bet he's got some really special thoughts floating around in that head of his. [SEP] Speaker4: Wife of White House communications director galaxy brains her way into tweeting out the most retarded anti-vaxx take imaginable setting twatter record ratio [SEP]
ID: egf0qpc
Labels Info: {'label': [0]}



In [6]:
import pandas as pd

# Assuming df_train, df_validation, and df_test are already defined
# Combine all datasets for analysis
combined_df = pd.concat([df_train, df_validation, df_test])

# Function to calculate word count
def word_count(text):
    return len(str(text).split())

# Concatenate 'text' and 'parent_text' columns into a new column 'combined_text'
combined_df['combined_text'] = combined_df['text'] + " " + combined_df['parent_text']

# Apply the function to the 'combined_text' column
combined_df['combined_text_word_count'] = combined_df['combined_text'].apply(word_count)

# Calculate average, median, maximum, and 85th percentile word count for combined text
avg_word_count_combined = combined_df['combined_text_word_count'].mean()
median_word_count_combined = combined_df['combined_text_word_count'].median()
max_word_count_combined = combined_df['combined_text_word_count'].max()
percentile_95_word_count_combined = combined_df['combined_text_word_count'].quantile(0.95)

print("Average word count in 'combined_text':", avg_word_count_combined)
print("Median word count in 'combined_text':", median_word_count_combined)
print("Maximum word count in 'combined_text':", max_word_count_combined)
print("95th percentile word count in 'combined_text':", percentile_95_word_count_combined)

Average word count in 'combined_text': 91.20416791220053
Median word count in 'combined_text': 56.0
Maximum word count in 'combined_text': 3185
95th percentile word count in 'combined_text': 277.0


### training the model

In [7]:
import torch
import random
import os
import pandas as pd
import transformers
from torch.utils.data import Dataset
from transformers import BertTokenizer, Trainer, TrainingArguments, BertForSequenceClassification
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from ast import literal_eval
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments,EarlyStoppingCallback, BertConfig
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from torch.nn import BCEWithLogitsLoss
from sklearn.utils.class_weight import compute_class_weight
from transformers import logging
from torch.nn import CrossEntropyLoss

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [8]:
n_classes = df_train['labels_info'].apply(lambda x: max(x['label'])).max() + 1
n_classes

4

In [9]:
def get_label(x):
    try:
        return x['label'][0]
    except IndexError:
        return None  # or some default value

labels = df_train['labels_info'].apply(get_label).values

df_train = df_train[df_train['labels_info'].apply(get_label).notna()]

print(np.unique(labels))

[0 1 2 3]


In [10]:
from collections import Counter

def calculate_class_weights(dataset_labels):
    # Count each label's occurrences
    label_counts = Counter(dataset_labels)

    # Total number of samples
    total_count = sum(label_counts.values())

    # Calculate weight for each class
    class_weights = {label: total_count / (len(label_counts) * count) for label, count in label_counts.items()}

    # Convert to a list (if necessary for your framework)
    weights = [class_weights[i] for i in range(len(class_weights))]

    return weights

all_labels = [label for labels_info in df_train['labels_info'] for label in labels_info['label']]
class_weights = calculate_class_weights(all_labels)

In [11]:
# Function to set seeds for reproducibility
def set_seed(seed_value):
    """Set seed for reproducibility."""
    random.seed(seed_value)  # Python
    np.random.seed(seed_value)  # Numpy
    torch.manual_seed(seed_value)  # PyTorch CPU
    torch.cuda.manual_seed(seed_value)  # PyTorch GPU
    torch.cuda.manual_seed_all(seed_value)  # PyTorch multi-GPU
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    # Additional PyTorch settings for reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set a seed value
SEED = 42  # This can be any number
set_seed(SEED)

def get_torch_device(verbose: bool = True, gpu_ix: int = 0) -> torch.device:
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if verbose:
            print('There are %d GPU(s) available.' % torch.cuda.device_count())
            print('We will use the GPU:', torch.cuda.get_device_name(gpu_ix))
    else:
        if verbose: print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    return device

def save_model(output_dir:str, model, tokenizer):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print("Saving model to %s" % output_dir)
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, num_classes):
      self.len = len(dataframe)
      self.data = dataframe
      self.tokenizer = tokenizer
      self.max_len = max_len
      self.num_classes = num_classes

    def __getitem__(self, index):
        text = self.data.text[index]
        parent_text = self.data.parent_text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            parent_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length', 
            return_token_type_ids=True,
            truncation=True, 
            return_overflowing_tokens=False
        )
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        labels_info = self.data.labels_info[index]
        label = labels_info['label'][0]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }


    def __len__(self):
        return self.len
class MyBertForSequenceClassification(BertForSequenceClassification):
    def __init__(self, config, class_weights=None):
        super().__init__(config)
        if class_weights is not None:
            device = get_torch_device()
            self.class_weights = class_weights.to(device)
        else:
            self.class_weights = None

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
        logits = outputs.logits
        if labels is not None:
            loss_fct = CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + (logits,) + outputs[2:]
        return outputs

device = get_torch_device()

class_weights = calculate_class_weights(all_labels)

# Convert class weights to a tensor and move it to the correct device
class_weights_tensor = torch.FloatTensor(class_weights).to(device)

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Pass the class weights tensor to your custom model
model = MyBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_classes, class_weights=class_weights_tensor)



#define the datasets
df_train1 = CustomDataset(df_train, tokenizer, max_len=300, num_classes=n_classes)
df_validation1 = CustomDataset(df_validation, tokenizer, max_len=300, num_classes=n_classes)


# Define the compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = torch.argmax(torch.tensor(pred.predictions), dim=-1)
    preds = (preds > 0.5).int()

    # Now labels and preds are in the same format, we can compute the metrics
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


# Define the training arguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',  # Add this line to set the save strategy
    warmup_steps=100,
    weight_decay=0.03,
    learning_rate=0.00002, 
    save_total_limit=3,
    logging_dir='./logs',
    load_best_model_at_end=True,
    lr_scheduler_type='cosine_with_restarts',)

There are 1 GPU(s) available.
We will use the GPU: Quadro RTX 4000


Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

There are 1 GPU(s) available.
We will use the GPU: Quadro RTX 4000


Some weights of the model checkpoint at bert-base-uncased were not used when initializing MyBertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing MyBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MyBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MyBertForSequenceClassification were not initialized from the model check

In [12]:
# Create an instance of the EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3) 
# Create the Trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=df_train1,
    eval_dataset=df_validation1,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

In [13]:
trainer.train()

***** Running training *****
  Num examples = 13584
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1698
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.3544,0.8975,0.599867,0.656836,0.783344,0.599867
2,0.8889,0.709063,0.698188,0.721564,0.772438,0.698188


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

TrainOutput(global_step=1698, training_loss=1.0653524769489842, metrics={'train_runtime': 1136.4106, 'train_samples_per_second': 23.907, 'train_steps_per_second': 1.494, 'total_flos': 4188474324403200.0, 'train_loss': 1.0653524769489842, 'epoch': 2.0})

In [14]:
df_test1 = CustomDataset(df_test, tokenizer, max_len=300, num_classes=n_classes)
eval_results = trainer.evaluate(df_test1)

***** Running Evaluation *****
  Num examples = 5307
  Batch size = 16
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tok

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [15]:
# Use Trainer to predict
predictions = trainer.predict(df_test1)

***** Running Prediction *****
  Num examples = 5307
  Batch size = 16
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tok

In [24]:
#level 1
import numpy as np
from sklearn.metrics import classification_report

# Extract true labels
true_labels = predictions.label_ids

# Convert binary_predictions to class labels
class_predictions = np.argmax(binary_predictions, axis=1)

print(classification_report(true_labels, class_predictions))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87      4410
           1       0.52      0.35      0.42       514
           2       0.36      0.49      0.41       242
           3       0.35      0.48      0.40       237

   micro avg       0.82      0.75      0.78      5403
   macro avg       0.54      0.54      0.53      5403
weighted avg       0.83      0.75      0.78      5403
 samples avg       0.74      0.75      0.74      5403



  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
#level 2
import numpy as np
from sklearn.metrics import classification_report

# Extract true labels
true_labels = predictions.label_ids

# Convert binary_predictions to class labels
class_predictions = np.argmax(binary_predictions, axis=1)

print(classification_report(true_labels, class_predictions))

              precision    recall  f1-score   support

           0       0.92      0.83      0.87      4410
           1       0.51      0.35      0.41       514
           2       0.34      0.38      0.36       242
           3       0.33      0.45      0.38       237

   micro avg       0.82      0.75      0.78      5403
   macro avg       0.53      0.50      0.51      5403
weighted avg       0.83      0.75      0.78      5403
 samples avg       0.73      0.75      0.74      5403



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#level 3
import numpy as np
from sklearn.metrics import classification_report

# Extract true labels
true_labels = predictions.label_ids

# Convert binary_predictions to class labels
class_predictions = np.argmax(binary_predictions, axis=1)

print(classification_report(true_labels, class_predictions))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86      4410
           1       0.49      0.38      0.43       514
           2       0.33      0.48      0.39       242
           3       0.38      0.35      0.37       237

   micro avg       0.82      0.73      0.77      5403
   macro avg       0.53      0.50      0.51      5403
weighted avg       0.83      0.73      0.78      5403
 samples avg       0.72      0.73      0.73      5403



  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
def get_label_map():
    label_map = {'Neutral': 0, 'IdentityDirectedAbuse': 1, 'AffiliationDirectedAbuse': 2, 'PersonDirectedAbuse': 3}
    inv_label_map = {v: k for k, v in label_map.items()}
    return label_map, inv_label_map

# Step 1: Map binary predictions back to original categories
 #Retrieve the inverse label map
_, inv_label_map = get_label_map()

# Convert numeric predictions to string labels
converted_predictions = [inv_label_map[label] for label in binary_predictions.flatten()]
print(f"Length of converted_predictions: {len(converted_predictions)}")
print(f"Number of rows in df_test: {len(df_test)}")

Length of converted_predictions: 21228
Number of rows in df_test: 5307


In [22]:
import pandas as pd
import numpy as np

# Assuming predictions are obtained from the Trainer
preds = predictions.predictions
labels = predictions.label_ids

# Convert predictions to label names
_, inv_label_map = get_label_map()
processed_preds = [inv_label_map[label] for label in np.argmax(preds, axis=1)]

# Extract relevant columns from df_test1 for analysis
df_analysis = df_test[['text', 'parent_text', 'labels_info']].copy()

# Add predictions to the DataFrame with specific naming
level = 3 # Set the level appropriately
df_analysis[f'prediction_level_{level}'] = processed_preds

# Convert labels_info from numeric to label names
df_analysis['labels_info'] = df_analysis['labels_info'].apply(lambda x: inv_label_map[x['label'][0]])

In [23]:
df_analysis.to_csv(f'multiclass_{level}.csv', index=False)