### loading the dataset for training

In [1]:
import shutil
import os

# Define your cache directory 
#deleting this is needed for each time you're making a new dataset based on the level
cache_dir = "/root/.cache/huggingface/datasets/contextual_abuse_dataset3/default/1.0.0"

# Remove the directory if it exists
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)

In [2]:
from datasets import load_dataset
import contextual_abuse_dataset3
from contextual_abuse_dataset3 import ContextualAbuseRedditDataset
import csv
import pandas as pd
import re
import datasets



### training the model

In [3]:
from datasets import concatenate_datasets
import pandas as pd

def prepare_contextual_abuse_datasets(level):
    # Instantiate the dataset builder for the specified level
    dataset_builder = ContextualAbuseRedditDataset(level=level)
    dataset_builder.download_and_prepare()
    dataset = dataset_builder.as_dataset()

    # Split the dataset into train, test, and validation parts
    test_dataset = dataset["test"]
    train_dataset = dataset["train"]
    validation_dataset = dataset["validation"]

    # Concatenate the train and validation datasets for all levels
    #total_train_dataset = concatenate_datasets([train_dataset, validation_dataset])

    # Convert the datasets to pandas DataFrames
    df_train = pd.DataFrame(train_dataset)
    df_test = pd.DataFrame(test_dataset)
    df_validation = pd.DataFrame(validation_dataset)

    return df_train, df_validation, df_test

# To call the function for a specific level, e.g., level 3:
df_train, df_validation, df_test = prepare_contextual_abuse_datasets(level=3)

Using custom data configuration default


Downloading and preparing dataset contextual_abuse_dataset3/default to /root/.cache/huggingface/datasets/contextual_abuse_dataset3/default/1.0.0...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset contextual_abuse_dataset3 downloaded and prepared to /root/.cache/huggingface/datasets/contextual_abuse_dataset3/default/1.0.0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
df_train['label'] = df_train['labels_info'].apply(lambda x: x['label'][0] if (isinstance(x['label'], list) and len(x['label']) > 0) else None)
print(df_train.label.value_counts())

# Now filter the DataFrame
df_majority = df_train[df_train.label == 0]
df_minority = df_train[df_train.label == 1]

# Undersample the majority class
df_majority_undersampled = df_majority.sample(len(df_minority), random_state=42)

# Combine the minority class with the undersampled majority class
df_balanced = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the balanced dataframe
df_balanced_train = df_balanced.sample(frac=1, random_state=42)
df_balanced_train.reset_index(drop=True, inplace=True)
print(df_balanced_train.label.value_counts())


0    11199
1     2385
Name: label, dtype: int64
0    2385
1    2385
Name: label, dtype: int64


In [5]:

df_validation['label'] = df_validation['labels_info'].apply(lambda x: x['label'][0] if (isinstance(x['label'], list) and len(x['label']) > 0) else None)
print(df_validation.label.value_counts())

# Now filter the DataFrame
df_majorit = df_validation[df_validation.label == 0]
df_minorit = df_validation[df_validation.label == 1]

# Undersample the majority class
df_majority_undersamp = df_majorit.sample(len(df_minorit), random_state=42)

# Combine the minority class with the undersampled majority class
df_balanced_val = pd.concat([df_majority_undersamp, df_minorit])

# Shuffle the balanced dataframe
df_balanced_val = df_balanced_val.sample(frac=1, random_state=42)
df_balanced_val.reset_index(drop=True, inplace=True)
print(df_balanced_val.label.value_counts())



0    3733
1     793
Name: label, dtype: int64
0    793
1    793
Name: label, dtype: int64


In [6]:
text = df_train.iloc[548]["text"]
parent_text = df_train.iloc[540]["parent_text"]
text, parent_text

("Speaker1: What? [linebreak]  [linebreak] But Tokyo has more Michelin Stars than New York and Paris combined.... [linebreak]  [linebreak] And he works with Asian chefs... [linebreak]  [linebreak] He'll likely hire very talented Asian chefs here too. [SEP]",
 "Speaker2: Why, if you don't mind me asking? [SEP] Speaker3: I actually enjoy Swedish accents [SEP] Speaker4: I despise my Swedish accent [SEP]")

In [7]:
import pandas as pd

# Assuming df_train, df_validation, and df_test are already defined as shown
# Combine all datasets for analysis
combined_df = pd.concat([df_balanced_train, df_balanced_val, df_test])

# Function to calculate word count
def word_count(text):
    return len(str(text).split())

# Apply the function to the 'text' and 'parent_text' columns
combined_df['text_word_count'] = combined_df['text'].apply(word_count)
combined_df['parent_text_word_count'] = combined_df['parent_text'].apply(word_count)

# Calculate average, median, maximum, and 85th percentile word count
avg_word_count_text = combined_df['text_word_count'].mean()
median_word_count_text = combined_df['text_word_count'].median()
max_word_count_text = combined_df['text_word_count'].max()
percentile_85_word_count_text = combined_df['text_word_count'].quantile(0.90)

avg_word_count_parent_text = combined_df['parent_text_word_count'].mean()
median_word_count_parent_text = combined_df['parent_text_word_count'].median()
max_word_count_parent_text = combined_df['parent_text_word_count'].max()
percentile_90_word_count_parent_text = combined_df['parent_text_word_count'].quantile(0.90)

print("Average word count in 'text':", avg_word_count_text)
print("Median word count in 'text':", median_word_count_text)
print("Maximum word count in 'text':", max_word_count_text)
print("90th percentile word count in 'text':", percentile_85_word_count_text)

print("\nAverage word count in 'parent_text':", avg_word_count_parent_text)
print("Median word count in 'parent_text':", median_word_count_parent_text)
print("Maximum word count in 'parent_text':", max_word_count_parent_text)
print("90th percentile word count in 'parent_text':", percentile_90_word_count_parent_text)


Average word count in 'text': 34.49475837381744
Median word count in 'text': 18.0
Maximum word count in 'text': 1857
90th percentile word count in 'text': 71.0

Average word count in 'parent_text': 59.540611949203104
Median word count in 'parent_text': 29.0
Maximum word count in 'parent_text': 2234
90th percentile word count in 'parent_text': 142.0


In [7]:
n_classes = df_balanced_train['labels_info'].apply(lambda x: max(x['label'])).max() + 1
print (n_classes)

2


In [8]:
def get_label(x):
    try:
        return x['label'][0]
    except IndexError:
        return None  # or some default value

labels = df_train['labels_info'].apply(get_label).values

import numpy as np
df_train = df_train[df_train['labels_info'].apply(get_label).notna()]

print(np.unique(labels))

[0 1]


In [9]:
import torch
import pandas as pd
import transformers
from torch.utils.data import Dataset
from transformers import BertTokenizer, Trainer, TrainingArguments, BertForSequenceClassification
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import torch
from ast import literal_eval
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments,EarlyStoppingCallback, BertConfig
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from torch.nn import BCEWithLogitsLoss
from sklearn.utils.class_weight import compute_class_weight
from transformers import logging

from transformers import BertModel, BertConfig, BertPreTrainedModel
import torch
import torch.nn as nn

def get_torch_device(verbose: bool = True, gpu_ix: int = 0) -> torch.device:
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if verbose:
            print('There are %d GPU(s) available.' % torch.cuda.device_count())
            print('We will use the GPU:', torch.cuda.get_device_name(gpu_ix))
    else:
        if verbose: print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    return device

def save_model(output_dir:str, model, tokenizer):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print("Saving model to %s" % output_dir)
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __getitem__(self, index):
        text = self.data.iloc[index].text
        parent_text = self.data.iloc[index].parent_text
        # Adjusted to extract the first label from the nested structure
        label = torch.tensor([self.data.iloc[index]['labels_info']['label'][0]]).float()

        inputs = self.tokenizer.encode_plus(
            text + ' [SEP] ' + parent_text,  # Combining text and parent_text with [SEP] token
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label  # Ensure this matches the model's expected label format
        }
    
    def __len__(self):
        return self.len
    
class CustomBertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = 1
        self.context_bert = BertModel(config)
        self.text_bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.text_attention = nn.Linear(config.hidden_size, 1)
        self.parent_text_attention = nn.Linear(config.hidden_size, 1)
        self.classifier = nn.Linear(config.hidden_size * 3, self.num_labels)
        self.init_weights()
    
    def forward(self, input_ids, attention_mask, labels=None):
        # Apply separate attention mechanisms for text and parent text
        context_outputs = self.context_bert(input_ids, attention_mask=attention_mask)
        text_outputs = self.text_bert(input_ids, attention_mask=attention_mask)

        context_pooled_output = context_outputs[1]
        text_pooled_output = text_outputs[1]

        context_pooled_output = self.dropout(context_pooled_output)
        text_pooled_output = self.dropout(text_pooled_output)

        text_attention_scores = self.text_attention(text_outputs[0])
        parent_text_attention_scores = self.parent_text_attention(context_outputs[0])

        text_attention_scores = torch.softmax(text_attention_scores, dim=1)
        parent_text_attention_scores = torch.softmax(parent_text_attention_scores, dim=1)

        text_attended_output = torch.sum(text_attention_scores * text_outputs[0], dim=1)
        parent_text_attended_output = torch.sum(parent_text_attention_scores * context_outputs[0], dim=1)

        # Combine the attended outputs with different weights
        text_weight = 0.7
        parent_text_weight = 0.3
        combined_output = text_weight * text_attended_output + parent_text_weight * parent_text_attended_output

        # Concatenate the pooled output and combined attended output
        final_output = torch.cat((context_pooled_output, text_pooled_output, combined_output), dim=-1)

        logits = self.classifier(final_output)

        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1), labels.view(-1))
            return loss, logits
        else:
            return logits


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Assuming you have extended CustomBertForSequenceClassification correctly
# and it accepts the same initialization arguments as BertModel.
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
model = CustomBertForSequenceClassification(config)

# Define the datasets
df_train1 = CustomDataset(df_balanced_train, tokenizer, max_len=400)
df_val1 = CustomDataset(df_balanced_val, tokenizer, max_len=400)

# Define the compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = torch.sigmoid(torch.tensor(pred.predictions))
    preds = (preds > 0.5).int()

    # Now labels and preds are in the same format, we can compute the metrics
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define the training arguments

#level 1 best result with 0.0000008 and 9 epochs.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',  # Add this line to set the save strategy
    warmup_steps=100,
    weight_decay=0.03,
    learning_rate=0.00002, 
    save_total_limit=3,
    logging_dir='./logs',
    load_best_model_at_end=True,
    lr_scheduler_type='cosine_with_restarts',)

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Create an instance of the EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(early_stopping_patience=2) 
# Create the Trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=df_train1,
    eval_dataset=df_val1,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

In [11]:
trainer.train()

***** Running training *****
  Num examples = 4770
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 897
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.698657,0.5,0.333333,0.25,0.5
2,0.737800,0.696039,0.578815,0.576224,0.58079,0.578815
3,0.737800,0.703206,0.5971,0.596422,0.597756,0.5971


***** Running Evaluation *****
  Num examples = 1586
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-299
Configuration saved in ./results/checkpoint-299/config.json
Model weights saved in ./results/checkpoint-299/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-302] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1586
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-598
Configuration saved in ./results/checkpoint-598/config.json
Model weights saved in ./results/checkpoint-598/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-604] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1586
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-897
Configuration saved in ./results/checkpoint-897/config.json
Model weights saved in ./results/checkpoint-897/pytorch_model.bin
Deleting older checkpoint [results/che

TrainOutput(global_step=897, training_loss=0.6968893776237632, metrics={'train_runtime': 2067.4462, 'train_samples_per_second': 6.922, 'train_steps_per_second': 0.434, 'total_flos': 5883025095288000.0, 'train_loss': 0.6968893776237632, 'epoch': 3.0})

In [13]:
df_test1 = CustomDataset(df_test, tokenizer, max_len=400)


In [14]:
eval_results = trainer.evaluate(df_test1)

***** Running Evaluation *****
  Num examples = 5307
  Batch size = 16


In [2]:
# Use Trainer to predict
predictions = trainer.predict(df_test1)

NameError: name 'trainer' is not defined

In [16]:
#02 #level 1
from sklearn.preprocessing import binarize  
from sklearn.metrics import classification_report
binary_predictions = binarize(predictions.predictions, threshold=0.5)

# Extract true labels
true_labels = predictions.label_ids 

print(classification_report(true_labels, binary_predictions))

              precision    recall  f1-score   support

           0       0.93      0.76      0.83      4410
           1       0.45      0.58      0.51       897

   micro avg       0.81      0.73      0.77      5307
   macro avg       0.69      0.67      0.67      5307
weighted avg       0.85      0.73      0.78      5307
 samples avg       0.73      0.73      0.73      5307



  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
#02 #level 2
from sklearn.preprocessing import binarize  
from sklearn.metrics import classification_report
binary_predictions = binarize(predictions.predictions, threshold=0.5)

# Extract true labels
true_labels = predictions.label_ids 

print(classification_report(true_labels, binary_predictions))

              precision    recall  f1-score   support

           0       0.94      0.70      0.80      4401
           1       0.41      0.67      0.51       906

   micro avg       0.77      0.69      0.73      5307
   macro avg       0.67      0.68      0.65      5307
weighted avg       0.85      0.69      0.75      5307
 samples avg       0.69      0.69      0.69      5307



  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
#02 #level 3 baseline conctanting text
from sklearn.preprocessing import binarize  
from sklearn.metrics import classification_report
binary_predictions = binarize(predictions.predictions, threshold=0.5)

# Extract true labels
true_labels = predictions.label_ids 

print(classification_report(true_labels, binary_predictions))

              precision    recall  f1-score   support

           0       0.94      0.69      0.80      4401
           1       0.41      0.67      0.51       906

   micro avg       0.77      0.69      0.73      5307
   macro avg       0.67      0.68      0.65      5307
weighted avg       0.85      0.69      0.75      5307
 samples avg       0.69      0.69      0.69      5307



  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
#experiment with two bert models one for context one for text
from sklearn.preprocessing import binarize  
from sklearn.metrics import classification_report
binary_predictions = binarize(predictions.predictions, threshold=0.5)

# Extract true labels
true_labels = predictions.label_ids 

print(classification_report(true_labels, binary_predictions))

              precision    recall  f1-score   support

           0       0.88      0.79      0.83      4401
           1       0.32      0.48      0.38       906

    accuracy                           0.73      5307
   macro avg       0.60      0.63      0.61      5307
weighted avg       0.78      0.73      0.75      5307



In [47]:
def get_label_map():
    label_map = {'Neutral': 0, 'Abusive Speech': 1}
    inv_label_map = {v: k for k, v in label_map.items()}
    return label_map, inv_label_map

# Step 1: Map binary predictions back to original categories
 #Retrieve the inverse label map
_, inv_label_map = get_label_map()

# Convert numeric predictions to string labels
converted_predictions = [inv_label_map[label] for label in binary_predictions.flatten()]
print(f"Length of converted_predictions: {len(converted_predictions)}")
print(f"Number of rows in df_test: {len(df_test)}")

Length of converted_predictions: 5307
Number of rows in df_test: 5307


In [48]:
import pandas as pd
import numpy as np

# Assuming predictions are obtained from the Trainer
preds = predictions.predictions
labels = predictions.label_ids

# Convert predictions to label names
_, inv_label_map = get_label_map()
processed_preds = [inv_label_map[label] for label in np.argmax(preds, axis=1)]

# Extract relevant columns from df_test1 for analysis
df_analysis = df_test[['text', 'parent_text', 'labels_info']].copy()

# Add predictions to the DataFrame with specific naming
level = 3  # Set the level appropriately
df_analysis[f'prediction_level_{level}'] = processed_preds

# Convert labels_info from numeric to label names
df_analysis['labels_info'] = df_analysis['labels_info'].apply(lambda x: inv_label_map[x['label'][0]])

# Save the DataFrame to a CSV file
#df_analysis.to_csv(f'error_analysis_level_{level}.csv', index=False)

# For adding predictions from other levels later, you can read this file and add new columns accordingly


In [49]:
df_analysis.to_csv(f'error_analysis_level_{level}.csv', index=False)