In [None]:
!pip install kaggle

In [48]:
import os
import kaggle
import pandas as pd
from collections import Counter
import numpy as np
import ast
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tqdm.notebook as tq
from tensorflow.keras.preprocessing.sequence import pad_sequences

#imports from torch version since tf version didn't work outside of local
from transformers import AdamW
from transformers import BertModel
import torch
import torch.nn as nn

In [2]:
# Point the Kaggle tooling at ~/.kaggle; the kaggle.json file must be in that directory.
os.environ['KAGGLE_CONFIG_DIR'] = os.path.expanduser('~/.kaggle')

# Download the competition archive (Jigsaw Unintended Bias in Toxicity Classification).
# This line is an IPython shell escape, not Python.
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification

# Manual step: unzip the downloaded archive before running the reads below.
# NOTE(review): the reads expect the CSVs under jigsaw-data/ — presumably the unzip target; confirm.

# all_data: later cells read its 'id' and 'comment_text' columns.
all_data = pd.read_csv('jigsaw-data/all_data.csv')
# content_flags: later cells read 'id' and the five identity columns.
content_flags = pd.read_csv('jigsaw-data/identity_individual_annotations.csv')

jigsaw-unintended-bias-in-toxicity-classification.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
def _split_tags(cell):
    """Split a space-separated tag string into a list of tags.

    Non-string cells are returned unchanged, so a cell that is already a list
    (e.g. when this notebook cell is run a second time) is left as-is instead
    of raising AttributeError.
    """
    return cell.split(' ') if isinstance(cell, str) else cell


# Turn every identity column from a space-separated string into a list of tags.
for column in ['disability', 'gender', 'race_or_ethnicity', 'religion', 'sexual_orientation']:
    print(column)
    # Series.map visits the values of this one column only; the previous
    # DataFrame.apply(axis=1) built a full row object per record just to read
    # a single field.
    content_flags[column] = content_flags[column].map(_split_tags)



disability
gender
race_or_ethnicity
religion
sexual_orientation


In [4]:
# Display the annotation table to check that the identity columns now hold lists.
content_flags

Unnamed: 0,id,worker,disability,gender,race_or_ethnicity,religion,sexual_orientation
0,59856,211,[none],[none],[none],[none],[none]
1,59856,683,"[intellectual_or_learning, none]",[none],[none],[none],[none]
2,59856,8899,[none],[none],[none],[none],[none]
3,59856,67,[none],[none],[none],[none],[none]
4,239579,8900,[none],[none],[none],[none],[none]
...,...,...,...,...,...,...,...
2597360,6333923,2477,[none],[none],[none],[atheist],[none]
2597361,6333923,3488,[none],[none],[none],[none],[none]
2597362,6333923,805,[none],[none],[none],[atheist],[none]
2597363,6333923,2493,[none],[none],[other],[atheist],[none]


In [11]:
# `df` is bound to the same object as `content_flags` (no copy is made), so the
# column assignments made on `df` in this cell are visible through `content_flags` too.
df = content_flags
# Identity columns processed in this cell:
# ['disability', 'gender', 'race_or_ethnicity', 'religion', 'sexual_orientation']
def replace_entries(lst, mapping):
    """Return a new list in which every item that is a key of ``mapping`` is
    swapped for its mapped value; all other items are kept unchanged."""
    replaced = []
    for item in lst:
        replaced.append(mapping.get(item, item))
    return replaced

# Rename the generic 'other' tag to 'other_<column>' so it stays unambiguous
# once the per-category lists are merged into one.
for column in ['disability', 'gender', 'race_or_ethnicity', 'religion', 'sexual_orientation']:
    other_mapping = {'other': "other_" + column}
    df[column] = df[column].apply(lambda tags: replace_entries(tags, other_mapping))


def _join_identity_lists(row):
    """Concatenate the row's five tag lists in a fixed category order."""
    merged = []
    for name in ('disability', 'gender', 'race_or_ethnicity', 'religion', 'sexual_orientation'):
        merged = merged + row[name]
    return merged


# One combined tag list per annotation row.
df['concatenated_lists'] = df.apply(_join_identity_lists, axis=1)

# Pool the combined lists of all rows that share the same 'id'.
pooled_lists = df.groupby('id')['concatenated_lists'].apply(lambda lists: sum(lists, []))
df_grouped = pooled_lists.reset_index()

#print("\nDataFrame after concatenating lists within groups:")
#print(df_grouped)

# Majority filter: an element is kept only if it occurs more often than half of the group size.
def filter_elements(grouped_lists, group_counts):
    """Return the distinct elements of ``grouped_lists`` whose number of
    occurrences is strictly greater than ``group_counts / 2``.

    Elements are returned in order of first appearance.
    """
    majority_cutoff = group_counts / 2
    kept = []
    for element, occurrences in Counter(grouped_lists).items():
        if occurrences > majority_cutoff:
            kept.append(element)
    return kept

# Number of annotation rows recorded for each 'id'.
group_counts = df.groupby('id').size().reset_index(name='counts')

# Put each pooled tag list next to its row count.
df_merged = df_grouped.merge(group_counts, on='id')


def _majority_tags(row):
    """Run the majority filter on one merged row."""
    return filter_elements(row['concatenated_lists'], row['counts'])


# Tags that pass the majority filter, one list per 'id'.
df_merged['tags'] = df_merged.apply(_majority_tags, axis=1)

#print("\nDataFrame after filtering elements based on occurrence count:")
#print(df_merged[['id', 'tags']])

def remove_entries(lst, entries):
    """Return a new list holding the items of ``lst`` that are not in ``entries``,
    in their original order."""
    return list(filter(lambda item: item not in entries, lst))

# Strip the 'none', 'male' and 'female' tags from every tag list.
excluded_tags = ['none', 'male', 'female']
df_merged['tags'] = df_merged['tags'].apply(lambda tags: remove_entries(tags, excluded_tags))
df_merged = df_merged[['id', 'tags']]

# Left-join the comment data on 'id', keep three columns, then drop the rows whose
# comment text equals the literal string 'None'.
full_merge = df_merged.merge(all_data, how='left', on='id')[['id', 'comment_text', 'tags']]
full_merge = full_merge[full_merge['comment_text'] != 'None']
full_merge.to_csv('cleaned_data_.csv')
#print(df_merged)

In [12]:
# Sanity checks on the merged frame.
# NOTE: only the last expression of a cell is displayed, so the missing-comment
# lookup on the first line is computed and then discarded.
full_merge[full_merge['comment_text'].isna()]
full_merge[full_merge['id'] == 5353666]

Unnamed: 0,id,comment_text,tags


In [32]:
# Strip the 'none' tag, rebuild the merged frame and write cleaned_data.csv.
#
# The original call used `remove_entry`, which is not defined anywhere in this
# notebook, so the cell raised NameError on a fresh kernel. The existing
# `remove_entries` helper does the same job when given a one-element list.
# `.assign` builds a new frame instead of writing into the column slice taken
# earlier, which avoids pandas' SettingWithCopyWarning.
df_merged = df_merged.assign(
    tags=df_merged['tags'].apply(lambda x: remove_entries(x, ['none']))
)
# NOTE(review): this binds `df_merge`, not `df_merged` — likely a typo. Kept because
# other cells may reference the name; `df_merged` already holds only 'id' and 'tags'.
df_merge = df_merged[['id', 'tags']]

# Left-join the comment data on 'id' and keep the three columns of interest.
full_merge = pd.merge(df_merged, all_data, how='left', on='id')
full_merge = full_merge[['id', 'comment_text', 'tags']]
# The index is written on purpose: a later cell drops the resulting 'Unnamed: 0' column.
full_merge.to_csv('cleaned_data.csv')

In [33]:
# Reload the cleaned data. The tag lists come back from the CSV as strings, so
# each one is parsed into a Python list again.
df = pd.read_csv('cleaned_data.csv')
df = df.assign(tags=df['tags'].map(ast.literal_eval))
# How often each distinct tag combination occurs.
df['tags'].value_counts()

[]                                                                       302211
[christian]                                                               27844
[white]                                                                   15783
[muslim]                                                                  13968
[black]                                                                    7271
                                                                          ...  
[jewish, black, other_race_or_ethnicity]                                      1
[black, jewish, muslim, latino]                                               1
[latino, jewish, muslim, black]                                               1
[transgender, asian, muslim]                                                  1
[psychiatric_or_mental_illness, christian, transgender, heterosexual]         1
Name: tags, Length: 897, dtype: int64

In [34]:
# Most rows carry no tags at all. Remove a fixed random sample of 260000 of them
# (seeded, so the same rows are removed every run) to reduce the risk of
# overfitting to the empty class and to shorten training.
is_untagged = df['tags'].apply(lambda tags: tags == [])
discard_index = df[is_untagged].sample(n=260000, random_state=22).index
df = df.drop(discard_index)
df['tags'].value_counts()

[]                                                                       42211
[christian]                                                              27844
[white]                                                                  15783
[muslim]                                                                 13968
[black]                                                                   7271
                                                                         ...  
[jewish, black, other_race_or_ethnicity]                                     1
[black, jewish, muslim, latino]                                              1
[latino, jewish, muslim, black]                                              1
[transgender, asian, muslim]                                                 1
[psychiatric_or_mental_illness, christian, transgender, heterosexual]        1
Name: tags, Length: 897, dtype: int64

In [35]:
# Encode each tag list as a 0/1 vector with one position per known tag.
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(df['tags'])
# Number of distinct tags, i.e. the length of every label vector.
print(len(mlb.classes_))
# Store one label vector per row, then drop the columns that are no longer needed.
df = df.assign(labels=list(binary_labels)).drop(columns=['Unnamed: 0', 'id', 'tags'])
df

22


Unnamed: 0,comment_text,labels
0,haha you guys are a bunch of losers.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"Angry trolls, misogynists and Racists"", oh my....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Yet call out all Muslims for the acts of a few...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
10,I think you left out one very important organi...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
11,"Ah, so part of the ""back end"" and ""algorithms""...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
405149,"""It doesn't matter when it's erected and for w...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
405151,"Every time there are testimonies, like this, I...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
405153,It is of course normal and natural for Eugene ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
405154,Believing in God or not believing in God are p...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [36]:
# from this point, adapted from https://github.com/dtolk/multilabel-BERT/blob/master/notebooks/multi_label_text_classification_BERT.ipynb
# 70% of the rows go to training; the held-out 30% is halved into test and validation.
df_train, df_holdout = train_test_split(df, random_state=77, test_size=0.30, shuffle=True)
df_test, df_valid = train_test_split(df_holdout, random_state=88, test_size=0.50, shuffle=True)

print(f"Train: {df_train.shape}, Test: {df_test.shape}, Valid: {df_valid.shape}")

Train: (101611, 2), Test: (21774, 2), Valid: (21774, 2)


In [37]:
# Hyperparameters
MAX_LEN = 256  # max_length handed to the tokenizer (padding='max_length', truncation=True)
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 10  # number of iterations of the training loop
LEARNING_RATE = 1e-05  # lr passed to the AdamW optimizer
# NOTE(review): THRESHOLD is not referenced in the visible cells; train/eval
# threshold the sigmoid output with .round() instead.
THRESHOLD = 0.5 # threshold for the sigmoid

In [31]:
# Load the pre-trained 'bert-base-uncased' tokenizer (the model itself is built in a later cell).
# NOTE(review): torch is already imported in the top import cell, so this re-import is redundant.
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        df: frame with a 'comment_text' column and a 'labels' column.
        tokenizer: object exposing ``encode_plus`` (here a BERT tokenizer).
        max_len: ``max_length`` handed to the tokenizer for padding/truncation.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Plain Python lists so items are looked up by position, not by frame index.
        self.title = [text for text in df['comment_text']]
        self.targets = [label for label in df['labels']]

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded comment at ``index`` as a dict with the keys
        'input_ids', 'attention_mask', 'token_type_ids', 'targets' and 'title'."""
        # Collapse every run of whitespace into a single space.
        text = " ".join(str(self.title[index]).split())
        encoding = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # The tokenizer output is flattened to 1-D tensors for each item.
        sample = {
            key: encoding[key].flatten()
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['targets'] = torch.FloatTensor(self.targets[index])
        sample['title'] = text
        return sample
    
# Wrap each split in a CustomDataset that shares the same tokenizer and MAX_LEN.
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(df_valid, tokenizer, MAX_LEN)
test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN)

# Peek at the first encoded training sample to check the item format.
next(iter(train_dataset))

{'input_ids': tensor([  101,  2748,  1010,  2216,  2040,  1999,  6767,  3489,  7404,  2024,
         15554,  1012,  1012,  1012,  1012,  2122,  5223,  3993,  1010, 16939,
          1010,  9253,  1011, 13157,  1013,  1047, 19658,  1013,  2317, 10514,
         28139,  7712,  5130,  7404,  4697,  2033,   999,  2057,  2342,  2000,
          2022,  2844,  1998,  2552,  2007,  8295, 16056,  1012,  1012,  1012,
          2293,  2003,  1996,  2087,  3928,  2486,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [40]:
# Data loaders: only the training loader shuffles; validation and test keep
# their order. num_workers=0 is set on all three.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0
)


In [49]:

class BERTClass(torch.nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear layer.

    Args:
        num_labels: size of the output layer, one logit per label. Defaults to
            22, the value that was previously hard-coded.
        pretrained_name: checkpoint passed to ``BertModel.from_pretrained``.
        dropout: dropout probability applied to the pooled output.

    ``forward`` returns raw logits; no sigmoid is applied here.
    """

    def __init__(self, num_labels=22, pretrained_name='bert-base-uncased', dropout=0.2):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768, so
        # the head still fits if a checkpoint of another size is used.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Encode the batch and return the logits computed from the pooled output."""
        encoded = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear(pooled)

# Build the classifier and its optimizer.
model = BERTClass()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Use the GPU when one is available, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')
print(f"Using GPU: {torch.cuda.get_device_name(0)}" if use_gpu else "Using CPU")

model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using CPU




BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [51]:
# Binary cross-entropy computed directly on logits: the sigmoid is folded into
# the loss, which is more numerically stable than applying a separate Sigmoid
# followed by BCELoss (log-sum-exp trick).
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and ``targets``."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)


# Training of the model for one epoch
def train_model(training_loader, model, optimizer):
    """Run one optimisation pass over ``training_loader``.

    Uses the module-level ``device`` and ``loss_fn``.

    Returns:
        A tuple ``(model, accuracy, mean_loss)``. ``accuracy`` is the fraction
        of individual label entries whose rounded sigmoid output equals the
        target; ``mean_loss`` is the mean of the per-batch losses.
    """
    # Training mode (enables dropout).
    model.train()

    batch_losses = []
    n_correct = 0
    n_entries = 0

    progress = tq.tqdm(
        enumerate(training_loader),
        total=len(training_loader),
        leave=True,
        colour='steelblue',
    )
    for _, batch in progress:
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # Forward pass and loss on the raw logits.
        logits = model(ids, mask, token_type_ids)
        loss = loss_fn(logits, targets)
        batch_losses.append(loss.item())

        # Accuracy bookkeeping: sigmoid, then round to a 0/1 decision per label.
        predicted = torch.sigmoid(logits).cpu().detach().numpy().round()
        expected = targets.cpu().detach().numpy()
        n_correct += np.sum(predicted == expected)
        n_entries += expected.size  # every element of the 2D batch array counts

        # Backward pass with gradient-norm clipping, then the parameter update.
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    return model, float(n_correct) / n_entries, np.mean(batch_losses)

In [52]:
def eval_model(validation_loader, model, optimizer):
    """Evaluate ``model`` on ``validation_loader`` without updating any weights.

    Uses the module-level ``device`` and ``loss_fn``. ``optimizer`` is accepted
    but not used.

    Returns:
        A tuple ``(accuracy, mean_loss)``. ``accuracy`` is the fraction of
        individual label entries whose rounded sigmoid output equals the
        target; ``mean_loss`` is the mean of the per-batch losses.
    """
    # Evaluation mode (disables dropout).
    model.eval()

    batch_losses = []
    n_correct = 0
    n_entries = 0

    with torch.no_grad():
        for batch in validation_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            batch_losses.append(loss_fn(logits, targets).item())

            # The model returns logits, so the sigmoid is applied here before rounding.
            predicted = torch.sigmoid(logits).cpu().detach().numpy().round()
            expected = targets.cpu().detach().numpy()
            n_correct += np.sum(predicted == expected)
            n_entries += expected.size  # every element of the 2D batch array counts

    return float(n_correct) / n_entries, np.mean(batch_losses)

In [53]:
#Training the model

from collections import defaultdict
# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc' and 'val_loss'.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model, optimizer)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # Save a checkpoint whenever validation accuracy improves.
    if val_acc > best_accuracy:
        # torch.save does not create missing directories, so the per-epoch
        # folder has to exist first; otherwise the first checkpoint fails and
        # the epoch just trained is lost.
        checkpoint_dir = os.path.join("/output", str(epoch))
        os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(checkpoint_dir, "MLTC_model_state.bin"))
        best_accuracy = val_acc
     

Epoch 1/10


  0%|          | 0/3176 [00:00<?, ?it/s]

KeyboardInterrupt: 

TensorFlow version, whose optimizer doesn't work outside of a local environment:

In [16]:
# TensorFlow variant of the experiment: fine-tune TFBertForSequenceClassification on the tags.
# NOTE(review): the label-encoding cell earlier in the notebook drops the 'tags'
# column from `df`, so this cell presumably ran before that drop (its execution
# count is 16) and fails in a top-to-bottom run — confirm.

# Encode each tag list as a 0/1 vector with one position per known tag
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(df['tags'])

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# from_pt=True: the TF model is initialised from the PyTorch checkpoint weights.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(mlb.classes_), from_pt=True)

# Tokenization and padding: each comment is encoded to at most 256 token ids, then
# all sequences are padded at the end up to the longest encoded sequence.
sequences = [tokenizer.encode(text, max_length=256, truncation=True) for text in df['comment_text']]
max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Split into train and test sets
# NOTE: no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, binary_labels, test_size=0.2)

# Convert data to TensorFlow datasets
# NOTE(review): only the padded ids are fed to the model; no attention mask is
# built, so padding positions are presumably treated as real tokens — confirm.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(100).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(16)

# Compile the model; from_logits=True because the loss is given the model's raw outputs.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model; the test split doubles as the validation data here.
model.fit(train_dataset, epochs=3, validation_data=test_dataset)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
  17/7258 [..............................] - ETA: 28:06:18 - loss: 0.5460 - accuracy: 0.1912

KeyboardInterrupt: 

In [17]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU")

Using CPU


In [9]:
# Smoke test: call the model on a 5x5 tensor of random int32 values below 100.
# NOTE(review): the displayed output type suggests `model` was a base TF BERT model
# defined in a cell that is no longer in the notebook — confirm before relying on this cell.
model(tf.random.uniform((5,5), maxval=100, dtype=tf.int32))

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(5, 5, 768), dtype=float32, numpy=
array([[[-0.1926953 , -0.19953968, -0.24369815, ..., -0.0476762 ,
          0.78095937, -0.10719464],
        [-0.09016032, -0.1465548 , -0.2265495 , ..., -0.15154383,
          0.800614  , -0.17425951],
        [-0.09033497, -0.11411548, -0.2296839 , ..., -0.13924852,
          0.7794273 , -0.2271377 ],
        [-0.11165614, -0.20364816, -0.18446675, ..., -0.10939424,
          0.7787872 , -0.2094966 ],
        [-0.11141557, -0.20753846, -0.18952729, ..., -0.08506671,
          0.74222237, -0.26211852]],

       [[-0.26172006, -0.19809552, -0.22027454, ..., -0.03760334,
          0.7533313 ,  0.0119855 ],
        [-0.16966552, -0.13177472, -0.19482699, ..., -0.12876847,
          0.7590233 , -0.06027906],
        [-0.17079079, -0.0879863 , -0.19970554, ..., -0.12122375,
          0.72431463, -0.08815363],
        [-0.17086516, -0.1531767 , -0.15847021, ..., -0.10263245