# Initial

In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook are addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers[torch] accelerate -U
!pip install datasets torch
!pip install scikit-learn



In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score

# Helper Methods

In [None]:
def compute_metrics(eval_pred):
    """Compute weighted F1, ROC-AUC and PR-AUC for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is assumed to be
            ``(n_samples, n_classes)`` raw scores and ``labels`` the integer
            class ids -- TODO confirm against the caller.

    Returns:
        dict: ``{'f1': ..., 'roc_auc': ..., 'pr_auc': ...}``.
    """
    logits, labels = eval_pred

    # as_tensor accepts NumPy arrays, tensors and lists alike, so the inputs
    # do not need to expose a ``.numpy()`` method themselves.
    logits_t = torch.as_tensor(logits).detach().cpu().to(torch.float64)
    labels_t = torch.as_tensor(labels).detach().cpu().long()
    num_classes = logits_t.shape[-1]

    # The ranking metrics below need one score per class, not the hard
    # argmax decision. float64 keeps each softmax row summing to 1 tightly,
    # which multiclass roc_auc_score validates.
    probabilities = torch.softmax(logits_t, dim=-1).numpy()
    predictions = probabilities.argmax(axis=-1)
    labels = labels_t.numpy()

    # One-hot targets let average_precision_score score each class
    # one-vs-rest against its probability column.
    one_hot_labels = torch.nn.functional.one_hot(labels_t, num_classes=num_classes).numpy()

    f1 = f1_score(labels, predictions, average='weighted')
    roc_auc = roc_auc_score(
        labels,
        probabilities,
        average='weighted',
        multi_class='ovr',
        labels=list(range(num_classes)),  # pin the class set to the logit columns
    )
    pr_auc = average_precision_score(one_hot_labels, probabilities, average='weighted')

    return {'f1': f1, 'roc_auc': roc_auc, 'pr_auc': pr_auc}

In [None]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(examples['text'], **encode_options)

In [None]:
def preprocess(text, label):
    """Pair every row of ``text`` with the same class ``label``.

    Args:
        text: Single-column DataFrame (or Series) of messages.
        label: Class id assigned to every row.

    Returns:
        DataFrame with columns ``['text', 'label']`` and the same index as
        ``text``.
    """
    # Build the label column on text's own index: concat(axis=1) aligns on
    # index labels, so a fresh 0..n-1 index would create NaN-filled extra rows
    # whenever text is not indexed 0..n-1.
    labels = pd.Series(label, index=text.index, name='label')
    df = pd.concat([text, labels], axis=1)  # put both together in a dataframe
    df.columns = ['text', 'label']  # label the columns
    return df

In [None]:
def getRandom(messages, size, random_state=None):
    """Draw ``size`` rows from ``messages`` without replacement.

    Args:
        messages: Series (or single-column DataFrame) to sample from.
        size: Number of rows to draw.
        random_state: Optional seed forwarded to ``sample`` so a draw can be
            reproduced; ``None`` keeps the unseeded behaviour.

    Returns:
        DataFrame with a single ``'text'`` column; the sampled rows keep their
        original index labels.
    """
    subset = messages.sample(n=size, random_state=random_state)
    df = pd.DataFrame(subset)
    df.columns = ['text']
    return df

# Preprocessing

In [None]:
# Load the test messages from Drive, keeping only the column at zero-based
# position 6 -- presumably the message text; confirm against the CSV header.
messages = pd.read_csv("/content/drive/MyDrive/core_search_index_IM_formatted.csv").iloc[:, 6]

In [None]:
# Five independent random subsets of the message corpus, drawn smallest first.
messages6k, messages7k, messages8k, messages9k, messages10k = [
    getRandom(messages, sample_size)
    for sample_size in (6000, 7000, 8000, 9000, 10000)
]

print(messages6k)

                                                     text
62608                         HAHAHAHAHAHAHHAHAAHAHAHHAHA
34248   Well so did many of us. But why choose monarch...
72014   Kek it's a white mans language that you dirty ...
47465   He doesn't? Never noticed. Guess he just left ...
199842  I find it actually quite amazing how you could...
...                                                   ...
75945   I wouldn't recommend anyone to play with their...
76882   How the fuck did Kalash get four charges for t...
203189  Although I am not strasserist, how strasserist...
106738  What's wrong with integralism? This one had so...
22324   @CyrusTheGreat What do you believe about God, ...

[6000 rows x 1 columns]


In [None]:
# Ground-truth messages: one CSV per category, read in the order the
# variables are unpacked below.
ground_truth_paths = [
    "/content/drive/MyDrive/messages/ConspiratorialThinking.csv",
    "/content/drive/MyDrive/messages/HatredAgainstDifferentlyAbled.csv",
    "/content/drive/MyDrive/messages/IdentificationwithExtremistGroups.csv",
    "/content/drive/MyDrive/messages/PoliticalHatred.csv",
    "/content/drive/MyDrive/messages/RacialHatred.csv",
    "/content/drive/MyDrive/messages/ReligiousHatred.csv",
    "/content/drive/MyDrive/messages/SexualOrientationHatred.csv",
    "/content/drive/MyDrive/messages/ViolentMeansExplicit.csv",
]
(conspiratorial, hatred, identification, political,
 racial, religious, sexual, violent) = [pd.read_csv(path) for path in ground_truth_paths]

In [None]:
# Attach an integer class id to each ground-truth frame. The id is the
# frame's position in the list below:
#   0 Conspiratorial Thinking        4 Racial Hatred
#   1 Hatred Against Differently Abled   5 Religious Hatred
#   2 Identification with Extremist Groups   6 Sexual Orientation Hatred
#   3 Political Hatred               7 Violent Means - Explicit
# TODO: change this to one-hot encoding
(conspiratorial, hatred, identification, political,
 racial, religious, sexual, violent) = [
    preprocess(frame, class_id)
    for class_id, frame in enumerate([
        conspiratorial, hatred, identification, political,
        racial, religious, sexual, violent,
    ])
]

In [None]:
# df with messages and then the category that each message belongs to.
# ignore_index=True renumbers the rows 0..n-1; without it every category
# frame keeps its own 0-based index and row labels repeat across categories.
df = pd.concat(
    [conspiratorial, hatred, identification, political, racial, religious, sexual, violent],
    axis=0,
    ignore_index=True,
)
print(df)

                                                  text  label
0    I think there is an absolute truth, but how we...      0
1    Ok, this was a marxist-maoist terrorist group....      0
2    Maduro isn't a Taxi Driver man, he is a Bus dr...      0
3    What exactly did segregation enforce? In any c...      0
4    Hungaria makes another good point: we are "fre...      0
..                                                 ...    ...
124  He the Negro is but a grown up child, and must...      7
125  Novorossia is the personification of the schiz...      7
126  Fuehrer of the galaxy Kai Murros says that the...      7
127  The best thing I can say for him is that this ...      7
128  Dylann is a hero, he did what he thought was r...      7

[1504 rows x 2 columns]


# Tokenization

In [None]:
X = df['text']
y = df['label']

# 80/20 split with a fixed seed. stratify=y keeps the class proportions equal
# in both parts, so every category is represented in the held-out set used
# for the per-class ranking metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

In [None]:
# (rows, columns) of each split, train first.
for split_df in (train_df, test_df):
    print(split_df.shape)

(1203, 2)
(301, 2)


In [None]:
# preserve_index=False drops the shuffled pandas index instead of storing it
# as an extra `__index_level_0__` column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint name.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=8: one output per category id (0-7) assigned during preprocessing.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=8)

In [None]:
def tokenize_train(examples):
    """Tokenize a training batch and carry its labels along.

    The ``'text'`` field is encoded with truncation and ``'max_length'``
    padding; the batch's ``'label'`` values are copied onto the encoding
    before it is returned.
    """
    encoded = tokenizer(examples['text'], truncation=True, padding='max_length')
    encoded['label'] = examples['label']
    return encoded

In [None]:
def tokenize_test(examples):
    """Tokenize an evaluation batch, coercing every text entry to ``str``.

    Args:
        examples: Batch mapping with a ``'text'`` sequence. Entries are passed
            through ``str()`` before encoding; the batch itself is left
            untouched.

    Returns:
        The tokenizer's encoding of the batch, truncated and padded with
        ``'max_length'`` padding.
    """
    # Work on a local list instead of writing back into the caller's batch.
    texts = [str(x) for x in examples['text']]
    return tokenizer(texts, padding='max_length', truncation=True)

In [None]:
# Tokenize the training split in batches, then drop the raw text column.
train_dataset = (
    train_dataset
    .map(tokenize_train, batched=True)
    .remove_columns(['text'])
)

Map:   0%|          | 0/1203 [00:00<?, ? examples/s]

In [None]:
# Tokenize the held-out split in batches, then drop the raw text column.
test_dataset = (
    test_dataset
    .map(tokenize_test, batched=True)
    .remove_columns(['text'])
)

Map:   0%|          | 0/301 [00:00<?, ? examples/s]

# Fine-tuning / training

In [None]:
# Category names listed in class-id order; enumerate() assigns ids 0..7.
label_map = dict(enumerate([
    'Conspiratorial Thinking',
    'Hatred Against Differently Abled',
    'Identification with Extremist Groups',
    'Political Hatred',
    'Racial Hatred',
    'Religious Hatred',
    'Sexual Orientation Hatred',
    'Violent Means - Explicit',
]))

# Inverse lookup: category name -> class id.
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))

In [None]:
def compute_metrics(labels, predictions=None, probability_scores=None):
    """Compute weighted F1, ROC-AUC (one-vs-one) and PR-AUC.

    Two call styles are supported:

    * ``compute_metrics(eval_pred)`` -- a single ``(logits, labels)`` pair,
      which is how ``Trainer`` invokes its ``compute_metrics`` callback.
      Predictions and class probabilities are derived from the logits.
    * ``compute_metrics(labels, predictions, probability_scores)`` -- the
      explicit three-argument form.

    Args:
        labels: Integer class ids, or the ``(logits, labels)`` pair when the
            other two arguments are omitted.
        predictions: Predicted class ids.
        probability_scores: Per-class probabilities, assumed to be
            ``(n_samples, n_classes)`` with columns in class-id order --
            TODO confirm for external callers.

    Returns:
        dict: ``{'f1': ..., 'roc_auc': ..., 'pr_auc': ...}``.
    """
    if predictions is None and probability_scores is None:
        # Callback form: unpack the pair and turn logits into probabilities.
        logits, labels = labels
        logits_t = torch.as_tensor(logits).detach().cpu().to(torch.float64)
        probability_scores = torch.softmax(logits_t, dim=-1).numpy()
        predictions = probability_scores.argmax(axis=-1)

    class_ids = sorted(label_map)
    labels_t = torch.as_tensor(labels).detach().cpu().long()
    labels = labels_t.numpy()
    # One-hot targets let average_precision_score score each class
    # one-vs-rest against its probability column.
    one_hot_labels = torch.nn.functional.one_hot(labels_t, num_classes=len(class_ids)).numpy()

    f1 = f1_score(labels, predictions, average='weighted')
    roc_auc = roc_auc_score(
        labels,
        probability_scores,
        multi_class='ovo',
        average='weighted',
        labels=class_ids,  # pin the class set to the known category ids
    )
    pr_auc = average_precision_score(one_hot_labels, probability_scores, average='weighted')

    return {
        'f1': f1,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc
    }

In [None]:
# Size the warmup from the actual run length. With the 1203 training rows
# shown above, 3 epochs at batch size 8 come to roughly 450 optimizer steps
# (assuming one device and no gradient accumulation), so a fixed 500-step
# warmup would end after training does and the learning rate would never
# reach its configured peak.
num_train_epochs = 3
train_batch_size = 8
steps_per_epoch = -(-len(train_dataset) // train_batch_size)  # ceiling division
total_train_steps = steps_per_epoch * num_train_epochs

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=8,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    warmup_steps=max(1, total_train_steps // 10),  # warm up over ~10% of the run
    logging_dir='./logs',
    logging_steps=10
)

In [None]:
# Collect the Trainer's inputs first, then build it in one call.
trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_options)

In [None]:
# Run fine-tuning; whatever train() returns is kept in `output`.
output = trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# evaluate() is a method of the Trainer; the value returned by train() is a
# plain result record and has no evaluate() of its own.
print(trainer.evaluate())