# Test GPU functionality

In [1]:
import torch

# Report whether PyTorch can see a CUDA device, then display the installed version.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)
torch.__version__

True


'1.12.1'

# Format input data
Load the two training corpora, label them, and put them in a Hugging Face `Dataset`.

In [1]:
# Read the two pickled training corpora and tag every row with its class label
# (label 1 for the white supremacist corpus, label 0 for the neutral corpus).
import pandas as pd

ws_data = pd.read_pickle('../tmp/white_supremacist_train_corpus.pkl')
ws_data = ws_data.assign(label=1)
ws_data.info()

neutral_data = pd.read_pickle('../tmp/neutral_train_corpus.pkl')
neutral_data = neutral_data.assign(label=0)
neutral_data.info()

# Stack the two corpora, keeping only the columns the classifier needs.
# No shuffling or subsampling is done in this step.
selected_cols = ['text', 'label']
data = pd.concat([frame[selected_cols] for frame in (ws_data, neutral_data)])
data.info()

# Make a HuggingFace Dataset and hold out 10% of the rows as a test split.
from datasets import Dataset

# Fixed seed so the train/test partition is identical across kernel restarts; without it the
# held-out set changes on every run and evaluation numbers are not comparable between runs.
SPLIT_SEED = 42

dataset = Dataset.from_pandas(data).train_test_split(test_size=0.1, seed=SPLIT_SEED)
dataset

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess(examples):
    """Tokenize the ``text`` field of a batch of examples, with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Apply the tokenizer to the dataset in batches.
tokenized_data = dataset.map(preprocess, batched=True)

In [6]:
from transformers import DataCollatorWithPadding

# Batch collator built around the same tokenizer. `preprocess` only truncates,
# so padding is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer)

# Train

In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_metric
import numpy as np

# One metric object per score we report, keyed by the metric's name.
metrics = {
    metric_name: load_metric(metric_name)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall')
}

def compute_metrics(eval_pred):
    """Score predictions with every metric registered in ``metrics``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last axis of the logits. Each metric's ``compute`` result is
    stored unchanged under the metric's name, so the values are whatever
    ``compute`` returns (the recorded outputs show dicts such as ``{'f1': ...}``).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    scores = {}
    for metric_name, metric in metrics.items():
        scores[metric_name] = metric.compute(predictions=predicted_classes, references=labels)
    return scores

# Start from a previously saved fine-tuning checkpoint. To start from the base model instead:
# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained("results/checkpoint-480000")

batch_size = 16
# Step interval shared by logging and checkpoint saving.
checkpoint = batch_size * 10_000

# Options currently switched off (kept for reference):
#   evaluation_strategy='steps', eval_steps=checkpoint,
#   load_best_model_at_end=True, metric_for_best_model='f1'
training_kwargs = dict(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='steps',
    logging_steps=checkpoint,
    save_steps=checkpoint,
)
training_args = TrainingArguments(**training_kwargs)

# Wire the model, data splits, tokenizer, collator and metric function into a Trainer.
# The train/eval splits must be passed here: the following `trainer.train()` cell and the
# argument-less `trainer.evaluate()` cell have no other source of data, so leaving these two
# arguments commented out makes those cells fail on a fresh Restart & Run All.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [2]:
# Run fine-tuning with the settings in `training_args`.
# NOTE(review): `model` is loaded from "results/checkpoint-480000" but no checkpoint is passed
# here — confirm whether the intent is to resume that run or to start a new schedule from
# those weights.
trainer.train()



KeyboardInterrupt: 

In [9]:
# No dataset is passed, so this relies on an `eval_dataset` having been given to the Trainer
# when it was constructed.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, id. If text, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922457
  Batch size = 32


KeyboardInterrupt: 

# Evaluate on unseen test datasets

In [12]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess(examples):
    """Tokenize a batch's ``text`` column with truncation enabled."""
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True)

# Evaluate the model separately on each annotated test dataset and tabulate the scores.
# NOTE: the corpus is a pickle file; only load pickles that come from a trusted source.
path = '../tmp/annotated_test_corpus.pkl'
annotated = pd.read_pickle(path)
annotated.info()

metric_names = ['f1', 'precision', 'recall', 'accuracy']
result_lines = []
# The loop variable is deliberately not called `dataset`: that name already holds the
# train/test split built earlier in the notebook and must not be overwritten with a string.
for dataset_name in annotated.dataset.unique():
    print(dataset_name)
    selected = annotated.query('dataset==@dataset_name')
    test_dataset = Dataset.from_pandas(selected)
    # `tokenized_test` stays in scope after the loop; later cells reuse the value from the
    # final iteration.
    tokenized_test = test_dataset.map(preprocess, batched=True)
    res = trainer.evaluate(tokenized_test)

    # Each metric comes back nested, e.g. res['eval_f1'] == {'f1': ...}; unwrap into a flat row.
    row = {'dataset': dataset_name}
    row.update({name: res[f'eval_{name}'][name] for name in metric_names})
    result_lines.append(row)
pd.DataFrame(result_lines)

loading configuration file config.json from cache at /home/mamille3/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.22.0",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/mamille3/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file tokenizer.json from cache at /home/mamille3/.cache/huggingface/hub/models--dis

<class 'pandas.core.frame.DataFrame'>
Index: 7450 entries, alatawi2021_0 to siegel2021_5450
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     7450 non-null   object
 1   dataset  7450 non-null   object
 2   source   7450 non-null   object
 3   domain   7450 non-null   object
 4   label    7450 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 349.2+ KB
alatawi2021


  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text, dataset, source, domain. If id, text, dataset, source, domain are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1999
  Batch size = 32


siegel2021


  0%|          | 0/6 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text, dataset, source, domain. If id, text, dataset, source, domain are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5451
  Batch size = 32


Unnamed: 0,dataset,f1,precision,recall,accuracy
0,alatawi2021,0.687823,0.578882,0.847273,0.576788
1,siegel2021,0.094775,0.049766,0.991525,0.179967


In [13]:
# Get raw predictions (logits, gold labels and metrics) as input for the confusion matrix.
# `tokenized_test` is whatever the evaluation loop tokenized last.
pred_output = trainer.predict(tokenized_test)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text, dataset, source, domain. If id, text, dataset, source, domain are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5451
  Batch size = 32


PredictionOutput(predictions=array([[-5.2167153 ,  3.8906655 ],
       [-2.71311   ,  2.0095308 ],
       [-0.812488  ,  0.59591043],
       ...,
       [-3.0237956 ,  2.0903351 ],
       [-5.260662  ,  3.944611  ],
       [ 0.34558126, -0.3611545 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'test_loss': 3.5426225662231445, 'test_accuracy': {'accuracy': 0.17996697853604843}, 'test_f1': {'f1': 0.09477521263669501}, 'test_precision': {'precision': 0.049766056997022544}, 'test_recall': {'recall': 0.9915254237288136}, 'test_runtime': 8.3545, 'test_samples_per_second': 652.463, 'test_steps_per_second': 20.468})

In [14]:
# This cell originally read `pred_output = preds`. On a fresh kernel `preds` is not defined
# until the next cell, so that raised NameError; and whenever `preds` did exist it replaced
# the prediction output with a bare array, breaking the later `pred_output.predictions` and
# `pred_output.label_ids` accesses. Leave `pred_output` intact and just inspect its metrics.
pred_output.metrics

In [16]:
import numpy as np

# Predicted class per example: index of the largest logit along the last axis.
preds = pred_output.predictions.argmax(axis=-1)
preds

array([1, 1, 1, ..., 1, 1, 0])

In [18]:
# Sanity check: number of predictions, then how many of them are non-zero (class 1).
print(np.shape(preds))
np.count_nonzero(preds)

(5451,)


4702

In [22]:
# Number of non-zero gold labels, for comparison with the predicted non-zero count.
np.count_nonzero(pred_output.label_ids)

236

In [31]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
# `labels=class_ids` pins the matrix to 2x2 even when one class is absent from both the gold
# labels and the predictions; otherwise the fixed row/column names below would not match the
# matrix shape. (`.ravel()` on the matrix gives tn, fp, fn, tp.)
class_ids = [0, 1]
df = pd.DataFrame(
    confusion_matrix(pred_output.label_ids, preds, labels=class_ids),
    columns=[f'pred_{class_id}' for class_id in class_ids],
    index=[f'true_{class_id}' for class_id in class_ids],
)
df

Unnamed: 0,pred_0,pred_1
true_0,747,4468
true_1,2,234


In [25]:
# Same confusion matrix as a raw array (true labels first, predictions second).
confusion_matrix(pred_output.label_ids, preds)

array([[ 747, 4468],
       [   2,  234]])

## Old

In [None]:
# Results on Siegel+2021
# NOTE(review): nothing in this cell selects the Siegel+2021 subset; it evaluates whatever
# `tokenized_test` currently holds — confirm the kernel state before re-running.
import pandas as pd
res = trainer.evaluate(tokenized_test)
res
# pd.DataFrame(res)

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text, dataset, source, domain. If id, text, dataset, source, domain are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5451
  Batch size = 32
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second
accuracy,3.542623,0.179967,,,,12.4014,439.547,13.789
f1,3.542623,,0.094775,,,12.4014,439.547,13.789
precision,3.542623,,,0.049766,,12.4014,439.547,13.789
recall,3.542623,,,,0.991525,12.4014,439.547,13.789


In [9]:
# Show the metrics dict currently held in `res`.
res

{'eval_loss': 3.5426225662231445,
 'eval_accuracy': {'accuracy': 0.17996697853604843},
 'eval_f1': {'f1': 0.09477521263669501},
 'eval_precision': {'precision': 0.049766056997022544},
 'eval_recall': {'recall': 0.9915254237288136},
 'eval_runtime': 12.4014,
 'eval_samples_per_second': 439.547,
 'eval_steps_per_second': 13.789}

In [10]:
# Results on Alatawi+2021
# NOTE(review): nothing in this cell selects the Alatawi+2021 subset; it evaluates whatever
# `tokenized_test` currently holds — verify against the kernel state before trusting the label.
import pandas as pd
res = trainer.evaluate(tokenized_test)
pd.DataFrame(res)

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: domain, dataset, text, source, id. If domain, dataset, text, source, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1999
  Batch size = 32


{'eval_loss': 1.223117709159851,
 'eval_accuracy': {'accuracy': 0.5767883941970986},
 'eval_f1': {'f1': 0.6878228782287823},
 'eval_precision': {'precision': 0.5788819875776398},
 'eval_recall': {'recall': 0.8472727272727273},
 'eval_runtime': 2.0892,
 'eval_samples_per_second': 956.837,
 'eval_steps_per_second': 30.155}

In [10]:
# Results on Alatawi+2021
# NOTE(review): this cell has the same code as the cell before it, and it likewise evaluates
# whatever `tokenized_test` currently holds rather than selecting a subset itself.
import pandas as pd
res = trainer.evaluate(tokenized_test)
pd.DataFrame(res)

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: domain, dataset, text, source, id. If domain, dataset, text, source, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1999
  Batch size = 32


{'eval_loss': 1.223117709159851,
 'eval_accuracy': {'accuracy': 0.5767883941970986},
 'eval_f1': {'f1': 0.6878228782287823},
 'eval_precision': {'precision': 0.5788819875776398},
 'eval_recall': {'recall': 0.8472727272727273},
 'eval_runtime': 2.0892,
 'eval_samples_per_second': 956.837,
 'eval_steps_per_second': 30.155}

# Old / one-time experiments

In [3]:
%%timeit
# Load white supremacist data

import pandas as pd

path = '../tmp/white_supremacist_train_corpus.pkl'
data = pd.read_pickle(path)

1.85 s ± 23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
%%timeit
# Load white supremacist data

import pandas as pd

path = '../data/white_supremacist_train_corpus.json'
data = pd.read_json(path, orient='table')

25.2 s ± 28.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
