# Test GPU functionality

In [1]:
import torch
print(torch.cuda.is_available())
torch.__version__

True


'1.12.1'

# Format input data
Put it in a HuggingFace Dataset

In [1]:
# Load white supremacist data
import pandas as pd

path = '../tmp/white_supremacist_train_corpus.pkl'
ws_data = pd.read_pickle(path).assign(label=1)
ws_data.info()

# Load neutral data
path = '../tmp/neutral_train_corpus.pkl'
neutral_data = pd.read_pickle(path).assign(label=0)
neutral_data.info()

# Combine, shuffle and sample if desired
selected_cols = ['text', 'label']
data = pd.concat([ws_data[selected_cols], neutral_data[selected_cols]])
data.info()

# Make a HuggingFace Dataset
from datasets import Dataset

dataset = Dataset.from_pandas(data).train_test_split(test_size=0.1)
dataset

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_data = dataset.map(preprocess, batched=True)

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

<class 'pandas.core.frame.DataFrame'>
Index: 4850296 entries, qian2018_0 to pruden2022_161
Data columns (total 6 columns):
 #   Column     Dtype              
---  ------     -----              
 0   text       object             
 1   timestamp  datetime64[ns, UTC]
 2   dataset    object             
 3   source     object             
 4   domain     object             
 5   label      int64              
dtypes: datetime64[ns, UTC](1), int64(1), object(4)
memory usage: 259.0+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 4374274 entries, reddit_match_0 to twitter_match_89106
Data columns (total 7 columns):
 #   Column      Dtype              
---  ------      -----              
 0   text        object             
 1   timestamp   datetime64[ns, UTC]
 2   dataset     object             
 3   source      object             
 4   domain      object             
 5   word_count  int64              
 6   label       int64              
dtypes: datetime64[ns, UTC](1), int64(2), object

  0%|          | 0/8303 [00:00<?, ?ba/s]

  0%|          | 0/923 [00:00<?, ?ba/s]



# Train

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_metric
import numpy as np

metrics = {'accuracy': load_metric('accuracy'), 
           'f1': load_metric('f1')}

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {metric_name: metric.compute(predictions=predictions, references=labels) for metric_name, metric in metrics.items()}

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("results/checkpoint-160")

batch_size = 16
checkpoint = batch_size * int(1e4)
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    # evaluation_strategy='steps',
    save_strategy='steps',
    logging_steps=checkpoint,
    # eval_steps=checkpoint,
    save_steps=checkpoint,
    # load_best_model_at_end=True,
    # metric_for_best_model='f1'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/mamille3/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.3",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /home/mamille3/.cache/huggingface/transfor

Step,Training Loss
160000,0.3146
320000,0.2792


Saving model checkpoint to ./results/checkpoint-160000
Configuration saved in ./results/checkpoint-160000/config.json
Model weights saved in ./results/checkpoint-160000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-160000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-160000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-320000
Configuration saved in ./results/checkpoint-320000/config.json
Model weights saved in ./results/checkpoint-320000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-320000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-320000/special_tokens_map.json


In [9]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, id. If text, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922457
  Batch size = 32


KeyboardInterrupt: 

# Evaluate on unseen test dataset

In [11]:
path = '../tmp/annotated_test_corpus.pkl'
annotated = pd.read_pickle(path)
annotated.info()

alatawi2021 = annotated.query('dataset=="alatawi2021"')
test_dataset = Dataset.from_pandas(alatawi2021)
tokenized_test = test_dataset.map(preprocess, batched=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1999 entries, alatawi2021_0 to alatawi2021_1998
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     1999 non-null   object
 1   dataset  1999 non-null   object
 2   source   1999 non-null   object
 3   domain   1999 non-null   object
 4   label    1999 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 93.7+ KB


  0%|          | 0/2 [00:00<?, ?ba/s]

In [12]:
trainer.evaluate(tokenized_test)

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, domain, text, dataset, source. If id, domain, text, dataset, source are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1999
  Batch size = 32


{'eval_loss': 1.0954207181930542,
 'eval_accuracy': {'accuracy': 0.5597798899449725},
 'eval_f1': {'f1': 0.7018970189701897}}

# Old/1-time

In [3]:
%%timeit
# Load white supremacist data

import pandas as pd

path = '../tmp/white_supremacist_train_corpus.pkl'
data = pd.read_pickle(path)

1.85 s ± 23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
%%timeit
# Load white supremacist data

import pandas as pd

path = '../data/white_supremacist_train_corpus.json'
data = pd.read_json(path, orient='table')

25.2 s ± 28.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
