# Format input data
Load the white supremacist and neutral corpora, label them, and put the combined data in a HuggingFace Dataset.

In [7]:
# Read the pickled white supremacist training corpus, then tag every row with label 1
import pandas as pd

path = '../tmp/white_supremacist_train_corpus.pkl'
ws_data = pd.read_pickle(path)
ws_data = ws_data.assign(label=1)
ws_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4850296 entries, qian2018_0 to pruden2022_161
Data columns (total 6 columns):
 #   Column     Dtype              
---  ------     -----              
 0   text       object             
 1   timestamp  datetime64[ns, UTC]
 2   dataset    object             
 3   source     object             
 4   domain     object             
 5   label      int64              
dtypes: datetime64[ns, UTC](1), int64(1), object(4)
memory usage: 259.0+ MB


In [8]:
# Read the pickled neutral training corpus, then tag every row with label 0
path = '../tmp/neutral_train_corpus.pkl'
neutral_data = pd.read_pickle(path)
neutral_data = neutral_data.assign(label=0)
neutral_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4285167 entries, reddit_match_0 to news_match_12866
Data columns (total 7 columns):
 #   Column      Dtype              
---  ------      -----              
 0   text        object             
 1   timestamp   datetime64[ns, UTC]
 2   dataset     object             
 3   source      object             
 4   domain      object             
 5   word_count  int64              
 6   label       int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(4)
memory usage: 261.5+ MB


In [9]:
# Combine both corpora, shuffle, and optionally down-sample.
# Only the text and its 0/1 label are kept; the other columns are dropped here.
selected_cols = ['text', 'label']
n_samples = 1000  # number of rows to draw; set to None to shuffle and keep every row
seed = 42  # fixed seed so the same rows are drawn on every run

combined = pd.concat([ws_data[selected_cols], neutral_data[selected_cols]])
if n_samples is None:
    # frac=1 returns all rows in random order (a pure shuffle)
    data = combined.sample(frac=1, random_state=seed)
else:
    data = combined.sample(n_samples, random_state=seed)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, reddit_match_2537618 to papasavva2020_502104
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 23.4+ KB


In [10]:
# Make a HuggingFace Dataset and hold out 10% of the rows as a test split
from datasets import Dataset

# A fixed seed makes the train/test partition identical across kernel restarts,
# so results from different runs are comparable.
dataset = Dataset.from_pandas(data).train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 900
    })
    test: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 100
    })
})

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess(examples):
    """Tokenize the "text" field of a batch of examples, with truncation enabled.

    No padding is requested here; the call only passes truncation=True.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# batched=True hands preprocess a batch of examples per call instead of one row at a time
tokenized_data = dataset.map(preprocess, batched=True)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
from transformers import DataCollatorWithPadding

# Collator built from the same tokenizer; it is handed to the Trainer below.
# The tokenization step above did not pad, so padding is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# DistilBERT with a sequence-classification head; num_labels=2 matches the
# two label values (0 = neutral, 1 = white supremacist) assigned when loading the data.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [23]:
import torch

# Environment check: is a CUDA device visible to PyTorch in this kernel?
torch.cuda.is_available()

False

In [24]:
# Environment check: installed PyTorch version string
torch.__version__

'1.12.1'

In [20]:
# NOTE(review): this displays the torch.version module object (i.e. its file path),
# not a version string. If the goal was the CUDA build version, torch.version.cuda
# is probably what was intended; confirm.
torch.version

<module 'torch.version' from '/home/mamille3/white_supremacist_lang/conda_env/lib/python3.10/site-packages/torch/version.py'>

In [17]:
# Fine-tuning configuration; output_dir is where the Trainer writes its outputs.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Pin the reporting behaviour explicitly. Leaving it unset relies on a default
    # that the library announces will change in v5 (from all installed integrations
    # to none); "all" keeps today's behaviour and silences that notice.
    report_to="all",
)

# NOTE(review): eval_dataset is supplied, but nothing in this cell requests
# evaluation (no evaluation-strategy argument, no trainer.evaluate() call).
# Confirm whether evaluation during training was intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# torch.cuda.is_available() returned False in this environment, so this runs on CPU.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, id. If text, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 900
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 285


Step,Training Loss


KeyboardInterrupt: 

# Old / one-time experiments

In [3]:
%%timeit
# Benchmark: time loading the white supremacist corpus from the pickle file

import pandas as pd

path = '../tmp/white_supremacist_train_corpus.pkl'
data = pd.read_pickle(path)

1.85 s ± 23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
%%timeit
# Benchmark: time loading the white supremacist corpus from the JSON file (orient='table')

import pandas as pd

path = '../data/white_supremacist_train_corpus.json'
data = pd.read_json(path, orient='table')

25.2 s ± 28.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
