In [1]:
import pandas as pd

from transformers import AutoTokenizer

## Import data

In [2]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

df = pd.read_json("../data/data.json")
df.columns

Index(['content', 'followers', 'following', 'retweet', 'account_category',
       'created_at', 'troll', 'orig_index', 'tokens'],
      dtype='object')

# Tokenize tweet content

In [3]:
# df['tokens'] = df['content'].apply(lambda x: tokenizer(x, truncation=True))
from transformers import DataCollatorWithPadding
from datasets import Dataset

from sklearn.model_selection import train_test_split

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

df['label'] = df['troll'].astype(int)
df['text'] = df['content']

train, test = train_test_split(df[['text', 'label']], test_size=0.2)

train = Dataset.from_pandas(train[:80000])
test = Dataset.from_pandas(test[:20000])

In [4]:
def preprocess_function(examples):

    return tokenizer(examples["text"], truncation=True)

tokenized_tweet =  {
    "train" : train.map(preprocess_function, batched=True),
    "test" : test.map(preprocess_function, batched=True)
}

  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

# Train

In [5]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, default_data_collator
from sklearn.model_selection import train_test_split
from datasets import Dataset


In [7]:
train.shape[0]

80000

In [67]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="../results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweet['train'],
    eval_dataset=tokenized_tweet['test'],
    tokenizer=tokenizer
)

trainer.train()

loading configuration file config.json from cache at /Users/pietertolsma/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /Users/pietertolsma/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-u

Step,Training Loss
500,0.261
1000,0.1853
1500,0.1778
2000,0.1613


Saving model checkpoint to ../results/checkpoint-500
Configuration saved in ../results/checkpoint-500/config.json
Model weights saved in ../results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../results/checkpoint-1000
Configuration saved in ../results/checkpoint-1000/config.json
Model weights saved in ../results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ../results/checkpoint-1500
Configuration saved in ../results/checkpoint-1500/config.json
Model weights saved in ../results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ../results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ../results/checkpoint-15

KeyboardInterrupt: 

In [9]:
import numpy as np

model = AutoModelForSequenceClassification.from_pretrained("../results/checkpoint-2000/")
training_args = TrainingArguments(
    output_dir="../results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweet['train'],
    eval_dataset=tokenized_tweet['test'],
    tokenizer=tokenizer
)

troll_tweets = list(df[df['label'] == 1]['text'][:1000])
nontroll_tweets = list(df[df['label'] == 0]['text'][:1000])

test_tweet = "Trump is the best!"
tokenized = [tokenizer(tweet, truncation=True, padding=True) for tweet in troll_tweets]

troll_preds = trainer.predict(tokenized).predictions

tokenized = [tokenizer(tweet, truncation=True, padding=True) for tweet in nontroll_tweets]
nontroll_preds = trainer.predict(tokenized).predictions

def discretize(preds):
    return np.apply_along_axis(lambda r: np.argmax(r), 1, preds)

print("=== TROLL TWEETS ===")
trolls = discretize(troll_preds)
print(f"{sum(trolls)}/{len(trolls)} trolls detected")

print("=== NONTROLL TWEETS ===")
nontrolls = discretize(nontroll_preds)
print(f"{len(nontrolls) - sum(nontrolls)}/{len(nontrolls)} valid users correctly detected")

loading configuration file ../results/checkpoint-2000/config.json
Model config DistilBertConfig {
  "_name_or_path": "../results/checkpoint-2000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file ../results/checkpoint-2000/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at ../results/checkpoint-

***** Running Prediction *****
  Num examples = 1000
  Batch size = 16


=== TROLL TWEETS ===
978/1000 trolls detected
=== NONTROLL TWEETS ===
835/1000 valid users correctly detected


In [58]:
from datetime import datetime
trainer.save_model(f"../results/model{datetime.now()}")

Saving model checkpoint to ../results/model2022-11-08 17:09:11.948703
Configuration saved in ../results/model2022-11-08 17:09:11.948703/config.json
Model weights saved in ../results/model2022-11-08 17:09:11.948703/pytorch_model.bin
tokenizer config file saved in ../results/model2022-11-08 17:09:11.948703/tokenizer_config.json
Special tokens file saved in ../results/model2022-11-08 17:09:11.948703/special_tokens_map.json


['"Women of colour online are taking control of their own representation through a new movement called Art Hoe.... https://t.co/cw4XoJxnvB', 'Συμπέρασμα των #ElectionDay ειναι οτι η αντίστοιχη Β Αθηνας ειναι το Ohio...Επίσης οι δημοσκοποι παντου ειναι ΓΤΠ', '@timkaine maybe not have you online team post while you are on national tv? Kinda ruins the illusion.', "'@BlackMoses2015  i love these pictures'", 'RT @Yamiche: Jay Z on Donald Trump: "He cannot be our president. Once you divide us you weaken us. We are stronger together."', "'@LiliMoM Like people assuming we support trump bc we won't vote hillary. It's one of the stupidest things going on on twitter.'", 'Congratulations @realDonaldTrump  You won Florida! https://t.co/88qCOLlcM6', "If you were to pay that $11-which we clearly didn't-you'd drive past a row of flags to a gift shop w George Washington shot glasses.", 'RIGGED! But you can protect your vote #StopSoros Call ttp://pastebin.com/ExZ42cEt #TRUMP https://t.co/XvhoVlaQk2 http