In [161]:
import pandas as pd
import numpy as np
import re
import nltk
import datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from sentence_transformers import SentenceTransformer
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [162]:
df_polusa = pd.read_csv("polusa_2019_5k.csv")

In [163]:
df_polusa.shape

(5000, 12)

In [164]:
set(df_polusa["outlet"])

{'ABC News',
 'Breitbart',
 'CBS News',
 'Fox News',
 'HuffPost',
 'Los Angeles Times',
 'NBC News',
 'NPR',
 'National Review',
 'PBS',
 'Reuters',
 'Slate',
 'The Daily Caller',
 'The Guardian',
 'The Nation',
 'The New York Times',
 'The State',
 'Townhall',
 'USA Today',
 'Yahoo! News'}

In [165]:
set(df_polusa["political_leaning"])

{'CENTER', 'LEFT', 'RIGHT'}

In [166]:
df_polusa['political_leaning'].value_counts() / len(df_polusa)

CENTER    0.5688
LEFT      0.2624
RIGHT     0.1688
Name: political_leaning, dtype: float64

In [169]:
df_polusa = df_polusa.drop(columns={"Unnamed: 0"})
df_polusa.head(2)

Unnamed: 0,id,date_publish,outlet,headline,lead,body,authors,domain,url,political_leaning,head_lead_body
0,54579766,2019-08-26 17:10:59,National Review,Kevin Williamson & Bill Maher: Author Talks Mo...,Kevin Williamson & Bill Maher: Author Talks Mo...,Kevin Williamson & Bill Maher: Author Talks Mo...,Kyle Smith;Philip Magness;Robert Bryce;Dan Mcl...,www.nationalreview.com,https://www.nationalreview.com/corner/kevin-im...,RIGHT,Kevin Williamson & Bill Maher: Author Talks Mo...
1,54581007,2019-08-26 17:11:30,National Review,Federal Prosecutors to Seek Death Penalty for ...,Federal Prosecutors to Seek Death Penalty for ...,Federal Prosecutors to Seek Death Penalty for ...,Mairead Mcardle;Kyle Smith;Philip Magness;Dan ...,www.nationalreview.com,https://www.nationalreview.com/news/federal-pr...,RIGHT,Federal Prosecutors to Seek Death Penalty for ...


### Preprocess Dataset

In [170]:
df_transformer_format = df_polusa[["head_lead_body", "political_leaning"]]
convert_to_num = {"LEFT": 0, "CENTER": 1, "RIGHT": 2}
df_transformer_format = df_transformer_format.replace({"political_leaning": convert_to_num})
df_transformer_format = df_transformer_format.rename(columns={"head_lead_body":"text", "political_leaning":"label"})

In [171]:
df_train = df_transformer_format[:4000]
df_dev_test = df_transformer_format[-1000:-500]
df_test = df_transformer_format[-500:]

In [172]:
print(len(df_train))
print(len(df_dev_test))
print(len(df_test))

4000
500
500


In [173]:
train_dataset = datasets.Dataset.from_dict(df_train)
dev_test_dataset = datasets.Dataset.from_dict(df_dev_test)
test_dataset = datasets.Dataset.from_dict(df_test)

In [218]:
test_dataset["text"][1]

'Disputed Ellis Avenue residential-commercial project to return to Huntington Beach council###Disputed Ellis Avenue residential-commercial project to return to Huntington Beach council###Disputed Ellis Avenue residential-commercial project to return to Huntington Beach council'

In [175]:
def preprocess_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True)

In [180]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /Users/matthewlucich/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /Users/matthewlucich/.cache/huggingface/transformers

In [219]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dev_test = dev_test_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [220]:
tokenized_train = tokenized_train.remove_columns(['text'])
tokenized_dev_test = tokenized_dev_test.remove_columns(['text'])
tokenized_test = tokenized_test.remove_columns(['text'])

### Classification Model: distilbert-base-uncased (transformer)

In [186]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /Users/matthewlucich/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 3052

In [190]:
def compute_metrics(pred):
    """Sourced from: https://huggingface.co/transformers/v3.0.2/training.html"""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, labels=[0, 1, 2])
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [191]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [195]:
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [196]:
trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1500


Step,Training Loss
500,0.7871
1000,0.4705
1500,0.1989


Saving model checkpoint to tmp_trainer/checkpoint-500
Configuration saved in tmp_trainer/checkpoint-500/config.json
Model weights saved in tmp_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-1000
Configuration saved in tmp_trainer/checkpoint-1000/config.json
Model weights saved in tmp_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-1500
Configuration saved in tmp_trainer/checkpoint-1500/config.json
Model weights saved in tmp_trainer/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-1500/tokenizer_config.json
Special tokens file saved in tmp_traine

TrainOutput(global_step=1500, training_loss=0.48549241638183593, metrics={'train_runtime': 5655.8907, 'train_samples_per_second': 2.122, 'train_steps_per_second': 0.265, 'total_flos': 385139271457152.0, 'train_loss': 0.48549241638183593, 'epoch': 3.0})

In [197]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


Trainer is attempting to log a value of "[0.60773481 0.69166667 0.4556962 ]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.61111111 0.66135458 0.52173913]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.6043956  0.72489083 0.40449438]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 1.367005467414856,
 'eval_accuracy': 0.624,
 'eval_f1': array([0.60773481, 0.69166667, 0.4556962 ]),
 'eval_precision': array([0.61111111, 0.66135458, 0.52173913]),
 'eval_recall': array([0.6043956 , 0.72489083, 0.40449438]),
 'eval_runtime': 46.0903,
 'eval_samples_per_second': 10.848,
 'eval_steps_per_second': 1.367,
 'epoch': 3.0}

In [224]:
preds_data = trainer.predict(tokenized_test)

***** Running Prediction *****
  Num examples = 500
  Batch size = 8


In [225]:
preds_data.metrics

{'test_loss': 1.0312914848327637,
 'test_accuracy': 0.71,
 'test_f1': array([0.51851852, 0.81322314, 0.59217877]),
 'test_precision': array([0.46666667, 0.80392157, 0.71621622]),
 'test_recall': array([0.58333333, 0.82274247, 0.5047619 ]),
 'test_runtime': 53.3258,
 'test_samples_per_second': 9.376,
 'test_steps_per_second': 1.181}

In [206]:
#trainer.save_model(output_dir="final-model")