In [1]:
from typing import List
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from sklearn.utils import shuffle
from datasets import Dataset, load_dataset

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [35]:
full_file = "data/IBC/ibc.csv"
sample_file = "data/IBC/sample_ibc.csv"
ibc = pd.read_csv(full_file)
sample = pd.read_csv(sample_file)
# ibc = shuffle(pd.read_csv(full_file), random_state=1)

dsq = sample["sentence"].to_list()
# print(dsq)
ft_ibc = ibc.loc[~ibc["sentence"].isin(dsq), :].copy()
# print(ft_ibc.label.value_counts(), ibc.label.value_counts())

ft_ibc = shuffle(ft_ibc, random_state=1)
ft_ibc.iloc[0]

sentence    These changes , they argue , `` promise to dra...
label                                                 Liberal
Name: 1272, dtype: object

In [36]:
options = ["liberal", "neutral", "conservative"]

def add_to_dataset(dataset, sentence, label):
    if label == 'liberal':
        result = 0
    elif label == 'neutral':
        result = 1
    else:
        result = 2

    data = {"sentence": sentence,
            "label": result}
    dataset.append(data)

In [51]:
sample_dataset = []

for index in range(len(sample)):
    sentence = ft_ibc.iloc[index]["sentence"]
    add_to_dataset(sample_dataset, sentence, ft_ibc.iloc[index]["label"].lower())

sample_ex = sample_dataset[0]
sample_ex


{'sentence': 'These changes , they argue , `` promise to dramatically reduce health care costs and improve the quality of care for Medicare enrollees and other Americans .',
 'label': 0}

In [37]:
dataset = []

for index in range(len(ft_ibc)):
    sentence = ft_ibc.iloc[index]["sentence"]
    add_to_dataset(dataset, sentence, ft_ibc.iloc[index]["label"].lower())

example = dataset[0]
example

{'sentence': 'These changes , they argue , `` promise to dramatically reduce health care costs and improve the quality of care for Medicare enrollees and other Americans .',
 'label': 0}

In [38]:
test_split = (int) (0.1*len(dataset))
test_set = dataset[:test_split]
train_set = dataset[test_split:]

print(f"Size of test set: {len(test_set)}, size of train set: {len(train_set)}, no overlap: {len(train_set)+len(test_set)==len(dataset)}")

In [39]:
# load into Datasets
train_ds = Dataset.from_pandas(pd.DataFrame(data=train_set))
test_ds = Dataset.from_pandas(pd.DataFrame(data=test_set))

test_ds

Dataset({
    features: ['sentence', 'label'],
    num_rows: 416
})

In [40]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def preprocess_function(example):
    return tokenizer(example["sentence"], truncation=True)

preprocess_function(example)



{'input_ids': [101, 2122, 3431, 1010, 2027, 7475, 1010, 1036, 1036, 4872, 2000, 12099, 5547, 2740, 2729, 5366, 1998, 5335, 1996, 3737, 1997, 2729, 2005, 27615, 25612, 10285, 1998, 2060, 4841, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [41]:
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

tokenized_test

Map:   0%|          | 0/3752 [00:00<?, ? examples/s]

Map:   0%|          | 0/416 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'label', 'input_ids', 'attention_mask'],
    num_rows: 416
})

In [42]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [43]:
id2label = {0: "Liberal", 1: "Neutral", 2: "Conservative"}
label2id = {"Liberal": 0, "Neutral": 1, "Conservative": 2}

In [44]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [None]:
training_args = TrainingArguments(
    output_dir="pid-ft-distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.9487107992172241, 'eval_accuracy': 0.5384615384615384, 'eval_runtime': 56.9813, 'eval_samples_per_second': 7.301, 'eval_steps_per_second': 0.456, 'epoch': 1.0}
{'train_runtime': 1281.7634, 'train_samples_per_second': 2.927, 'train_steps_per_second': 0.183, 'train_loss': 0.6375928513547208, 'epoch': 1.0}


TrainOutput(global_step=235, training_loss=0.6375928513547208, metrics={'train_runtime': 1281.7634, 'train_samples_per_second': 2.927, 'train_steps_per_second': 0.183, 'total_flos': 73086050964960.0, 'train_loss': 0.6375928513547208, 'epoch': 1.0})

In [46]:
model.push_to_hub("pid-ft-distilbert")
tokenizer.push_to_hub("pid-ft-distilbert")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lhz1/pid-ft-distilbert/commit/e989feec2e241a00cbc2adbca2ffe5f6705dd353', commit_message='Upload tokenizer', commit_description='', oid='e989feec2e241a00cbc2adbca2ffe5f6705dd353', pr_url=None, pr_revision=None, pr_num=None)

In [56]:
# evaluate on same sample dataset
infer_tokenizer = AutoTokenizer.from_pretrained("lhz1/pid-ft-distilbert")
ft_distilbert = AutoModelForSequenceClassification.from_pretrained("lhz1/pid-ft-distilbert")


def run_model():
    ret = []
    for example in sample_dataset:
        inputs = infer_tokenizer(example["sentence"], return_tensors="pt")

        with torch.no_grad():
            logits = ft_distilbert(**inputs).logits

            predicted_class_id = logits.argmax().item()
            print(predicted_class_id)
            ret.append(ft_distilbert.config.id2label[predicted_class_id])

    return ret

distilbert_preds = run_model() 

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [57]:
distilbert_preds

['Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Liberal',
 'Li