# TRANSFORMERS Base: Pretrained Model to Fine-tuned for Classification

## Data Loading

In [1]:
import pandas as pd
import numpy as np

dataset = pd.read_csv("data/dataset.csv", encoding="utf-8")

dataset.head()

Unnamed: 0,Date_published,Headline,Synopsis,Full_text,Final Status
0,2022-06-21,"Banks holding on to subsidy share, say payment...",The companies have written to the National Pay...,ReutersPayments companies and banks are at log...,Negative
1,2022-04-19,Digitally ready Bank of Baroda aims to click o...,"At present, 50% of the bank's retail loans are...",AgenciesThe bank presently has 20 million acti...,Positive
2,2022-05-27,Karnataka attracted investment commitment of R...,Karnataka is at the forefront in attracting in...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,Positive
3,2022-04-06,Splitting of provident fund accounts may be de...,The EPFO is likely to split accounts only at t...,Getty ImagesThe budget for FY22 had imposed in...,Negative
4,2022-06-14,Irdai weighs proposal to privatise Insurance I...,"Set up in 2009 as an advisory body, IIB collec...",AgenciesThere is a view in the insurance indus...,Positive


## Data Cleaning

Let's quickly do all the needed operations

In [2]:
# Convert to binary
dataset.loc[97, "Final Status"] = "Positive"
dataset["Final Status"] = dataset["Final Status"].map({"Positive": 1, "Negative": 0})

In [3]:
# Check  and clean empty Synopsis
dataset[dataset["Synopsis"].isna()].index

Index([56], dtype='int64')

In [4]:
dataset.loc[56, "Synopsis"] = " "

In [5]:
# replace contractions
def decontracted(phrase):
    phrase = re.sub(r"\'t", "not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [6]:
import re
def preprocess_text(text):
    text = decontracted(str(text))
    text = re.sub("[^a-zA-Z0-9.,!?$/ ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"(?<!\\)(\\\\)?\n", r"\\n", text)
    text = re.sub(r"(?<!\\)(\\\\)?\t", r"\\t", text)
    text = re.sub(r"(?<!\\)(\\\\)?\r", r"\\r", text)
    return text

In [7]:
corpus = []
dataset["processed_article"] = (
    dataset["Headline"] + " " + dataset["Synopsis"] + " " + dataset["Full_text"]
)


# for i in range(len(dataset["Headline"])):
#     dataset.loc[i, "processed_article"] = preprocess_text(dataset["Headline"][i])
#     corpus.append(dataset["processed_article"][i])

for i in range(len(dataset["processed_article"])):
    dataset.loc[i, "processed_article"] = preprocess_text(
        dataset["processed_article"][i]
    )
    corpus.append(dataset["processed_article"][i])

In [8]:
dataset[["processed_article", "Final Status"]]

dataset = dataset.rename(columns={"Final Status": "label"})

column_order = ["processed_article", "label"]
dataset = dataset[column_order]
dataset.head()

Unnamed: 0,processed_article,label
0,"Banks holding on to subsidy share, say payment...",0
1,Digitally ready Bank of Baroda aims to click o...,1
2,Karnataka attracted investment commitment of R...,1
3,Splitting of provident fund accounts may be de...,0
4,Irdai weighs proposal to privatise Insurance I...,1


For ease of usage with Transformer models, we convert the dataset into a Hugging Face dataset and split it into train, validation and test sets.

In [9]:
from datasets import Dataset

dataset_hf = Dataset.from_pandas(dataset)

In [10]:
from datasets import DatasetDict

train_test = dataset_hf.train_test_split(test_size=0.1)

valid_test = train_test['test'].train_test_split(test_size=0.5)

train_valid_test_dataset = DatasetDict({
    'train': train_test['train'],
    'validation': valid_test['train'],
    'test': valid_test['test']
})

In [11]:
train_valid_test_dataset

DatasetDict({
    train: Dataset({
        features: ['processed_article', 'label'],
        num_rows: 360
    })
    validation: Dataset({
        features: ['processed_article', 'label'],
        num_rows: 20
    })
    test: Dataset({
        features: ['processed_article', 'label'],
        num_rows: 20
    })
})

## Fine-tuning a pretrained model

In [12]:
model_name = "distilbert-base-uncased"

### Tokenizer

We first load the tokenizer for our model:

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:
def preprocess_function(sample):
    return tokenizer(sample["processed_article"], truncation=True, padding=True)

In [15]:
tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['processed_article', 'label', 'input_ids', 'attention_mask'],
        num_rows: 360
    })
    validation: Dataset({
        features: ['processed_article', 'label', 'input_ids', 'attention_mask'],
        num_rows: 20
    })
    test: Dataset({
        features: ['processed_article', 'label', 'input_ids', 'attention_mask'],
        num_rows: 20
    })
})

### Loading the model

In [17]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Fine-tuning

The next step is to [fine-tune](https://huggingface.co/docs/transformers/training) the model with our train data. To do so, we can make use of a [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer).
There are several aspects of training that you can specify via [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments).

In [18]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments(
    output_dir="./base_model",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [20]:
trainer.train()

  0%|          | 0/450 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.700499415397644, 'eval_accuracy': 0.45, 'eval_runtime': 1.3086, 'eval_samples_per_second': 15.283, 'eval_steps_per_second': 2.292, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.3157048523426056, 'eval_accuracy': 0.9, 'eval_runtime': 1.1709, 'eval_samples_per_second': 17.08, 'eval_steps_per_second': 2.562, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5032778978347778, 'eval_accuracy': 0.75, 'eval_runtime': 1.23, 'eval_samples_per_second': 16.26, 'eval_steps_per_second': 2.439, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.2372409999370575, 'eval_accuracy': 0.95, 'eval_runtime': 1.5012, 'eval_samples_per_second': 13.323, 'eval_steps_per_second': 1.998, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.23906604945659637, 'eval_accuracy': 0.95, 'eval_runtime': 1.0932, 'eval_samples_per_second': 18.296, 'eval_steps_per_second': 2.744, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.48696452379226685, 'eval_accuracy': 0.85, 'eval_runtime': 1.0523, 'eval_samples_per_second': 19.006, 'eval_steps_per_second': 2.851, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5696629285812378, 'eval_accuracy': 0.8, 'eval_runtime': 1.0476, 'eval_samples_per_second': 19.091, 'eval_steps_per_second': 2.864, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6468771696090698, 'eval_accuracy': 0.8, 'eval_runtime': 0.904, 'eval_samples_per_second': 22.123, 'eval_steps_per_second': 3.318, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6035259366035461, 'eval_accuracy': 0.8, 'eval_runtime': 1.1148, 'eval_samples_per_second': 17.941, 'eval_steps_per_second': 2.691, 'epoch': 9.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6433780789375305, 'eval_accuracy': 0.8, 'eval_runtime': 1.102, 'eval_samples_per_second': 18.149, 'eval_steps_per_second': 2.722, 'epoch': 10.0}
{'train_runtime': 564.5822, 'train_samples_per_second': 6.376, 'train_steps_per_second': 0.797, 'train_loss': 0.2633765326605903, 'epoch': 10.0}


TrainOutput(global_step=450, training_loss=0.2633765326605903, metrics={'train_runtime': 564.5822, 'train_samples_per_second': 6.376, 'train_steps_per_second': 0.797, 'train_loss': 0.2633765326605903, 'epoch': 10.0})

We can check the model's performance in the evaluation set.

In [21]:
trainer.evaluate()

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.2372409999370575,
 'eval_accuracy': 0.95,
 'eval_runtime': 1.6311,
 'eval_samples_per_second': 12.262,
 'eval_steps_per_second': 1.839,
 'epoch': 10.0}

And more importantly, we can check how the model fares in our test set.

In [22]:
trainer.predict(test_dataset=tokenized_dataset["test"])

  0%|          | 0/3 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[-1.761248  ,  1.6921663 ],
       [-0.998291  ,  1.0099177 ],
       [ 1.9263896 , -1.6586193 ],
       [-0.27584478,  0.24548553],
       [ 2.1000998 , -1.7372221 ],
       [-1.800185  ,  1.7427756 ],
       [-1.7752267 ,  1.66463   ],
       [ 0.70275366, -0.73780197],
       [ 2.0145967 , -1.7936059 ],
       [ 2.0738091 , -1.7425429 ],
       [ 0.4593019 , -0.43392986],
       [ 0.99668705, -1.0789608 ],
       [-1.7478807 ,  1.6724325 ],
       [ 2.1232727 , -1.7507538 ],
       [-1.5535468 ,  1.5226    ],
       [ 2.0498128 , -1.7463988 ],
       [ 2.1496825 , -1.7557629 ],
       [ 1.8611399 , -1.5866663 ],
       [ 1.0218056 , -1.0552893 ],
       [ 2.0714223 , -1.7439951 ]], dtype=float32), label_ids=array([1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0]), metrics={'test_loss': 0.8655620813369751, 'test_accuracy': 0.75, 'test_runtime': 1.7028, 'test_samples_per_second': 11.745, 'test_steps_per_second': 1.762})

#### Saving the model

The model can be saved for future loading.

In [23]:
trainer.save_model("./nlp_assignment2_basemodel")

#### Loading and using a saved model

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer2 = AutoTokenizer.from_pretrained("./nlp_assignment2_basemodel")
model2 = AutoModelForSequenceClassification.from_pretrained(
    "./nlp_assignment2_basemodel", num_labels=2
)

To exploit the model, we can use a pipeline.

In [20]:
from transformers import TextClassificationPipeline

pipe = TextClassificationPipeline(model=model2, tokenizer=tokenizer2, truncation=True) #, return_all_scores=True)

In [21]:
pipe("my stocks went down the drain")

[{'label': 'negative', 'score': 0.6491075754165649}]

In [22]:
pipe("I have made a nice profit")

[{'label': 'positive', 'score': 0.6498883962631226}]

We can also use the model in a step-by-step fashion, as follows.

In [23]:
import torch

inputs = input("Sentence to classify: ")
print(inputs)

# tokenize inputs
tokenized_inputs = tokenizer2(inputs, return_tensors="pt")
print(tokenized_inputs)

# obtain model outputs
outputs = model2(**tokenized_inputs)
print(outputs)

# get the most likely label
labels = ['NEGATIVE', 'POSITIVE']
prediction = torch.argmax(outputs.logits)
print(labels[prediction])

the inflation is rising
{'input_ids': tensor([[  101,  1996, 14200,  2003,  4803,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.1403, -1.0085]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
NEGATIVE


In [27]:
model2.push_to_hub(repo_id="nlp-assignment2-basemodel", private=True)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ricardoinacio/nlp-assignment2-basemodel/commit/94c25f7c8e9e6d8951ca5549d1ec3f94d1178602', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='94c25f7c8e9e6d8951ca5549d1ec3f94d1178602', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
tokenizer2.push_to_hub(repo_id="nlp-assignment2-basemodel", private=True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ricardoinacio/nlp-assignment2-basemodel/commit/53d70381d8ea5547c56b12785c001394d64d4fc9', commit_message='Upload tokenizer', commit_description='', oid='53d70381d8ea5547c56b12785c001394d64d4fc9', pr_url=None, pr_revision=None, pr_num=None)