# Import libraries

In [1]:
!pip install --upgrade datasets evaluate transformers accelerate
# Data processing
import pandas as pd
import numpy as np

# Modeling
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline
from tensorflow.nn import softmax
# Hugging Face Dataset
from datasets import Dataset
# Model performance evaluation
import evaluate

import torch
torch.cuda.empty_cache()

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.1

# Import the spam dataset (the dataset is taken from Kaggle)

In [2]:
# Read the raw CSV, encode the text category as a binary target
# (1 for 'spam', 0 for anything else) and discard the original text column.
df = (
    pd.read_csv("spam.csv")
    .assign(label=lambda frame: frame["Category"].eq("spam").astype("int64"))
    .drop(columns=["Category"])
)
df.head()

Unnamed: 0,Message,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


# Imbalanced dataset

In [3]:
# Class balance check: number of rows per label value (1 = spam, 0 = ham).
df["label"].value_counts()

0    4825
1     747
Name: label, dtype: int64

# Downsample and split dataset into train:val:test with 60:20:20 ratio

In [4]:
def split_df(df, frac, random_state=42):
    """Randomly split a DataFrame into two disjoint parts.

    Args:
        df: DataFrame to split.
        frac: Fraction of the rows placed in the first part.
        random_state: Seed forwarded to pandas' sampler. Defaults to 42,
            the value that used to be hard-coded, so existing calls keep
            producing the same split.

    Returns:
        A ``(train_data, test_data)`` tuple. ``train_data`` holds the sampled
        rows in sampling order; ``test_data`` holds every remaining row in its
        original order.
    """
    # Sample row *positions* instead of dropping by index label: dropping by
    # label removes every row sharing a label, which silently loses rows when
    # the index contains duplicates (e.g. after an un-reset concat).
    row_positions = pd.Series(np.arange(len(df)))
    train_positions = row_positions.sample(frac=frac, random_state=random_state).to_numpy()

    in_train = np.zeros(len(df), dtype=bool)
    in_train[train_positions] = True

    train_data = df.iloc[train_positions]
    test_data = df.iloc[~in_train]
    return train_data, test_data

# Separate the two classes so the majority class (ham) can be downsampled.
df_spam = df[df['label']==1]
df_ham = df[df['label']==0]

# Draw as many ham rows as there are spam rows. The seed is fixed so the
# balanced subset (and therefore every downstream metric) is the same on
# every "Restart & Run All"; without it the ham sample changes per run.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)

# 60% of each class goes to training ...
train_spam, temp_spam = split_df(df_spam, 0.6)
train_ham, temp_ham = split_df(df_ham_downsampled, 0.6)

# ... and the remaining 40% is halved into validation and test.
val_spam, test_spam = split_df(temp_spam, 0.5)
val_ham, test_ham = split_df(temp_ham, 0.5)

# Recombine the per-class pieces into the final splits.
train_df = pd.concat([train_spam, train_ham])
val_df = pd.concat([val_spam, val_ham])
test_df = pd.concat([test_spam, test_ham])

In [5]:
# Report how many rows ended up in each split.
n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
print(f'train:{n_train}, val:{n_val}, test:{n_test}')

train:896, val:300, test:298


# Load the BERT tokenizer, convert the DataFrames into Hugging Face datasets, and tokenize them

In [6]:
from transformers import AutoTokenizer
# Tokenizer matching the "bert-base-cased" checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(datapoint):
    """Tokenize the "Message" field of a dataset example or batch.

    Calls the tokenizer with ``padding="max_length"`` and ``truncation=True``
    and returns its output unchanged.
    """
    messages = datapoint["Message"]
    return tokenizer(messages, padding="max_length", truncation=True)

# Wrap each pandas split in a Hugging Face Dataset (train, val, test order).
hg_train_data, hg_val_data, hg_test_data = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

# Tokenize every split in batches, again in train, val, test order.
tokenized_train_datasets, tokenized_val_datasets, tokenized_test_datasets = (
    hg_split.map(tokenize_function, batched=True)
    for hg_split in (hg_train_data, hg_val_data, hg_test_data)
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/896 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/298 [00:00<?, ? examples/s]

In [7]:
# Show the feature names and row count of each tokenized split.
for tokenized_split in (
    tokenized_train_datasets,
    tokenized_val_datasets,
    tokenized_test_datasets,
):
    print(tokenized_split)

Dataset({
    features: ['Message', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 896
})
Dataset({
    features: ['Message', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 300
})
Dataset({
    features: ['Message', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 298
})


# Build a BERT model with transfer learning on a custom dataset

In [19]:
from transformers import AutoModelForSequenceClassification

# Pre-trained "bert-base-cased" checkpoint with a two-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written, and the seed for the run.
    output_dir="./test_trainer/",
    seed=42,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Log, evaluate and checkpoint once per epoch.
    # NOTE(review): the *_steps values below presumably only take effect with
    # a 'steps' strategy - confirm against the installed transformers version.
    logging_strategy='epoch',
    logging_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    save_strategy='epoch',
    save_steps=100,
    # Reload the best checkpoint once training finishes.
    load_best_model_at_end=True,
)

# Accuracy metric, loaded lazily on the first evaluation and reused afterwards
# so that evaluate.load() is not repeated on every evaluation pass.
_accuracy_metric = None


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The
            predicted class is the argmax of ``logits`` along axis 1.

    Returns:
        The result of the "accuracy" metric's ``compute`` call for the
        predicted classes against ``labels``.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        _accuracy_metric = evaluate.load("accuracy")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Wire the model, the training configuration, the train/validation splits and
# the metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_val_datasets,
    train_dataset=tokenized_train_datasets,
)

# Run fine-tuning; as the cell's last expression, the result is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.0579,0.11388,0.976667
2,0.0465,0.131108,0.973333
3,0.0262,0.118708,0.98


TrainOutput(global_step=672, training_loss=0.04352378419467381, metrics={'train_runtime': 319.5735, 'train_samples_per_second': 8.411, 'train_steps_per_second': 2.103, 'total_flos': 707242516807680.0, 'train_loss': 0.04352378419467381, 'epoch': 3.0})

# Predict on the test dataset

In [22]:
# Run inference on the held-out test split.
y_test_predict = trainer.predict(tokenized_test_datasets)

# Convert logits to class probabilities with PyTorch, the framework the model
# is trained with, instead of pulling TensorFlow in for a single softmax call.
# The result is a plain NumPy array, normalised over the last (class) axis.
y_test_probabilities = torch.softmax(
    torch.as_tensor(y_test_predict.predictions, dtype=torch.float32),
    dim=-1,
).numpy()

# Predicted class = most probable class; the true labels come from the same call.
y_test_pred_labels = np.argmax(y_test_probabilities, axis=1)
y_test_actual_labels = y_test_predict.label_ids

# Last expression of the cell: the evaluation metrics on the test split.
trainer.evaluate(tokenized_test_datasets)

{'eval_loss': 0.11869163066148758,
 'eval_accuracy': 0.9731543624161074,
 'eval_runtime': 11.1754,
 'eval_samples_per_second': 26.666,
 'eval_steps_per_second': 6.711,
 'epoch': 3.0}

In [23]:
# F1 score of the predicted test labels against the true test labels.
metric_f1 = evaluate.load("f1")
metric_f1.compute(
    references=y_test_actual_labels,
    predictions=y_test_pred_labels,
)

{'f1': 0.9726027397260273}

In [18]:
# Tokenizer and model are written to the same directory.
save_dir = './sentiment_transfer_learning_transformer/'

# Tokenizer first, then the model held by the trainer.
tokenizer.save_pretrained(save_dir)
trainer.save_model(save_dir)