# ELECTRA
This notebook aims to use transfer learning on a ELECTRA model to perform text classification and detect suicidal text.

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install accelerate -U



In [None]:
!pip install -qqq transformers datasets wandb

In [None]:
# Import packages
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric

In [None]:
# Specify GPU
device = torch.device("cuda")

In [None]:
# Change to your own directory
try:
    os.chdir("/content/drive/MyDrive/suicidal-text-detection")
    print("Directory changed")
except OSError:
    print("Error: Can't change the Current Working Directory")

Directory changed


## Define constants

In [None]:
# Define constants
EPOCHS = 1
BATCH_SIZE = 6
LEARNING_RATE = 1e-5
SEED = 4222

MODEL_SAVE_PATH = "Models/electra"
MODEL_CHECKPOINT_PATH = "Models/electra_checkpoint"
MODEL_LOGGING_PATH = "Models/electra_checkpoint/logs"

WANDB_ENTITY = "dennisrkibet"
WANDB_PROJECT = "suicide_detection"
WANDB_RUN = "electra"

## Load dataset

In [None]:
# Load dataset
df = pd.read_csv('Data/suicide_detection_final_cleaned.csv', header=0)
df.drop(columns=['cleaned_text'], inplace=True)
df['class'] = df['class'].map({'suicide': 1, 'non-suicide': 0})
df.head()

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don't get affected by compliments...,0
2,Finally 2020 is almost over... So I can never ...,0
3,i need helpjust help me im crying so hard,1
4,It ends tonight.I can’t do it anymore. \nI quit.,1


In [None]:
# Split dataset into train, validation and test sets
train, temp = train_test_split(df,
                               random_state=SEED,
                               test_size=0.2,
                               stratify=df['class'])

val, test = train_test_split(temp,
                             random_state=SEED,
                             test_size=0.5,
                             stratify=temp['class'])

## Load ELECTRA Model

In [None]:
# Load ELECTRA tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

In [None]:
def dataset_conversion(train, test, val):
  """Converts pandas dataframe to Dataset."""

  train.reset_index(drop=True, inplace=True)
  test.reset_index(drop=True, inplace=True)
  val.reset_index(drop=True, inplace=True)

  train_dataset = Dataset.from_pandas(train)
  test_dataset = Dataset.from_pandas(test)
  val_dataset = Dataset.from_pandas(val)

  return DatasetDict({"train": train_dataset,
                      "test": test_dataset,
                      "val": val_dataset})

raw_datasets = dataset_conversion(train, test, val)

In [None]:
def tokenize_function(dataset):
    return tokenizer(dataset["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/140661 [00:00<?, ? examples/s]

Map:   0%|          | 0/17583 [00:00<?, ? examples/s]

Map:   0%|          | 0/17583 [00:00<?, ? examples/s]

In [None]:
# Tokenise datasets
SAMPLE_SIZE = 20
small_train_dataset = tokenized_datasets["train"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
small_test_dataset = tokenized_datasets["test"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
small_val_dataset = tokenized_datasets["val"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))

full_train_dataset = tokenized_datasets["train"]
full_test_dataset = tokenized_datasets["test"]
full_val_dataset = tokenized_datasets["val"]

In [None]:
# Import ELECTRA-base pretrained model
model = AutoModelForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=2)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Login wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mdennisrkibet[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Initialise wandb
wandb.init(settings=wandb.Settings(start_method="fork"), project=WANDB_PROJECT, entity=WANDB_ENTITY, name=WANDB_RUN)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016678907366667772, max=1.0…

In [None]:
# Define custom metrics for computation
# def compute_metrics(eval_pred):
#     metric_acc = load_metric("accuracy")
#     metric_rec = load_metric("recall")
#     metric_pre = load_metric("precision")
#     metric_f1 = load_metric("f1")

#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)

#     accuracy = metric_acc.compute(predictions=predictions, references=labels)["accuracy"]
#     recall = metric_rec.compute(predictions=predictions, references=labels)["recall"]
#     precision = metric_pre.compute(predictions=predictions, references=labels)["precision"]
#     f1 = metric_f1.compute(predictions=predictions, references=labels)["f1"]

#     return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1}


def compute_metrics(eval_pred):
    metric_acc = load_metric("accuracy")
    metric_rec = load_metric("recall")
    metric_pre = load_metric("precision")
    metric_f1 = load_metric("f1")
    loss_fn = torch.nn.CrossEntropyLoss()

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Compute loss
    logits_tensor = torch.from_numpy(logits)
    labels_tensor = torch.from_numpy(labels)
    loss = loss_fn(logits_tensor, labels_tensor).item()

    # Compute metrics
    accuracy = metric_acc.compute(predictions=predictions, references=labels)["accuracy"]
    recall = metric_rec.compute(predictions=predictions, references=labels)["recall"]
    precision = metric_pre.compute(predictions=predictions, references=labels)["precision"]
    f1 = metric_f1.compute(predictions=predictions, references=labels)["f1"]

    return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1, "loss": loss}


In [None]:
# Define model and training parameters
training_args = TrainingArguments(
    output_dir=MODEL_CHECKPOINT_PATH,
    overwrite_output_dir = True,
    report_to = 'wandb',
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    seed=SEED,
    # evaluation_strategy="epoch",
    run_name=WANDB_RUN,
    logging_dir=MODEL_LOGGING_PATH,
    save_strategy="steps",
    save_steps=1500
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

## Pre-trained ELECTRA

In [None]:
# Predict before fine-tuning
trainer.predict(full_test_dataset).metrics

{'test_runtime': 645.0963,
 'test_samples_per_second': 27.256,
 'test_steps_per_second': 4.544}

## Fine-tuned ELECTRA

In [None]:
# %%wandb # To observe training progress live

# Fine-tune model
trainer.train()

# Resume fine-tuning from checkpoint
# trainer.train(MODEL_CHECKPOINT_PATH + "/" + "checkpoint-18000")

ValueError: ignored

In [None]:
# Terminate wandb run
wandb.finish()

In [None]:
# Save fine-tuned model
trainer.save_model(MODEL_SAVE_PATH)

In [None]:
# Evaluate fine-tuned model
trainer.evaluate()

In [None]:
# Predict after fine-tuning
trainer.predict(full_test_dataset).metrics

In [None]:
def get_training_history(wandb_run):
  """Extract key metrics from training and eval across epochs from wandb run data."""

  # Get training history from wandb
  api = wandb.Api()
  run = api.run(wandb_run)
  history = run.history()

  # Rename columns
  train_column_dict = {'train/epoch': 'epoch', 'train/loss': 'training_loss'}
  val_column_dict = {'train/epoch': 'epoch', 'eval/loss': 'validation_loss', 'eval/accuracy': 'accuracy',
                'eval/precision': 'precision', 'eval/recall': 'recall', 'eval/f1': 'f1'}

  # Train data
  train_history = history[list(train_column_dict.keys())]
  train_history.columns = [train_column_dict.get(x, x) for x in train_history.columns]
  train_history = train_history.dropna()

  # Val data
  val_history = history[list(val_column_dict.keys())]
  val_history.columns = [val_column_dict.get(x, x) for x in val_history.columns]
  val_history = val_history.dropna()

  return pd.merge(train_history, val_history, how="right", on="epoch")


# Get dataframe for training history
WANDB_RUN_ID = "1bcfrimx" # Replace with your wandb run details, found in the training cell

training_history = get_training_history(WANDB_ENTITY + "/" + WANDB_PROJECT + "/" + WANDB_RUN_ID)
training_history

In [None]:
# Load fine-tuned model
saved_model = AutoModelForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)

# Load trainer after fine-tune
saved_trainer = Trainer(
    model=saved_model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Predict after fine-tuning
saved_trainer.predict(full_test_dataset).metrics

In [None]:
# Load fine-tuned model
saved_model = AutoModelForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)

# Load trainer after fine-tune
saved_trainer = Trainer(
    model=saved_model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

## GPU Memory Utilities

In [None]:
# Delete variables and empty cache
del trainer
del model
torch.cuda.empty_cache()

In [None]:
# Python garbage collection
import gc
gc.collect()

In [None]:
# Check memory allocation
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())

In [None]:
# Check memory summary
print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
# Check GPU allocation and acprocesses
!nvidia-smi