# ELECTRA
This notebook aims to use transfer learning on an ELECTRA model to perform text classification and detect suicidal text.

In [None]:
# Mount Google Drive at /content/drive so the project folder used below is reachable.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install -qqq transformers datasets wandb

[K     |████████████████████████████████| 5.5 MB 4.5 MB/s 
[K     |████████████████████████████████| 451 kB 88.9 MB/s 
[K     |████████████████████████████████| 1.9 MB 80.5 MB/s 
[K     |████████████████████████████████| 182 kB 92.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 76.4 MB/s 
[K     |████████████████████████████████| 115 kB 92.8 MB/s 
[K     |████████████████████████████████| 212 kB 91.3 MB/s 
[K     |████████████████████████████████| 127 kB 90.5 MB/s 
[K     |████████████████████████████████| 182 kB 88.4 MB/s 
[K     |████████████████████████████████| 168 kB 81.9 MB/s 
[K     |████████████████████████████████| 62 kB 1.7 MB/s 
[K     |████████████████████████████████| 168 kB 92.5 MB/s 
[K     |████████████████████████████████| 166 kB 96.5 MB/s 
[K     |████████████████████████████████| 166 kB 96.8 MB/s 
[K     |████████████████████████████████| 162 kB 96.3 MB/s 
[K     |████████████████████████████████| 162 kB 95.0 MB/s 
[K     |██████████████████

In [None]:
# Import packages
import gc
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
import wandb
from datasets import Dataset, DatasetDict, load_metric
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer

In [None]:
# Specify the compute device: use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Switch the working directory to the project folder on the mounted drive
# (change the path to your own directory). All relative paths used later
# (Data/..., Models/...) are resolved against it.
try:
    os.chdir("/content/drive/MyDrive/Suicide Project")
except OSError:
    # Report the failure without stopping the notebook
    print("Error: Can't change the Current Working Directory")
else:
    print("Directory changed")

Directory changed


## Define constants

In [None]:
# --- Training hyperparameters ---
EPOCHS = 1
BATCH_SIZE = 6          # used for both the train and the eval batch size
LEARNING_RATE = 1e-5
SEED = 4222             # shared by the data splits, the sample shuffles and the Trainer

# --- Output locations (relative to the working directory) ---
MODEL_SAVE_PATH = "Models/electra"
MODEL_CHECKPOINT_PATH = "Models/electra_checkpoint"
MODEL_LOGGING_PATH = f"{MODEL_CHECKPOINT_PATH}/logs"   # logs live inside the checkpoint folder

# --- Weights & Biases run identification ---
WANDB_ENTITY = "manit"
WANDB_PROJECT = "Suicide Prediction"
WANDB_RUN = "electra"

## Load dataset

In [None]:
# Load the cleaned dataset, keep only the raw text and its label, and encode the
# label as an integer (1 = suicide, 0 = non-suicide).
LABEL_TO_ID = {'suicide': 1, 'non-suicide': 0}

df = (
    pd.read_csv('Data/suicide_detection_final_cleaned.csv', header=0,
                names=['text', 'label', 'cleaned_text'])
    .drop(columns=['cleaned_text'])
)

# Fail fast on unexpected label values: Series.map would otherwise turn them
# into NaN silently and change the label column to float.
unknown_labels = set(df['label'].unique()) - set(LABEL_TO_ID)
if unknown_labels:
    raise ValueError(f"Unexpected label values in dataset: {sorted(map(str, unknown_labels))}")

df['label'] = df['label'].map(LABEL_TO_ID)
df.head()

Unnamed: 0,text,label
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don't get affected by compliments...,0
2,Finally 2020 is almost over... So I can never ...,0
3,i need helpjust help me im crying so hard,1
4,It ends tonight.I can’t do it anymore. \nI quit.,1


In [None]:
# Stratified 80/10/10 split: first hold out 20% of the rows, then cut that
# hold-out in half to obtain the validation and the test set. Both cuts keep
# the label proportions and use the same seed.
train, temp = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=SEED)

val, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=SEED)

## Load ELECTRA Model

In [None]:
# Load the ELECTRA tokenizer published with the "google/electra-base-discriminator"
# checkpoint (the same checkpoint name is used for the model further down)
tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def dataset_conversion(train, test, val):
  """Convert the train/test/val pandas DataFrames into a `DatasetDict`.

  The input frames are left untouched: each one is copied with a fresh
  0..n-1 index before conversion instead of being re-indexed in place.

  Args:
    train: DataFrame holding the training rows.
    test: DataFrame holding the test rows.
    val: DataFrame holding the validation rows.

  Returns:
    DatasetDict with the keys "train", "test" and "val".
  """

  def to_dataset(frame):
    # reset_index(drop=True) returns a new frame, so the index left over from
    # the split is discarded without modifying the caller's DataFrame.
    return Dataset.from_pandas(frame.reset_index(drop=True))

  return DatasetDict({"train": to_dataset(train),
                      "test": to_dataset(test),
                      "val": to_dataset(val)})

# Bundle the three splits into one DatasetDict (argument order: train, test, val)
raw_datasets = dataset_conversion(train, test, val)

In [None]:
def tokenize_function(dataset):
    """Run the module-level ELECTRA tokenizer over the "text" entries of `dataset`.

    The tokenizer is called with padding="max_length" and truncation=True and
    its output is returned unchanged.
    """
    texts = dataset["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every split; batched=True hands tokenize_function batches of rows instead of single rows
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/141 [00:00<?, ?ba/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

In [None]:
# Small shuffled samples of each tokenized split (handy for quick dry runs)
SAMPLE_SIZE = 20
small_train_dataset, small_test_dataset, small_val_dataset = (
    tokenized_datasets[split].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
    for split in ("train", "test", "val")
)

# Full tokenized splits
full_train_dataset, full_test_dataset, full_val_dataset = (
    tokenized_datasets[split] for split in ("train", "test", "val")
)

In [None]:
# Import ELECTRA-base pretrained model with a sequence-classification head for
# the two classes (num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=2)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.d

In [None]:
# Login wandb (the recorded output shows it prompting for an API key)
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Initialise the wandb run under the entity / project / run name defined in the constants cell
wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, name=WANDB_RUN)

[34m[1mwandb[0m: Currently logged in as: [33mmanit[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Define custom metrics for computation
def compute_metrics(eval_pred):
    """Compute accuracy, recall, precision and F1 for the positive class (label 1).

    The metrics are computed directly with numpy, so no metric script has to be
    loaded (or downloaded) each time the Trainer evaluates.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row is
            the argmax of its logits over the last axis.

    Returns:
        dict with float values under the keys "accuracy", "recall", "precision"
        and "f1". Precision, recall and F1 are 0.0 when their denominator is zero.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Confusion-matrix counts with label 1 as the positive class
    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))

    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0
    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1}

In [None]:
# Training configuration: where checkpoints and logs go, how the run is
# reported, and the optimisation settings taken from the constants cell.
training_args = TrainingArguments(
    # Output / bookkeeping
    output_dir=MODEL_CHECKPOINT_PATH,
    overwrite_output_dir=True,
    logging_dir=MODEL_LOGGING_PATH,
    report_to='wandb',
    run_name=WANDB_RUN,
    # Optimisation
    seed=SEED,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # No evaluation strategy is passed here; validation metrics come from the
    # explicit trainer.evaluate() call later in the notebook.
    # A checkpoint is written every 1500 optimisation steps.
    save_strategy="steps",
    save_steps=1500,
)

# Trainer that fine-tunes on the full training split and evaluates on the
# validation split with the custom metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=full_train_dataset,
    eval_dataset=full_val_dataset,
)

## Pre-trained ELECTRA

In [None]:
# Predict before fine-tuning: baseline metrics on the full test split, obtained
# with the model as loaded (no training has been run yet)
trainer.predict(full_test_dataset).metrics

The following columns in the test set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 17583
  Batch size = 6
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  This is separate from the ipykernel package so we can avoid doing imports until


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'test_loss': 0.6895462870597839,
 'test_accuracy': 0.5414320650628448,
 'test_recall': 0.22079251352536922,
 'test_precision': 0.35579641847313853,
 'test_f1': 0.27248939817738876,
 'test_runtime': 153.9629,
 'test_samples_per_second': 114.203,
 'test_steps_per_second': 19.037}

## Fine-tuned ELECTRA

In [None]:
# Optional: `%%wandb` can be enabled to observe training progress live

# Fine-tune model on the full training split with the settings from `training_args`
trainer.train()



The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 140660
  Num Epochs = 1
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 23444
  Number of trainable parameters = 109483778
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
500,0.3407
1000,0.1999
1500,0.1851
2000,0.1778
2500,0.1864
3000,0.1588
3500,0.1374
4000,0.1427
4500,0.1455
5000,0.1396


Saving model checkpoint to Models/electra_checkpoint/checkpoint-1500
Configuration saved in Models/electra_checkpoint/checkpoint-1500/config.json
Model weights saved in Models/electra_checkpoint/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in Models/electra_checkpoint/checkpoint-1500/tokenizer_config.json
Special tokens file saved in Models/electra_checkpoint/checkpoint-1500/special_tokens_map.json
Saving model checkpoint to Models/electra_checkpoint/checkpoint-3000
Configuration saved in Models/electra_checkpoint/checkpoint-3000/config.json
Model weights saved in Models/electra_checkpoint/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in Models/electra_checkpoint/checkpoint-3000/tokenizer_config.json
Special tokens file saved in Models/electra_checkpoint/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to Models/electra_checkpoint/checkpoint-4500
Configuration saved in Models/electra_checkpoint/checkpoint-4500/config.json
Model weights sa

TrainOutput(global_step=23444, training_loss=0.12127708115485764, metrics={'train_runtime': 4009.3748, 'train_samples_per_second': 35.083, 'train_steps_per_second': 5.847, 'total_flos': 3.70092010469376e+16, 'train_loss': 0.12127708115485764, 'epoch': 1.0})

In [None]:
# Resume fine-tuning from the checkpoint saved at step 18000.
# NOTE(review): only needed to recover an interrupted run; in a top-to-bottom
# execution the previous cell has already trained to completion.
trainer.train(resume_from_checkpoint=f"{MODEL_CHECKPOINT_PATH}/checkpoint-18000")

Loading model from Models/electra_checkpoint/checkpoint-18000.
The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 140660
  Num Epochs = 1
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 23444
  Number of trainable parameters = 109483778
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 18000
  Will skip the first 0 epochs then the first 18000 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen

  0%|          | 0/18000 [00:00<?, ?it/s]

Step,Training Loss
18500,0.0806
19000,0.0802
19500,0.0861
20000,0.0628
20500,0.0586
21000,0.073
21500,0.0579
22000,0.0768
22500,0.0757
23000,0.0589


Saving model checkpoint to Models/electra_checkpoint/checkpoint-19500
Configuration saved in Models/electra_checkpoint/checkpoint-19500/config.json
Model weights saved in Models/electra_checkpoint/checkpoint-19500/pytorch_model.bin
tokenizer config file saved in Models/electra_checkpoint/checkpoint-19500/tokenizer_config.json
Special tokens file saved in Models/electra_checkpoint/checkpoint-19500/special_tokens_map.json
Saving model checkpoint to Models/electra_checkpoint/checkpoint-21000
Configuration saved in Models/electra_checkpoint/checkpoint-21000/config.json
Model weights saved in Models/electra_checkpoint/checkpoint-21000/pytorch_model.bin
tokenizer config file saved in Models/electra_checkpoint/checkpoint-21000/tokenizer_config.json
Special tokens file saved in Models/electra_checkpoint/checkpoint-21000/special_tokens_map.json
Saving model checkpoint to Models/electra_checkpoint/checkpoint-22500
Configuration saved in Models/electra_checkpoint/checkpoint-22500/config.json
Mode

TrainOutput(global_step=23444, training_loss=0.016410753113115795, metrics={'train_runtime': 1013.665, 'train_samples_per_second': 138.764, 'train_steps_per_second': 23.128, 'total_flos': 3.70092010469376e+16, 'train_loss': 0.016410753113115795, 'epoch': 1.0})

In [None]:
# Terminate wandb run
# NOTE(review): trainer.evaluate() is called in a later cell, yet the summary
# recorded under this cell already lists eval metrics - confirm the intended
# cell order (finishing the run after evaluation looks like the intent).
wandb.finish()

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███▆▇▇▇▇███

0,1
eval/accuracy,0.97992
eval/f1,0.97414
eval/loss,0.10186
eval/precision,0.97621
eval/recall,0.97207
eval/runtime,151.2267
eval/samples_per_second,116.269
eval/steps_per_second,19.382
train/epoch,1.0
train/global_step,23444.0


In [None]:
# Save fine-tuned model to MODEL_SAVE_PATH (the log below lists the config,
# weights and tokenizer files that are written)
trainer.save_model(MODEL_SAVE_PATH)

Saving model checkpoint to Models/electra
Configuration saved in Models/electra/config.json
Model weights saved in Models/electra/pytorch_model.bin
tokenizer config file saved in Models/electra/tokenizer_config.json
Special tokens file saved in Models/electra/special_tokens_map.json


In [None]:
# Evaluate fine-tuned model on the trainer's eval_dataset (the full validation split)
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17583
  Batch size = 6


{'eval_loss': 0.10186328738927841,
 'eval_accuracy': 0.9799237900244554,
 'eval_recall': 0.9720719403421553,
 'eval_precision': 0.9762114537444934,
 'eval_f1': 0.974137299435856,
 'eval_runtime': 151.2267,
 'eval_samples_per_second': 116.269,
 'eval_steps_per_second': 19.382,
 'epoch': 1.0}

In [None]:
# Predict after fine-tuning: metrics on the full test split, comparable with the
# pre-fine-tuning baseline computed earlier
trainer.predict(full_test_dataset).metrics

The following columns in the test set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 17583
  Batch size = 6


{'test_loss': 0.10229253023862839,
 'test_accuracy': 0.9799237900244554,
 'test_recall': 0.9751425647024419,
 'test_precision': 0.973292469352014,
 'test_f1': 0.9742166386677379,
 'test_runtime': 154.3579,
 'test_samples_per_second': 113.911,
 'test_steps_per_second': 18.988}

In [None]:
def _align_training_history(history):
  """Pair every evaluation row of a wandb history frame with the latest training loss."""

  # wandb column name -> output column name
  train_column_dict = {'train/epoch': 'epoch', 'train/loss': 'training_loss'}
  val_column_dict = {'train/epoch': 'epoch', 'eval/loss': 'validation_loss', 'eval/accuracy': 'accuracy',
                     'eval/precision': 'precision', 'eval/recall': 'recall', 'eval/f1': 'f1'}

  def select(column_dict):
    # Keep only rows where every selected metric was logged, sorted by epoch
    # (merge_asof needs sorted keys of identical dtype on both sides).
    return (history[list(column_dict)]
            .rename(columns=column_dict)
            .dropna()
            .astype({'epoch': 'float64'})
            .sort_values('epoch'))

  train_history = select(train_column_dict)
  val_history = select(val_column_dict)

  # Training loss and evaluation metrics are logged at different steps, so
  # their epoch values rarely coincide exactly. An exact merge on the float
  # epoch therefore leaves training_loss empty; instead take the most recent
  # training loss logged at or before each evaluation.
  merged = pd.merge_asof(val_history, train_history, on='epoch', direction='backward')

  column_order = ['epoch', 'training_loss', 'validation_loss', 'accuracy', 'precision', 'recall', 'f1']
  return merged[column_order].reset_index(drop=True)


def get_training_history(wandb_run):
  """Extract key metrics from training and eval across epochs from wandb run data.

  Args:
    wandb_run: Run path handed to `wandb.Api().run`, built by the caller as
      "<entity>/<project>/<run id>".

  Returns:
    DataFrame with one row per logged evaluation and the columns epoch,
    training_loss, validation_loss, accuracy, precision, recall and f1.
    training_loss is the latest training loss logged at or before the
    evaluation's epoch (NaN if none was logged before it).
  """

  # Get training history from wandb
  api = wandb.Api()
  run = api.run(wandb_run)
  history = run.history()

  return _align_training_history(history)


# Fetch the logged history of one wandb run as a dataframe
WANDB_RUN_ID = "1n5jgvnc"  # Replace with your wandb run details, found in the training cell

# The run is addressed as "<entity>/<project>/<run id>"
training_history = get_training_history("/".join([WANDB_ENTITY, WANDB_PROJECT, WANDB_RUN_ID]))
training_history

Unnamed: 0,epoch,training_loss,validation_loss,accuracy,precision,recall,f1
0,1.0,,0.101863,0.979924,0.976211,0.972072,0.974137


In [None]:
# Reload the fine-tuned model from the folder it was saved to
saved_model = AutoModelForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)

# Wrap the reloaded model in a new Trainer that reuses the earlier training
# arguments; here the test split is also passed as eval_dataset.
saved_trainer = Trainer(
    args=training_args,
    model=saved_model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=full_train_dataset,
    eval_dataset=full_test_dataset,
)

# Test-set metrics of the reloaded model (expected to match the in-memory model)
saved_trainer.predict(full_test_dataset).metrics

loading configuration file Models/electra/config.json
Model config ElectraConfig {
  "_name_or_path": "Models/electra",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file Models/electra/pytorch_model.bin
All model checkpoint weigh

{'test_loss': 0.10229253023862839,
 'test_accuracy': 0.9799237900244554,
 'test_recall': 0.9751425647024419,
 'test_precision': 0.973292469352014,
 'test_f1': 0.9742166386677379,
 'test_runtime': 151.0532,
 'test_samples_per_second': 116.403,
 'test_steps_per_second': 19.404}

## GPU Memory Utilities

In [None]:
# Drop the notebook's references to the fine-tuning trainer and model.
# pop() instead of `del` keeps the cell re-runnable: executing it a second time
# no longer raises NameError.
for _name in ("trainer", "model"):
    globals().pop(_name, None)

# Collect first so that objects kept alive only by reference cycles are
# destroyed, then release the cached GPU blocks that are no longer in use.
gc.collect()
torch.cuda.empty_cache()
# NOTE(review): saved_trainer / saved_model, if defined, presumably still hold
# a copy of the model on the GPU - delete them as well if that memory is needed.

In [None]:
# Python garbage collection: run a full collection pass. The number shown below
# is gc.collect()'s return value, the count of unreachable objects it found.
import gc
gc.collect()

38234

In [None]:
# Report GPU memory in bytes, one value per line:
# first the memory currently allocated, then the memory reserved.
print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved(), sep="\n")

439076352
2411724800


In [None]:
# Check memory summary: print the full (non-abbreviated) CUDA memory report;
# device=None leaves the device choice to torch
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  428785 KB |    5529 MB |  558027 GB |  558026 GB |
|       from large pool |  428288 KB |    5527 MB |  557947 GB |  557947 GB |
|       from small pool |     497 KB |      20 MB |      79 GB |      79 GB |
|---------------------------------------------------------------------------|
| Active memory         |  428785 KB |    5529 MB |  558027 GB |  558026 GB |
|       from large pool |  428288 KB |    5527 MB |  557947 GB |  557947 GB |
|       from small pool |     497 KB |      20 MB |      79 GB |      79 GB |
|---------------------------------------------------------------

In [None]:
# Check GPU allocation and active processes
!nvidia-smi

Sat Nov 26 07:41:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    43W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces