In [None]:
!pip install pytorch-lightning sentence-transformers torchmetrics rich

In [2]:
# Attach Google Drive to the Colab VM; later cells read data and write
# checkpoints under /content/drive.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [3]:
import os

# Work from the project's `src` folder on the mounted Drive; later cells
# import `datautils` and `modelling`, presumably from this folder.
SRC_DIR = '/content/drive/MyDrive/NLP Group Project/synthetic-squad-main/src'
os.chdir(SRC_DIR)

# Trailing expression: show the resulting working directory as the cell output.
os.getcwd()

'/content/drive/MyDrive/NLP Group Project/synthetic-squad-main/src'

# DistilBERT Baseline

## Task 3

### Prepare the Data

In [4]:
import pandas as pd
from datautils import SynDataModule, SynBatcher

# Task number (used in checkpoint file names), training CSV, and the
# pretrained model/tokenizer identifier shared by the cells below.
TASK = 3
EDPATH = r'/content/drive/MyDrive/NLP Group Project/Texts/New Files/new_punc_data_tr.csv'
MPATH = "distilbert-base-cased"

df = pd.read_csv(EDPATH)

# Number the distinct values of the `alg` column in order of first appearance,
# then invert that mapping for id -> label lookups.
algorithms = df['alg'].unique()
label2id = dict(zip(algorithms, range(len(algorithms))))
id2label = {idx: label for label, idx in label2id.items()}

In [5]:
# Display the label -> id mapping built from the `alg` column.
label2id

{'human': 0,
 'fair': 1,
 'grover': 2,
 'gpt2': 3,
 'gpt3': 4,
 'instructgpt': 5,
 'gpt': 6,
 'ctrl': 7,
 'pplm': 8,
 'xlnet': 9,
 'xlm': 10}

In [6]:
# Build the batcher from the MPATH tokenizer and wrap the training CSV in a
# data module configured with task='aa' and the current label mapping.
batcher = SynBatcher(tnkzr_path=MPATH)
train_dm = SynDataModule(
    data_path=EDPATH,
    task='aa',
    label2id=label2id,
    batcher=batcher,
)
# Manual setup stays disabled:
# train_dm.setup('fit')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

### Prepare the model

In [7]:
from modelling import SequenceClassification
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    RichModelSummary
)

# Directory that receives the task 3 checkpoints.
SAVE_PATH = "/content/drive/MyDrive/NLP Group Project/synthetic-squad-main/models/task3/distilbert-base-cased"

# Seed *before* instantiating the model: the classification head is newly
# initialised (see the loader warning printed by this cell), so seeding only
# in the training cell would leave the starting weights different on every run.
seed_everything(42, workers=True)

model = SequenceClassification(MPATH, num_labels=len(label2id), id2label=id2label)
# print("Loaded fresh model")

# To resume from a saved checkpoint instead of a fresh model:
# model = SequenceClassification.load_from_checkpoint("/content/drive/MyDrive/NLP Group Project/synthetic-squad-main/models/task3/distilbert-base-cased/task-3-epoch=2-val_loss=0.20.ckpt")

# Checkpoints are selected on validation loss; the file name embeds the task
# number, epoch and val_loss. Only model weights are saved.
checkpoint_callback = ModelCheckpoint(
    dirpath=SAVE_PATH,
    filename=f'task-{TASK}-{{epoch}}-{{val_loss:.2f}}',
    monitor='val_loss',
    save_weights_only=True,
)
# Stop once val_loss has failed to improve by at least 1e-4 for 8 checks.
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4,
    patience=8,
    verbose=False,
    mode="min",
)

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bia

### Train the model

In [None]:
from pytorch_lightning import Trainer, seed_everything


# Fix the global seed (dataloader workers included) before training starts.
seed_everything(42, workers=True)

# max_epochs=-1 removes the epoch cap, so the EarlyStopping callback is what
# ends the run. Gradients are accumulated over 4 batches per optimizer step.
trainer_options = dict(
    accelerator='gpu',
    max_epochs=-1,
    deterministic=True,
    accumulate_grad_batches=4,
    log_every_n_steps=16,
    callbacks=[checkpoint_callback, early_stop_callback, RichModelSummary()],
)
trainer = Trainer(**trainer_options)

trainer.fit(model, train_dm)

INFO:lightning_fabric.utilities.seed:Global seed set to 42
INFO:pytorch_lightning.utilities.rank_zero:Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

## Task 2

### Prepare the Data

In [8]:
TASK = 2

# Binary human-vs-machine label mapping for task 2.
label2id = {
    'human': 0,
    'machine': 1,
}
# Rebuild the inverse mapping as well; otherwise `id2label` would still hold
# the 11-class mapping from task 3 when the task 2 model is created.
id2label = {idx: label for label, idx in label2id.items()}

In [9]:
# Fresh batcher and data module for task 2: same CSV and tokenizer as before,
# now configured with task='hvm' and the two-class label mapping.
batcher = SynBatcher(tnkzr_path=MPATH)
train_dm = SynDataModule(
    data_path=EDPATH,
    task='hvm',
    label2id=label2id,
    batcher=batcher,
)
# Manual setup stays disabled:
# train_dm.setup('fit')

### Prepare the model

In [None]:
from pytorch_lightning import seed_everything

# Directory that receives the task 2 checkpoints. The original string began
# with a stray `'`, which turned it into a relative path under a directory
# literally named `'` inside the current working directory.
# NOTE(review): this path sits under `src/models`, while task 3 saves to
# `synthetic-squad-main/models` -- confirm which location is intended.
SAVE_PATH = "/content/drive/MyDrive/NLP Group Project/synthetic-squad-main/src/models/task2/distilbert-base-cased"

# Seed *before* instantiating the model so the newly initialised
# classification head starts from the same weights on every run.
seed_everything(42, workers=True)

model = SequenceClassification(MPATH, num_labels=len(label2id), id2label=id2label)
# print("Loaded fresh model")
# To resume from a saved checkpoint instead of a fresh model:
# model = SequenceClassification.load_from_checkpoint("/content/drive/MyDrive/NLP Group Project/synthetic-squad-main/src/models/task2/distilbert-base-cased/task-2-epoch=1-val_loss=0.03.ckpt")

# Checkpoints are selected on validation loss; the file name embeds the task
# number, epoch and val_loss. Only model weights are saved.
checkpoint_callback = ModelCheckpoint(
    dirpath=SAVE_PATH,
    filename=f'task-{TASK}-{{epoch}}-{{val_loss:.2f}}',
    monitor='val_loss',
    save_weights_only=True,
)
# Stop once val_loss has failed to improve by at least 1e-4 for 8 checks.
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4,
    patience=8,
    verbose=False,
    mode="min",
)

### Train the model

In [None]:
from pytorch_lightning import Trainer, seed_everything

# Fix the global seed (dataloader workers included) before training starts.
seed_everything(42, workers=True)

# max_epochs=-1 removes the epoch cap, so the EarlyStopping callback is what
# ends the run. Gradients are accumulated over 4 batches per optimizer step.
trainer_options = dict(
    accelerator='gpu',
    max_epochs=-1,
    deterministic=True,
    accumulate_grad_batches=4,
    log_every_n_steps=16,
    callbacks=[checkpoint_callback, early_stop_callback, RichModelSummary()],
)
trainer = Trainer(**trainer_options)

trainer.fit(model, train_dm)

INFO:lightning_fabric.utilities.seed:Global seed set to 42
INFO:pytorch_lightning.utilities.rank_zero:Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]