# Google Drive

In [1]:
####################################
#
#  ADD THIS TO EVERY COLAB FILE!
#
####################################

# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Import the shared project settings module stored on the mounted drive.
# NOTE(review): this import resolves a `drive` *package* on the import path
# (presumably the /content/drive directory) and is unrelated to the
# `google.colab.drive` module bound above — confirm /content is importable.
import drive.Shareddrives.GPTJ.project.settings as settings

# Expose the project and data locations defined in the settings module.
PATH_PROJECT = settings.PATH_PROJECT
PATH_DATA = settings.PATH_DATA

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the project's dependencies from requirements.txt, running pip from
# the project directory (PATH_PROJECT is the Python variable set above).
! cd $PATH_PROJECT && pip install -q -r requirements.txt

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [3]:
# Show which GPU this runtime was assigned.
!nvidia-smi

# GPUs observed so far (presumably per Colab runtime tier — confirm):
# Standard -> Tesla T4
# Premium -> Tesla P100-PCIE-16GB

Sat Oct 15 23:49:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
# Debugging aid: make CUDA kernel launches synchronous so device-side errors
# are reported at the call that caused them. Set before torch is imported.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import pandas as pd
import torch
# Fix the PyTorch RNG seed so random operations below are repeatable.
torch.manual_seed(42)

from torch.utils.data import Dataset


# Dataset

In [5]:

class SimpleDataset(Dataset):
    """In-memory dataset of tokenized texts paired with integer class labels.

    Every text is tokenized once, at construction time, with truncation and
    padding to ``max_length``. Each item is the tuple
    ``(input_ids, attention_mask, label)`` of tensors.
    """

    def __init__(self, contents, labels, tokenizer, max_length=1024):
        """Tokenize ``contents`` and store them alongside ``labels``.

        Args:
            contents: iterable of raw text strings.
            labels: iterable of labels, paired positionally with ``contents``
                (pairing stops at the shorter of the two).
            tokenizer: callable accepting ``(text, truncation=, max_length=,
                padding=)`` and returning a mapping with ``'input_ids'`` and
                ``'attention_mask'`` entries.
            max_length: length every encoded sequence is truncated/padded to.
        """
        self.input_ids, self.attn_masks, self.labels = [], [], []

        for text, target in zip(contents, labels):
            encoded = tokenizer(
                text,
                truncation=True,
                max_length=max_length,
                padding='max_length',
            )
            self.input_ids.append(torch.tensor(encoded['input_ids']))
            self.attn_masks.append(torch.tensor(encoded['attention_mask']))
            self.labels.append(torch.tensor(target))

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for position ``idx``."""
        columns = (self.input_ids, self.attn_masks, self.labels)
        return tuple(column[idx] for column in columns)


# Training

In [6]:
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel, GPT2ForSequenceClassification

model_name = 'gpt2'

# Size of the classification head. It must equal the number of classes the
# data-loading cell produces: the three source classes 1, 5 and 7 are remapped
# to ids 0, 1, 2. With the library default of 2 outputs, class id 2 would be
# out of range for the loss.
NUM_LABELS = 3

# `num_labels` is a model-config option, not a tokenizer option, so it is
# passed to the model below rather than here.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 ships without a padding token; reuse EOS so `padding='max_length'` works.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
).cuda()
# Tell the model which token id is padding (same id the tokenizer pads with).
model.config.pad_token_id = model.config.eos_token_id

# No new token was added to the vocabulary (pad reuses EOS), so the embedding
# matrix is left at its original size:
# model.resize_token_embeddings(len(tokenizer))

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Cache: copy the preprocessed data from the mounted Drive to the local
# /content/tmp/ directory, which the loading cell reads from.
! mkdir -p /content/tmp/
! cp -r /content/drive/Shareddrives/GPTJ/data/kaggle/proc-1/* /content/tmp/

In [None]:
# Re-seed so the random splits made in this cell are repeatable on re-run.
torch.manual_seed(42)

from torch.utils.data import random_split

# Source class ids; each one names a sub-directory of /content/tmp/.
# The loading loop remaps them to consecutive ids by list position.
labels = [1, 5, 7]
path_mask = '/content/tmp/{}/'
paths = [path_mask.format(i) for i in labels]

def load_dir(path):
    """Read every file directly inside ``path`` and return their contents.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, which would make the seeded random split downstream
    differ between runs; sorting keeps the example order deterministic.

    Args:
        path: directory containing one text file per example.

    Returns:
        List of file contents (strings), ordered by filename.

    Side effects:
        Prints the number of files read.
    """
    contents = []
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        with open(filepath, 'r') as f:
            contents.append(f.read())
    print(len(contents), 'files')
    return contents

# Accumulators for the combined (all-class) splits; None until the first class
# has been processed.
train_dataset = None
val_dataset = None
test_dataset = None

# Split each class separately so every split keeps the per-class proportions.
for i, label in enumerate(labels):
    print('load class', label, 'as', i)
    contents = load_dir(paths[i])
    # Every example of this class gets the remapped id `i`. A separate name is
    # used so the class list `labels` being enumerated is not rebound.
    targets = [i] * len(contents)
    dataset_i = SimpleDataset(contents, targets, tokenizer)
    print('len', len(dataset_i))

    # NOTE(review): train and val each receive 10% of the class and test
    # receives the remaining ~80% — confirm this small training share is
    # intentional and not a swapped 80/10/10 split.
    size_10 = int(0.1 * len(dataset_i))
    size_80 = len(dataset_i) - 2*size_10
    train_dataset_i, val_dataset_i, test_dataset_i = \
        random_split(dataset_i, [size_10, size_10, size_80])

    # Compare against None explicitly: an empty split is falsy, so a truthiness
    # test would discard already-accumulated val/test data whenever a class
    # has fewer than 10 files (empty train split).
    if train_dataset is None:
        train_dataset = train_dataset_i
        val_dataset = val_dataset_i
        test_dataset = test_dataset_i
    else:
        # Adding two datasets concatenates them.
        train_dataset += train_dataset_i
        val_dataset += val_dataset_i
        test_dataset += test_dataset_i

# Sanity check: show the last example of each combined split.
print(train_dataset[-1])
print(val_dataset[-1])
print(test_dataset[-1])

load class 1 as 0
1541 files


In [None]:
# (tensor([  562,  2454, 50115,  ...,  9940,    87,  3571]), tensor([1, 1, 1,  ..., 1, 1, 1]), tensor(1))
# (tensor([  562,  2454, 50115,  ...,  1409,  1225,    72]), tensor([1, 1, 1,  ..., 1, 1, 1]), tensor(1))
# (tensor([  562,  2454, 50115,  ...,    72,  3571, 21844]), tensor([1, 1, 1,  ..., 1, 1, 1]), tensor(1))

In [None]:
def collate_batch(examples):
    """Stack per-example (input_ids, attention_mask, label) tuples into a batch.

    The dataset yields plain tuples, so a custom collate function is needed to
    build the keyword-style batch dictionary passed to the model.
    """
    return {
        'input_ids': torch.stack([example[0] for example in examples]),
        'attention_mask': torch.stack([example[1] for example in examples]),
        'labels': torch.stack([example[2] for example in examples]),
    }


training_args = TrainingArguments(
    output_dir='/content/',
    num_train_epochs=2,
    # logging_steps=5000,
    # save_steps=5000,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=0,
    weight_decay=0.01,
    # logging_dir=os.path.join(PATH_DATA, model_name, 'logs')
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_batch,
)

# Start the training process.
trainer.train()

# Classification

In [None]:
import numpy as np
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favor of the separate `evaluate` package — confirm the pinned version still
# provides it.
from datasets import load_metric
metric = load_metric("accuracy")

# Run the trained model over the test split. Element [0] of the result is
# reduced with argmax over axis 1 to get one predicted class id per example
# (presumably it holds the per-class scores, one row per example — confirm).
test_predictions_weights = trainer.predict(test_dataset)
test_predictions = np.argmax(test_predictions_weights[0], axis=1)
# print(test_predictions)

# Each dataset item is (input_ids, attention_mask, label); keep only the label.
test_labels = [label for _, _, label in test_dataset]
# print(test_labels)

test_references = np.array(test_labels)

# Accuracy of the predicted class ids against the true labels.
metric.compute(predictions=test_predictions, references=test_references)
# Result recorded from an earlier run:
# {'accuracy': 0.91888}