In [None]:
import wget
import os

# Remote locations of the BETO "uncased_2M" vocabulary and model configuration.
url = 'https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/vocab.txt'
url1 = 'https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/config.json'

# Only fetch what is missing: python-wget does not overwrite an existing file,
# it saves the new copy under a numbered name, so re-running this cell would
# otherwise litter the working directory with duplicates.
# NOTE(review): later cells load the tokenizer/model from "pytorch/", while
# these files are saved to the current working directory -- confirm they are
# expected here (or move them into "pytorch/").
for remote_file in (url, url1):
    local_name = remote_file.rsplit('/', 1)[-1]
    if not os.path.exists(local_name):
        wget.download(remote_file)

In [None]:
import torch
from transformers import  BertTokenizer
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, SequentialSampler
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch.optim
import numpy as np
import pandas as pd
import time
import datetime
import random
from sklearn.metrics import confusion_matrix

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the labelled training data, keeping only the text and label columns.
df = pd.read_parquet('../../data/verifiable/labeled_data_train.parquet')[['text', 'label']]

text = df['text']
label = df['label']
# Binary target: "no verificable" becomes 0, every other label becomes 1.
# Kept as a single-column DataFrame (not a Series), as in the original cell.
label = pd.DataFrame([0 if value == "no verificable" else 1 for value in label])


# Stratified 85/15 train/validation split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    text,
    label,
    test_size=0.15,
    stratify=label,
    random_state=42,
)

# Report dataset lengths
print('Training set length : {}'.format(len(X_train)))
print('Validation set length : {}'.format(len(X_val)))

Training set length : 955
Validation set length : 169


In [None]:
# Load the tokenizer from the local "pytorch/" directory, lower-casing input text.
tokenizer = BertTokenizer.from_pretrained(
    "pytorch/",
    do_lower_case=True,
)

def preprocessing(dataset, max_length=254):
    """Tokenize every document in `dataset` into fixed-length model inputs.

    Uses the module-level `tokenizer`. Each document is truncated and padded
    to exactly `max_length` token ids (special tokens included).

    Args:
        dataset: Iterable of text documents.
        max_length: Length every encoded document is truncated/padded to.
            Defaults to 254, the value previously hard-coded here.

    Returns:
        A tuple `(input_ids, attention_mask)` of tensors with one row per
        document.
    """
    input_ids = []
    attention_mask = []
    for doc in dataset:
        encoded_doc = tokenizer.encode_plus(
            doc,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
        )
        input_ids.append(encoded_doc['input_ids'])
        attention_mask.append(encoded_doc['attention_mask'])
    return (torch.tensor(input_ids),
            torch.tensor(attention_mask))

# Tokenize the training and validation texts.
X_train_inputs, X_train_masks = preprocessing(X_train)
X_val_inputs, X_val_masks = preprocessing(X_val)

# Report max n° tokens in a sentence: the largest per-row sum of the
# training attention masks.
max_len = max(mask_row.sum() for mask_row in X_train_masks)
print('Max n°tokens in a sentence: {0}'.format(max_len))



Max n°tokens in a sentence: 254


In [None]:
# Data loaders
# Mini-batch size used by the loaders built below.
batch_size = 32

# Convert the label frames to tensors so they can sit next to the inputs
# inside a TensorDataset.
y_train_labels = torch.tensor(y_train.to_numpy())
y_val_labels = torch.tensor(y_val.to_numpy())


def dataloader(x_inputs, x_masks, y_labels, shuffle=False):
    """Wrap inputs, masks and labels in a batched DataLoader.

    Uses the module-level `batch_size`.

    Args:
        x_inputs: Tensor of token ids, one row per example.
        x_masks: Tensor of attention masks aligned with `x_inputs`.
        y_labels: Tensor of labels aligned with `x_inputs`.
        shuffle: If True, examples are drawn in a new random order each time
            the loader is iterated; if False (default, the previous
            behaviour) they are served sequentially.

    Returns:
        A DataLoader yielding `(inputs, masks, labels)` batches.
    """
    data = TensorDataset(x_inputs, x_masks, y_labels)
    # With shuffle=False DataLoader falls back to sequential sampling, which
    # matches the explicit SequentialSampler used before.
    return DataLoader(data, shuffle=shuffle,
                      batch_size=batch_size,
                      num_workers=0)

# Shuffle only the training data so batches differ between epochs; keep the
# validation order fixed so predictions stay aligned with y_val.
train_dataloader = dataloader(X_train_inputs, X_train_masks,
                              y_train_labels, shuffle=True)
val_dataloader = dataloader(X_val_inputs, X_val_masks,
                            y_val_labels)

In [None]:
from transformers import BertTokenizer
# Same call as the earlier tokenizer cell: rebinds `tokenizer` to a freshly
# loaded tokenizer from the local "pytorch/" directory.
tokenizer = BertTokenizer.from_pretrained(
    "pytorch/",
    do_lower_case=True,
)

In [None]:
from transformers import BertModel
import torch

# Seed every random number generator used in this notebook.
def set_seed(value):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `value`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(value)


set_seed(42)

# Build a two-class sequence classifier from the checkpoint in "pytorch/";
# attention maps and hidden states are not returned by the model.
model = BertForSequenceClassification.from_pretrained(
    "pytorch/",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)


Some weights of the model checkpoint at pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at pytorch/

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Fine-tuning hyper-parameters.
GRADIENT_ACCUMULATION_STEPS = 1
NUM_TRAIN_EPOCHS = 7
LEARNING_RATE = 5e-5
WARMUP_PROPORTION = 0.1
MAX_GRAD_NORM = 5  # not used in this cell

# Total number of optimizer updates over the whole run.
# len(train_dataloader) is the number of batches per epoch, including the last
# partial one. The previous formula referenced an undefined `BATCH_SIZE`
# (NameError) and, by dividing the sample count, under-counted the steps.
num_train_steps = len(train_dataloader) // GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS
num_warmup = int(WARMUP_PROPORTION * num_train_steps)

# Apply weight decay to every parameter except biases and LayerNorm
# parameters, selected by substring match on the parameter name.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
# Learning rate ramps up linearly for `num_warmup` steps, then decays linearly
# to zero at `num_train_steps`.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup, num_training_steps=num_train_steps)

In [None]:

# `tqdm` is not imported by any earlier cell, so import it here to keep this
# cell runnable on a fresh kernel.
from tqdm import tqdm

# Quick check that the progress bar renders.
for i in tqdm(range(0, 3)):
    print(i)






100%|██████████| 3/3 [00:00<?, ?it/s]

0
1
2



