In [1]:
# !pip install pytorch-pretrained-bert pytorch-nlp


In [4]:
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Col 1 : code representing the source of the sentence

Col 2 : acceptability judgement label (0 = unacceptable, 1 = acceptable)

Col 3 : acceptability judgement as notated by author

Col 4 : the sentence

In [5]:
# Load the tab-separated training file. header=None means the first line is
# treated as data, so the four column names are supplied explicitly.
df = pd.read_csv(
    "in_domain_train.tsv",
    sep="\t",
    header=None,
    names=["sentence_source", "label", "label_notes", "sentence"],
)

In [6]:
# Show the dimensions of the loaded frame as (rows, columns).
df.shape

(8551, 4)

In [7]:
# Preview five randomly drawn rows. No random_state is passed, so this cell
# does not pin which rows are shown.
df.sample(5)


Unnamed: 0,sentence_source,label,label_notes,sentence
1849,r-67,1,,"We'll do it together, you and me."
7763,ad03,0,*,Who did you believe that to kiss seemed wrong?
1175,r-67,1,,Mary has never kissed a man who is taller than...
6183,c_13,1,,What do you think Matt kissed?
1860,r-67,0,*,I saw Mary and downtown yesterday your friend ...


In [8]:
# Pull the raw sentence strings and their labels out of the frame.
raw_sentences = df["sentence"].values
labels = df["label"].values

# Wrap every sentence in the [CLS] ... [SEP] markers that BERT needs.
sentences = [" ".join(("[CLS]", text, "[SEP]")) for text in raw_sentences]



##### BERT tokenizer, used to convert text into tokens from BERT's vocabulary.

In [9]:
# Load the tokenizer for bert-base-uncased with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize every marked-up sentence.
tokenized_texts = list(map(tokenizer.tokenize, sentences))

# Show the tokens of the first sentence as a quick sanity check.
print("Tokenize the first sentence", tokenized_texts[0], sep="\n")

100%|██████████| 231508/231508 [00:02<00:00, 109198.49B/s]


Tokenize the first sentence
['[CLS]', 'our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']


In [10]:
# Inspect the first sentence after the [CLS]/[SEP] markers were added.
sentences[0]

"[CLS] Our friends won't buy this analysis, let alone the next one we propose. [SEP]"

In [11]:
# Maximum sequence length used when padding/truncating the token id sequences.
# The original note records the longest training sequence as 47 tokens, so 128
# leaves headroom.
# NOTE(review): the "original:512" in the earlier note presumably refers to
# BERT's upper limit on sequence length — confirm.
MAX_LEN = 128

In [None]:
# Map each token list to its vocabulary ids via the tokenizer.
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))


In [None]:
# Pad (or truncate) every id sequence at the end so all rows have length MAX_LEN.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Attention masks: 1.0 where the token id is greater than 0, 0.0 elsewhere
# (i.e. 1s for each token followed by 0s for the padding).
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

# Split ids, labels and masks into train and validation sets in ONE call so the
# three stay row-aligned by construction, instead of relying on two separate
# calls that happen to share the same random_state.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids, labels, attention_masks,
    random_state=2018, test_size=0.1)

In [None]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# TensorDataset needs torch tensors, but the train/validation pieces produced by
# train_test_split are NumPy arrays and plain Python lists, so each one is
# converted with torch.as_tensor before the datasets are built.
train_data = TensorDataset(
    torch.as_tensor(train_inputs),
    torch.as_tensor(train_masks),
    torch.as_tensor(train_labels),
)
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(
    torch.as_tensor(validation_inputs),
    torch.as_tensor(validation_masks),
    torch.as_tensor(validation_labels),
)
# Validation batches are read in their stored order.
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [None]:
# Pretrained bert-base-uncased with a single linear classification layer on top,
# configured here for two output labels.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

In [None]:
# Split the model parameters into two groups: one that receives weight decay
# and one (biases and LayerNorm parameters) that does not.
param_optimizer = list(model.named_parameters())

# Name fragments marking parameters that must not be decayed. 'gamma'/'beta'
# are the older LayerNorm parameter names; 'LayerNorm.weight' covers releases
# that name them weight/bias ('LayerNorm.bias' is already matched by 'bias').
# NOTE(review): confirm the naming against the installed pytorch_pretrained_bert.
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']

# Both 'weight_decay' and the older 'weight_decay_rate' key are set, because
# the key BertAdam reads differs between library releases; a key the optimizer
# does not read is simply carried along in the param group.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01, 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0, 'weight_decay': 0.0},
]



In [None]:
# Optimizer used by the training loop: BertAdam over the grouped parameters,
# carrying the hyperparameters (learning rate 2e-5, warmup setting 0.1).
# NOTE(review): no t_total is passed — confirm the warmup schedule takes effect without it.
optimizer = BertAdam(
    optimizer_grouped_parameters,
    lr=2e-5,
    warmup=0.1,
)
