In [1]:
import sys
sys.path.append('structural-probes')
from pathlib import Path

In [2]:
from run_experiment import setup_new_experiment_dir, execute_experiment
import yaml
import torch
import pandas as pd

In [3]:
CONFIG_FILE = 'example/config/bert_ptb3.yaml'
EXPERIMENT_NAME = ''
SEED = 123

class Object(object):
    pass

cli_args = Object()
cli_args.experiment_config = CONFIG_FILE
cli_args.results_dir = EXPERIMENT_NAME
cli_args.train_probe = -1
cli_args.report_results = 1
cli_args.seed = SEED

yaml_args = yaml.load(open(cli_args.experiment_config), Loader=yaml.FullLoader)
setup_new_experiment_dir(cli_args, yaml_args, cli_args.results_dir)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
yaml_args['device'] = device

Constructing new results directory at example/results/BERT-disk-parse-distance-2020-4-16-20-45-35-716891/


In [4]:
# execute_experiment(yaml_args, train_probe=cli_args.train_probe, report_results=cli_args.report_results)

# List all of the tasks we want to evaluate on

In [5]:
data_folder = Path("../../..")
cola = data_folder / 'data'/ 'CoLA' / 'original' / 'raw'

In [6]:
df = pd.read_csv(cola / 'in_domain_train.tsv', delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))# Display 10 random rows from the data.
print('0 - unacceptable & 1 - acceptable')
df.sample(10)

Number of training sentences: 8,551

0 - unacceptable & 1 - acceptable


Unnamed: 0,sentence_source,label,label_notes,sentence
2213,l-93,1,,Brenda and Molly chatted.
6676,m_02,1,,Frank bought the piano for Jane.
7231,sks13,1,,Bill's mother's friends are waiting at the res...
1818,r-67,0,*,John tried for Bill to play checkers.
2577,l-93,1,,Tamara poured water over the flowers.
1221,r-67,0,*,The money which I am discussing the claim that...
232,cj99,0,*,"When Bill smokes, all the more Susan hates him."
6211,c_13,1,,It is likely that Tami will leave New York.
5914,c_13,1,,The stodgy professor left with his teaching as...
4666,ks08,1,,She was sent to Seoul.


In [7]:
# Get the lists of sentences and their labels.
sentences = df.sentence.values
labels = df.label.values

In [8]:
from transformers import BertTokenizer# Load the BERT tokenizer.

In [9]:
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True)

Loading BERT tokenizer...


In [10]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []# For every sentence...
for sent in sentences:
    # `encode` will:
    # (1) Tokenize the sentence.
    # (2) Prepend the `[CLS]` token to the start.
    # (3) Append the `[SEP]` token to the end.
    # (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
        sent, # Sentence to encode.
        add_special_tokens = True, # Add '[CLS]' and '[SEP]' # This function also supports truncation and conversion
    # to pytorch tensors, but we need to do padding, so we
    # can't use these features :( .
    #max_length = 128, # Truncate all sentences.
    #return_tensors = 'pt', # Return pytorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  Our friends won't buy this analysis, let alone the next one we propose.
Token IDs: [101, 1412, 2053, 1281, 112, 189, 4417, 1142, 3622, 117, 1519, 2041, 1103, 1397, 1141, 1195, 17794, 119, 102]


In [11]:
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

Max sentence length:  47


In [14]:
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences # Set the maximum sequence length.
# I've chosen 64 somewhat arbitrarily. It's slightly larger than the
# maximum training sentence length of 47...
MAX_LEN = 64
print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id)) # Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
print('\Done.')



Padding/truncating all sentences to 64 values...

Padding token: "[PAD]", ID: 0
\Done.
(8551, 64)


In [15]:
# Create attention masks
attention_masks = []# For each sentence...
for sent in input_ids:

    # Create the attention mask.
    # - If a token ID is 0, then it's padding, set the mask to 0.
    # - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]

    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

In [18]:
# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split # Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1)

In [20]:
# Convert all inputs and labels into torch tensors, the required datatype 
# for our model.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [22]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here.

# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32
batch_size = 32
# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)# Create the DataLoader for our validation set.

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [23]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top.

model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
    # You can increase this for multi-class tasks. 
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# Tell pytorch to run this model on the GPU.
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=361.0, style=ProgressStyle(description_…




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [34]:
import data
import task
from tqdm import tqdm

In [43]:
from transformers import BertTokenizer, BertForSequenceClassification, WordpieceTokenizer

In [47]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased')
LAYER_COUNT = 12
FEATURE_COUNT = 768
train_raw_text = 'my_tests/ptb3-wsj-dev.raw'

In [51]:
for index, line in enumerate(open(train_raw_text)):
    line = line.strip() # Remove trailing characters
    line = '[CLS] ' + line + ' [SEP]'
    tokenized_text = tokenizer.wordpiece_tokenizer.tokenize(line)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segment_ids = [1 for x in tokenized_text]
  
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segment_ids])
  
    with torch.no_grad():
        output = model(input_ids=tokens_tensor, attention_mask=segments_tensors)
        import pdb; pdb.set_trace()
        print(output)
    np.vstack([np.array(x) for x in encoded_layers])

> <ipython-input-51-4140468ee1d2>(15)<module>()
-> print(output)
(Pdb) print(output)
(tensor([[-0.0624, -0.1485]]),)
(Pdb) output.shape
*** AttributeError: 'tuple' object has no attribute 'shape'
(Pdb) q


BdbQuit: 