In [1]:
# !pip install pytorch-pretrained-bert pytorch-nlp


In [4]:
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Col 1 : code representing the source of the sentence

Col 2 : acceptability judgement label(0=unacceptale, 1=acceptable)

Col 3 : acceptability judgement as notated by author

Col 4 : the sentence

In [5]:
df = pd.read_csv("in_domain_train.tsv", delimiter= "\t", header=None, 
names=['sentence_source','label','label_notes','sentence'])

In [6]:
df.shape

(8551, 4)

In [7]:
df.sample(5)


Unnamed: 0,sentence_source,label,label_notes,sentence
1849,r-67,1,,"We'll do it together, you and me."
7763,ad03,0,*,Who did you believe that to kiss seemed wrong?
1175,r-67,1,,Mary has never kissed a man who is taller than...
6183,c_13,1,,What do you think Matt kissed?
1860,r-67,0,*,I saw Mary and downtown yesterday your friend ...


In [8]:
# create sentence and label units
sentences = df.sentence.values

# add special tokens at the beginning and end of each sentence for BERT to work

sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values



##### Bert tokenizer , used to  convert text into tokens to BERT's vocab.

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print("Tokenize the first sentence")
print(tokenized_texts[0])

100%|██████████| 231508/231508 [00:02<00:00, 109198.49B/s]


Tokenize the first sentence
['[CLS]', 'our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']


In [10]:
sentences[0]

"[CLS] Our friends won't buy this analysis, let alone the next one we propose. [SEP]"

In [11]:
# Setting the maximum sequence length, The longest sequence in our training set is 47(original:512)
MAX_LEN = 128

In [12]:
input_ids= [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids

[[101,
  2256,
  2814,
  2180,
  1005,
  1056,
  4965,
  2023,
  4106,
  1010,
  2292,
  2894,
  1996,
  2279,
  2028,
  2057,
  16599,
  1012,
  102],
 [101,
  2028,
  2062,
  18404,
  2236,
  3989,
  1998,
  1045,
  1005,
  1049,
  3228,
  2039,
  1012,
  102],
 [101,
  2028,
  2062,
  18404,
  2236,
  3989,
  2030,
  1045,
  1005,
  1049,
  3228,
  2039,
  1012,
  102],
 [101,
  1996,
  2062,
  2057,
  2817,
  16025,
  1010,
  1996,
  13675,
  16103,
  2121,
  2027,
  2131,
  1012,
  102],
 [101, 2154, 2011, 2154, 1996, 8866, 2024, 2893, 14163, 8024, 3771, 1012, 102],
 [101, 1045, 1005, 2222, 8081, 2017, 1037, 4392, 1012, 102],
 [101, 5965, 27129, 1996, 4264, 4257, 1012, 102],
 [101, 3021, 19055, 2010, 2126, 2041, 1997, 1996, 4825, 1012, 102],
 [101, 2057, 1005, 2128, 5613, 1996, 2305, 2185, 1012, 102],
 [101, 11458, 25756, 1996, 3384, 4257, 1012, 102],
 [101, 1996, 4401, 4191, 1996, 2377, 2125, 1996, 2754, 1012, 102],
 [101, 1996, 8644, 10619, 5024, 1012, 102],
 [101, 3021, 4565, 2