In [14]:
#install hugging face transformer
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 17.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 62.0MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 55.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=2d0b964148452a4fd9

##initialize torch with GPU

In [2]:
import torch

if torch.cuda.is_available():
  device = torch.device("cuda")

  print(f"There are {torch.cuda.device_count()} GPU(s) available.")
  print(f'We will use the GPU - {torch.cuda.get_device_name(0)}')
else:
  print('No GPU available, using the CPU instead')
  device = torch.device("cpu")



There are 1 GPU(s) available.
We will use the GPU - Tesla T4


##Loading CoLA dataset

In [3]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=c84737face571859cd6ea47e70e0b279c87e59608dd880b84cb11b4229c8691c
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [8]:
import wget
import os

print("Downloading databset....")

url = "https://nyu-mll.github.io/CoLA/cola_public_1.1.zip"

wget.download(url,"./cola_public/cola_public_1.1.zip")

Downloading databset....


FileNotFoundError: ignored

In [9]:
if not os.path.exists("./cola_public/"):
  !unzip cola_public_1.1.zip

Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  


In [10]:
import pandas as pd
df = pd.read_csv("./cola_public/raw/in_domain_train.tsv", delimiter="\t", header=None, names=["sentence_source", "label", "label_notes", "sentence"])

#Report number of sentences
print(f'Number of training sentences: {df.shape[0]}')
df.sample(10)

Number of training sentences: 8551


Unnamed: 0,sentence_source,label,label_notes,sentence
2037,rhl07,1,,Jake kicked the ball halfway to Bill.
6513,g_81,1,,John gave the books to Mary and the records to...
8163,ad03,1,,Has Jenny eaten a cake?
4301,ks08,0,*,Stephen persuaded the cat to be out of the bag.
3279,l-93,1,,Cornelia lodged at Mrs. Parker's.
638,bc01,1,,Give the bottle to the baby full.
1716,r-67,0,*,A friend of mine and a girl who was from his h...
7734,ad03,1,,How did you eat the cake?
7322,sks13,0,*,I put the book.
5839,c_13,0,*,I've never seen him eats asparagus.


In [11]:
sentences = df.sentence.values
labels = df.label.values

In [15]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [17]:


#Tokenize all the sentences and map the tokens to their word IDs
input_ids = []

for sent in sentences:
  # encode will: 
  # (1) Tokenize the sentence
  # (2) Prepend the [CLS] token to the start
  # (3) Append the '[SEP] token to the end
  # (4) Map tokens to their IDS
  encoded_sent = tokenizer.encode(
            sent,
            add_special_tokens = True, # Add [CLS] and [SEP]
            )
  input_ids.append(encoded_sent)

print ("Original: ", sentences[0])
print('Token IDs: ', input_ids[0])

Original:  Our friends won't buy this analysis, let alone the next one we propose.
Token IDs:  [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102]


In [20]:
print("Max sentencs Length: ", max([len(sen) for sen in input_ids]))

Max sentencs Length:  47


In [22]:
#Padding with Keras libary
from keras.preprocessing.sequence import pad_sequences

# Set the maximum sentence length
MAX_LEN = 64

print(f"\nPadding/truncating all sentences to {MAX_LEN} values")
print(f"\nPadding token: '{tokenizer.pad_token}', ID: {tokenizer.pad_token_id}")

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")
print("\nDone")



Padding/truncating all sentences to 64 values

Padding token: '[PAD]', ID: 0

Done


In [23]:
print(input_ids[:10])

[[  101  2256  2814  2180  1005  1056  4965  2023  4106  1010  2292  2894
   1996  2279  2028  2057 16599  1012   102     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [  101  2028  2062 18404  2236  3989  1998  1045  1005  1049  3228  2039
   1012   102     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [  101  2028  2062 18404  2236  3989  2030  1045  1005  1049  3228  2039
   1012   102     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0   

In [27]:
#Attention mask
attention_masks = []

for sent in input_ids:
  #Create attention mask
  att_mask = [int(token_id > 0) for token_id in sent]
  attention_masks.append(att_mask)

print(attention_masks[:5])  

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


##Training & Vlidation Split

In [30]:
#Use train_test_split to split our data into train and validation sets for training

from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1)

# do the same for the mask
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1)

In [31]:
print(train_inputs[:2])
print(train_masks[:2])

[[  101  2002  2939  1996  3328  1012   102     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [  101  1998  2009  2001  2023  3043  2006  2029  1045 17535  2007  1996
   3472  1997  1996  7276  2837  1012   102     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]]
[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 

##Convert to PyTorch datatypes

In [32]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)



In [34]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

#Create the dataloader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

#Create dataloader for validation set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

