In [None]:
import torch

# Choose the compute device once; later cells move the model and the
# input batches onto whatever `device` ends up being.
if not torch.cuda.is_available():
    # PyTorch cannot see a CUDA device: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [None]:
!pip install transformers



In [None]:
from transformers import BertTokenizer
from transformers import BertModel

# Load the tokenizer and the encoder weights from the same pretrained
# checkpoint name.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
# Move the model to the device selected in the first cell. Unlike
# model.cuda(), this also works on the CPU fallback path, where .cuda()
# would raise. `.to()` returns the module, so the cell still shows its repr.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
# Smoke test: encode one sentence (with the tokenizer's special tokens) and
# add a leading batch dimension so the model receives a batch of size 1.
input_ids = torch.tensor(tokenizer.encode("Hello, my cat is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1

# This is inference only, so disable autograd: no graph is recorded and no
# activations are kept alive for a backward pass that never happens.
with torch.no_grad():
    outputs = model(input_ids.to(device))

In [None]:
# Shape of the first model output as a plain tuple, read straight from the
# tensor so no device-to-host copy is needed just to inspect it.
tuple(outputs[0].size())

(1, 8, 768)

In [None]:
import pandas as pd
# Load the review dataset; the path is relative to the kernel's working
# directory.
data_pd = pd.read_csv("IMDB Dataset.csv")

# Encode the textual sentiment as a binary target: positive -> 1, negative -> 0.
sentiment_to_label = {'positive': 1, 'negative': 0}
data_pd["label"] = data_pd["sentiment"].map(sentiment_to_label)

# Preview the first rows to check the new column.
data_pd.head()

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [None]:
from tqdm import tqdm, tqdm_notebook

# Register `progress_apply` on pandas objects. The previous call passed the
# tqdm_notebook class as a positional argument, which is the deprecated
# calling form and is rejected by current tqdm releases; calling it with no
# arguments works on both old and new versions.
tqdm.pandas()

# Maximum number of token ids kept per review (special tokens included).
MAX_LEN = 300

# Tokenize every review into a list of vocabulary ids, letting the tokenizer
# add its special tokens and cap each sequence at MAX_LEN ids.
input_ids = [
    tokenizer.encode(sentence, add_special_tokens=True, max_length=MAX_LEN)
    for sentence in tqdm(data_pd["review"])
]

from keras.preprocessing.sequence import pad_sequences

# Right-pad every sequence with 0 up to MAX_LEN so all rows have the same
# length; anything still longer than MAX_LEN is cut at the end ("post").
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")



  from pandas import Panel
100%|██████████| 50000/50000 [02:20<00:00, 354.80it/s]
Using TensorFlow backend.


In [None]:
# Build one attention mask per padded sequence:
#   - 1 where the token id is > 0 (a real token),
#   - 0 where the token id is 0 (padding).
attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in input_ids
]

In [None]:
# Keep only the first 10,000 examples and convert their padded token ids
# and attention masks into torch tensors for the model.
head = slice(0, 10000)

data_masks = torch.tensor(attention_masks[head])
data_inputs = torch.tensor(input_ids[head])


In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of sequences sent through the model per forward pass.
batch_size = 16

# Pair each row of token ids with its attention mask, then iterate the pairs
# in their original order (sequential sampler) so the extracted features stay
# aligned with the rows they came from.
predict_data = TensorDataset(data_inputs, data_masks)
predict_sampler = SequentialSampler(predict_data)
predict_dataloader = DataLoader(
    predict_data,
    batch_size=batch_size,
    sampler=predict_sampler,
)



In [None]:
import numpy as np

# Switch the model to evaluation mode before extracting features.
model.eval()

# One NumPy array per batch; the batches are stacked in a later cell.
bert_output = []

for batch in tqdm(predict_dataloader):
    # Move the (token ids, attention mask) pair onto the selected device.
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # Forward pass only; no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
        )

    # Bring the first model output back to host memory as a NumPy array.
    hidden_states = outputs[0].detach().cpu().numpy()

    # Keep only position 0 of every sequence, i.e. the vector at the leading
    # special token added by the tokenizer, as the feature for that review.
    bert_output.append(hidden_states[:, 0, :])
    





  0%|          | 0/625 [00:00<?, ?it/s][A[A[A[A



  0%|          | 1/625 [00:00<01:24,  7.40it/s][A[A[A[A



  0%|          | 2/625 [00:00<01:21,  7.61it/s][A[A[A[A



  0%|          | 3/625 [00:00<01:19,  7.82it/s][A[A[A[A



  1%|          | 4/625 [00:00<01:18,  7.93it/s][A[A[A[A



  1%|          | 5/625 [00:00<01:17,  8.00it/s][A[A[A[A



  1%|          | 6/625 [00:00<01:16,  8.06it/s][A[A[A[A



  1%|          | 7/625 [00:00<01:16,  8.10it/s][A[A[A[A



  1%|▏         | 8/625 [00:00<01:15,  8.13it/s][A[A[A[A



  1%|▏         | 9/625 [00:01<01:15,  8.15it/s][A[A[A[A



  2%|▏         | 10/625 [00:01<01:15,  8.16it/s][A[A[A[A



  2%|▏         | 11/625 [00:01<01:15,  8.17it/s][A[A[A[A



  2%|▏         | 12/625 [00:01<01:15,  8.17it/s][A[A[A[A



  2%|▏         | 13/625 [00:01<01:14,  8.18it/s][A[A[A[A



  2%|▏         | 14/625 [00:01<01:14,  8.17it/s][A[A[A[A



  2%|▏         | 15/625 [00:01<01:14,  8.18it/s][A[A

 20%|██        | 127/625 [00:15<01:01,  8.15it/s][A[A[A[A



 20%|██        | 128/625 [00:15<01:00,  8.15it/s][A[A[A[A



 21%|██        | 129/625 [00:15<01:00,  8.15it/s][A[A[A[A



 21%|██        | 130/625 [00:15<01:00,  8.15it/s][A[A[A[A



 21%|██        | 131/625 [00:16<01:00,  8.15it/s][A[A[A[A



 21%|██        | 132/625 [00:16<01:00,  8.15it/s][A[A[A[A



 21%|██▏       | 133/625 [00:16<01:00,  8.15it/s][A[A[A[A



 21%|██▏       | 134/625 [00:16<01:00,  8.18it/s][A[A[A[A



 22%|██▏       | 135/625 [00:16<00:59,  8.17it/s][A[A[A[A



 22%|██▏       | 136/625 [00:16<00:59,  8.15it/s][A[A[A[A



 22%|██▏       | 137/625 [00:16<00:59,  8.15it/s][A[A[A[A



 22%|██▏       | 138/625 [00:16<00:59,  8.14it/s][A[A[A[A



 22%|██▏       | 139/625 [00:17<00:59,  8.14it/s][A[A[A[A



 22%|██▏       | 140/625 [00:17<00:59,  8.13it/s][A[A[A[A



 23%|██▎       | 141/625 [00:17<00:59,  8.13it/s][A[A[A[A



 23%|██▎       | 142/625 

 40%|████      | 253/625 [00:31<00:45,  8.12it/s][A[A[A[A



 41%|████      | 254/625 [00:31<00:45,  8.12it/s][A[A[A[A



 41%|████      | 255/625 [00:31<00:45,  8.12it/s][A[A[A[A



 41%|████      | 256/625 [00:31<00:45,  8.12it/s][A[A[A[A



 41%|████      | 257/625 [00:31<00:45,  8.11it/s][A[A[A[A



 41%|████▏     | 258/625 [00:31<00:45,  8.11it/s][A[A[A[A



 41%|████▏     | 259/625 [00:31<00:45,  8.10it/s][A[A[A[A



 42%|████▏     | 260/625 [00:31<00:44,  8.11it/s][A[A[A[A



 42%|████▏     | 261/625 [00:32<00:44,  8.11it/s][A[A[A[A



 42%|████▏     | 262/625 [00:32<00:44,  8.12it/s][A[A[A[A



 42%|████▏     | 263/625 [00:32<00:44,  8.11it/s][A[A[A[A



 42%|████▏     | 264/625 [00:32<00:44,  8.11it/s][A[A[A[A



 42%|████▏     | 265/625 [00:32<00:44,  8.11it/s][A[A[A[A



 43%|████▎     | 266/625 [00:32<00:44,  8.11it/s][A[A[A[A



 43%|████▎     | 267/625 [00:32<00:44,  8.11it/s][A[A[A[A



 43%|████▎     | 268/625 

 61%|██████    | 379/625 [00:46<00:30,  8.09it/s][A[A[A[A



 61%|██████    | 380/625 [00:46<00:30,  8.08it/s][A[A[A[A



 61%|██████    | 381/625 [00:46<00:30,  8.08it/s][A[A[A[A



 61%|██████    | 382/625 [00:46<00:30,  8.08it/s][A[A[A[A



 61%|██████▏   | 383/625 [00:47<00:29,  8.08it/s][A[A[A[A



 61%|██████▏   | 384/625 [00:47<00:29,  8.08it/s][A[A[A[A



 62%|██████▏   | 385/625 [00:47<00:29,  8.08it/s][A[A[A[A



 62%|██████▏   | 386/625 [00:47<00:29,  8.08it/s][A[A[A[A



 62%|██████▏   | 387/625 [00:47<00:29,  8.08it/s][A[A[A[A



 62%|██████▏   | 388/625 [00:47<00:29,  8.07it/s][A[A[A[A



 62%|██████▏   | 389/625 [00:47<00:29,  8.08it/s][A[A[A[A



 62%|██████▏   | 390/625 [00:47<00:29,  8.08it/s][A[A[A[A



 63%|██████▎   | 391/625 [00:48<00:28,  8.08it/s][A[A[A[A



 63%|██████▎   | 392/625 [00:48<00:28,  8.08it/s][A[A[A[A



 63%|██████▎   | 393/625 [00:48<00:28,  8.08it/s][A[A[A[A



 63%|██████▎   | 394/625 

 81%|████████  | 505/625 [01:02<00:14,  8.06it/s][A[A[A[A



 81%|████████  | 506/625 [01:02<00:14,  8.06it/s][A[A[A[A



 81%|████████  | 507/625 [01:02<00:14,  8.07it/s][A[A[A[A



 81%|████████▏ | 508/625 [01:02<00:14,  8.06it/s][A[A[A[A



 81%|████████▏ | 509/625 [01:02<00:14,  8.06it/s][A[A[A[A



 82%|████████▏ | 510/625 [01:02<00:14,  8.06it/s][A[A[A[A



 82%|████████▏ | 511/625 [01:02<00:14,  8.06it/s][A[A[A[A



 82%|████████▏ | 512/625 [01:03<00:14,  8.06it/s][A[A[A[A



 82%|████████▏ | 513/625 [01:03<00:13,  8.07it/s][A[A[A[A



 82%|████████▏ | 514/625 [01:03<00:13,  8.07it/s][A[A[A[A



 82%|████████▏ | 515/625 [01:03<00:13,  8.07it/s][A[A[A[A



 83%|████████▎ | 516/625 [01:03<00:13,  8.07it/s][A[A[A[A



 83%|████████▎ | 517/625 [01:03<00:13,  8.06it/s][A[A[A[A



 83%|████████▎ | 518/625 [01:03<00:13,  8.06it/s][A[A[A[A



 83%|████████▎ | 519/625 [01:03<00:13,  8.06it/s][A[A[A[A



 83%|████████▎ | 520/625 

In [None]:
# Stack the per-batch feature arrays into a single (n_samples, n_features)
# matrix.
X_data = np.vstack(bert_output)

# Take exactly as many labels as there are feature rows instead of repeating
# the hard-coded subset size, so features and labels cannot drift apart if
# the number of encoded reviews changes upstream.
y = data_pd["label"].values[:X_data.shape[0]]
assert len(y) == X_data.shape[0], "features and labels are misaligned"

In [None]:
from sklearn.model_selection import train_test_split
# First split: hold out 30% of the data as the test set, stratified on the
# label so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Second split: carve 20% of the remaining training data out for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Sanity check of the resulting partition sizes.
print(X_train.shape, X_test.shape, X_val.shape)


(5600, 768) (3000, 768) (1400, 768)


In [None]:
# Import only the layers that are used instead of `import *`, which floods
# the notebook namespace and hides where names come from.
from keras.layers import Dense, Input
from keras.models import Model
from keras.callbacks import ModelCheckpoint


# Small feed-forward classifier on top of the 768-dimensional BERT features:
# 768 -> 250 (sigmoid) -> 1 (sigmoid).
input_layer = Input(shape=(768,))
hidden_dense_layer = Dense(250,activation="sigmoid")(input_layer)
dense_layer = Dense(1,activation="sigmoid")(hidden_dense_layer)

# NOTE: this rebinds `model`, which until now referred to the BERT encoder.
# Re-running the feature-extraction cell after this one requires reloading BERT.
model = Model(inputs=input_layer, outputs=dense_layer)

# The metric is registered as "acc" so that it is logged as "acc"/"val_acc"
# on every Keras version. With metrics=["accuracy"], Keras >= 2.3 logs
# "val_accuracy", so a checkpoint monitoring "val_acc" is never written.
model.compile(loss='binary_crossentropy', optimizer="adam",metrics=["acc"])

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None" line).
model.summary()

# Keep only the weights of the epoch with the highest validation accuracy.
mc = ModelCheckpoint("best_checkpoint.h5", monitor="val_acc", mode="max",
                     save_best_only=True, save_weights_only=True)

# NOTE(review): after fit() the in-memory model holds the last-epoch weights;
# load "best_checkpoint.h5" explicitly if the best epoch should be evaluated.
model.fit(X_train,y_train,validation_data=(X_val,y_val), epochs= 50, batch_size=50, callbacks=[mc])


Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 768)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               192250    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 251       
Total params: 192,501
Trainable params: 192,501
Non-trainable params: 0
_________________________________________________________________
None
Train on 5600 samples, validate on 1400 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epo

<keras.callbacks.callbacks.History at 0x7f2e087c4410>

In [None]:
# Score the classifier on the held-out test split. The result lists the loss
# followed by the metric passed to compile(). This uses the weights currently
# in memory; the saved checkpoint is not reloaded here.
model.evaluate(x=X_test, y=y_test)



[0.3936431167125702, 0.8453333377838135]