In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 12.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 56.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
 

In [2]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers

In [None]:
#CODE CHUNK 1: Download/demo Bio_ClinicalBERT


#Code guidance from: 
#
#
#For RoBERTA: https://huggingface.co/docs/transformers/model_doc/roberta


#Notes:
#This article indicates RoBERTA might be interested because it was trained 
# on a far larger CORPUS than the original bert
#https://towardsdatascience.com/bert-roberta-distilbert-xlnet-which-one-to-use-3d5ab82ba5f8

#General setup and model data: https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb

#Bio_ClinicalBERT help and description: https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT
#Note that this was trained on MIMIC III which could be helpful 

In [3]:
#from transformers import AutoTokenizer, AutoModel
tokenizer = ppb.AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = ppb.AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
#Test DF:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)
df.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [5]:
#Tokenize: 
tokenized = df[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
tokenized.head()

0    [101, 170, 20329, 117, 6276, 1105, 1921, 19920...
1    [101, 4547, 1231, 11192, 5521, 11813, 1121, 11...
2    [101, 1152, 3073, 22369, 1147, 3703, 192, 1186...
3    [101, 1142, 1110, 170, 19924, 15660, 187, 1408...
4    [101, 179, 7637, 22252, 2493, 1200, 112, 188, ...
Name: 0, dtype: object

In [23]:
#Create Padding:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
#How it works: tokenized.values gives an array of lists of ints of various lengths
#This goes through and says from the length of the list to max_len, fill in with 0
padded

array([[  101,   170, 20329, ...,     0,     0,     0],
       [  101,  4547,  1231, ...,     0,     0,     0],
       [  101,  1152,  3073, ...,     0,     0,     0],
       ...,
       [  101,  1103,  5444, ...,     0,     0,     0],
       [  101,   170,  5536, ...,     0,     0,     0],
       [  101,   170, 13108, ...,     0,     0,     0]])

In [7]:
#Convert to Tensor
input_ids = torch.tensor(np.array(padded))


In [8]:
#Put through model
with torch.no_grad():
    last_hidden_states = model(input_ids)

In [9]:
type(last_hidden_states)

transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions

In [10]:
last_hidden_states

BaseModelOutputWithPoolingAndCrossAttentions([('last_hidden_state',
                                               tensor([[[-0.0305, -0.1746, -0.5880,  ..., -0.3676,  1.1590, -0.8798],
                                                        [-0.6242,  0.1692, -0.8143,  ...,  0.0188,  0.4766, -0.3575],
                                                        [-0.5547, -0.1854, -0.4430,  ..., -0.4262,  0.6339, -0.8444],
                                                        ...,
                                                        [-0.4574, -0.4273, -0.7483,  ..., -0.5306,  0.7762, -0.6670],
                                                        [-0.4692, -0.3757, -0.6946,  ..., -0.5328,  0.7810, -0.6442],
                                                        [-0.4586, -0.5404, -0.6076,  ..., -0.6197,  0.8406, -0.6326]],
                                               
                                                       [[-0.1530, -0.0233, -0.8685,  ..., -0.5015,  0.8543, -0.336

In [None]:
#CHUNK 2: Same code but for RoBERTA: 

In [11]:
#For RoBERTA:
from transformers import RobertaConfig, RobertaModel
from transformers import RobertaTokenizer

#Tokenizer:
tokenizer_r = RobertaTokenizer.from_pretrained("roberta-base")

# Initializing a RoBERTa configuration
configuration = RobertaConfig()

# Initializing a model from the configuration
model_r = RobertaModel(configuration)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [24]:
tokenized_r = df[0].apply((lambda x: tokenizer_r.encode(x, add_special_tokens=True)))


In [25]:
max_len = 0
for i in tokenized_r.values:
    if len(i) > max_len:
        max_len = len(i)

padded_r = np.array([i + [0]*(max_len-len(i)) for i in tokenized_r.values])

padded_r

array([[    0,   102, 21881, ...,     0,     0,     0],
       [    0,  3340, 45314, ...,     0,     0,     0],
       [    0, 10010, 38337, ...,     0,     0,     0],
       ...,
       [    0,   627,  8543, ...,     0,     0,     0],
       [    0,   102,  3640, ...,     0,     0,     0],
       [    0,   102, 24234, ...,     0,     0,     0]])

In [26]:
input_ids_r = torch.tensor(np.array(padded_r))

In [27]:
with torch.no_grad():
    last_hidden_states = model_r(input_ids)

In [28]:
print(last_hidden_states)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.6869, -0.3610, -1.3810,  ..., -0.5707, -1.8487, -1.0593],
         [-2.3285,  0.9072, -1.3096,  ...,  0.4964, -2.6183, -0.8522],
         [-2.2499,  0.5145, -1.2987,  ..., -0.7342, -1.3347, -1.9598],
         ...,
         [-2.0939,  1.0266, -1.0485,  ..., -0.5820, -2.6113, -0.9403],
         [-2.2563,  0.9129, -0.9767,  ...,  0.2351, -2.1360, -1.1630],
         [-1.2229,  1.9630, -0.7914,  ..., -0.7446, -2.5538, -1.1960]],

        [[-1.2930, -0.4835, -2.3549,  ..., -1.1154, -1.8088, -1.9793],
         [-2.3069,  0.6606, -1.2617,  ...,  0.4700, -1.5296, -0.5666],
         [-1.0785,  1.6290, -1.1457,  ..., -0.1610, -2.0559, -2.5621],
         ...,
         [-2.7906,  0.7815, -0.7898,  ..., -0.3335, -3.2903, -1.2440],
         [-1.7762,  0.8416, -1.5144,  ...,  0.0457, -2.1890, -0.7942],
         [-2.5917,  0.0635,  0.0105,  ..., -1.9132, -1.3609, -0.9877]],

        [[-1.6591,  0.7043, -1.0362,  ..., -1.5686, -