# PyTorch Transformers
An exercise in using the PyTorch Transformers library (`pytorch_transformers`) from Hugging Face.

**Warning**: The following code is intended to be executed on Google Colab.

In [None]:
!pip install folium
!pip install urllib3
!pip install pytorch_transformers

## Models in PyTorch Transformers

In [1]:
# Load the pretrained 'bert-base-uncased' checkpoint into BertForMaskedLM
# and print the resulting module tree.
from pytorch_transformers import BertForMaskedLM

msk_model = BertForMaskedLM.from_pretrained(
    'bert-base-uncased'
)
print(msk_model)

100%|██████████| 433/433 [00:00<00:00, 81102.74B/s]
100%|██████████| 440473133/440473133 [01:20<00:00, 5454288.11B/s]


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [2]:
# Load the same 'bert-base-uncased' checkpoint into
# BertForSequenceClassification and print the resulting module tree.
from pytorch_transformers import BertForSequenceClassification

cls_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased'
)
print(cls_model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Set up the BERT model

In [3]:
# Fetch the configuration associated with 'bert-base-uncased' via BertConfig
# and print it; the printed form is a JSON document of the hyper-parameters.
from pytorch_transformers import BertConfig

config = BertConfig.from_pretrained(
    'bert-base-uncased'
)
print(config)

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



## Tokenization

In [4]:
from pytorch_transformers import BertTokenizer

# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sample sentence to split into tokens. In the recorded output the tokens
# come back lower-cased and the final period is its own token.
text = "I am a student at the University of Toronto."
words = tokenizer.tokenize(text)
print(words)

100%|██████████| 231508/231508 [00:00<00:00, 238482.16B/s]


['i', 'am', 'a', 'student', 'at', 'the', 'university', 'of', 'toronto', '.']
