In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")

train = pd.read_csv('train_spam.csv', encoding='utf8')
train

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...
...,...,...
16273,spam,if you are interested in binary options tradin...
16274,spam,dirty pictureblyk on aircel thanks you for bei...
16275,ham,or you could do this g on mon 1635465 sep 1635...
16276,ham,insta reels par 80 गंद bhara pada hai 👀 kuch b...


In [2]:
train["text_type"] = train["text_type"].apply(lambda x: 1 if x=='spam' else 0)
X, y = train["text"], train["text_type"]

In [3]:
import torch

device = torch.device('cuda')

In [4]:
X_bert = X.copy().apply(lambda x: '[CLS] ' + x + ' [SEP]')
y_bert = y.copy()

In [5]:
from pytorch_transformers import BertTokenizer, BertConfig

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [6]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in X_bert]
tokenized_texts[0]

['[CLS]',
 'make',
 'sure',
 'alex',
 'knows',
 'his',
 'birthday',
 'is',
 'over',
 'in',
 'fifteen',
 'minutes',
 'as',
 'far',
 'as',
 'your',
 '##e',
 'concerned',
 '[SEP]']

In [7]:
from keras.preprocessing.sequence import pad_sequences

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(
    input_ids,
    maxlen=512,
    dtype='long',
    truncating='post',
    padding='post'
)
att_mask = np.array([[float(i>0) for i in seq] for seq in input_ids])

Token indices sequence length is longer than the specified maximum sequence length for this model (965 > 512). Running this sequence through the model will result in indexing errors


In [8]:
from sklearn.model_selection import train_test_split

train_inputs, val_inputs, y_train, y_val = train_test_split(input_ids, y_bert, test_size=0.01, random_state=0)
train_masks, val_masks, _, _ = train_test_split(att_mask, input_ids, test_size=0.01, random_state=0)

y_train.size, y_val.size

(16115, 163)

In [9]:
train_inputs, val_inputs, y_train, y_val, train_masks, val_masks =\
torch.tensor(train_inputs), torch.tensor(val_inputs), torch.tensor(y_train.values).reshape(-1, 1),\
torch.tensor(y_val.values).reshape(-1, 1), torch.tensor(train_masks), torch.tensor(val_masks)

In [10]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(val_inputs, val_masks, y_val)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [11]:
from pytorch_transformers import AdamW, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.1},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr = 1e-5)

In [13]:
from tqdm import tqdm, trange
from sklearn.metrics import roc_auc_score
from torch.optim.lr_scheduler import StepLR

train_loss_set = []
epochs = 2
scheduler = StepLR(optimizer, step_size=1, gamma=0.3)

for _ in range(epochs):
    model.train()

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(tqdm(train_dataloader, desc="Train")):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        train_loss_set.append(loss[0].item())
        loss[0].backward()
        optimizer.step()

        tr_loss += loss[0].item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    scheduler.step()
        
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    model.eval()

    preds, gts = [], []

    for batch in tqdm(validation_dataloader, desc="Val"):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = torch.sigmoid(logits[0])[:, 1].detach().cpu().tolist()
        label_ids = b_labels.to('cpu').tolist()

        preds += logits
        gts += label_ids

    print("Validation ROC-AUC: {}".format(roc_auc_score(gts, preds)))

Train: 100%|███████████████████████████████████████████████████████████████████████| 1008/1008 [11:49<00:00,  1.42it/s]


Train loss: 0.1460994447970445


Val: 100%|█████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.53it/s]


Validation ROC-AUC: 0.999379652605459


Train: 100%|███████████████████████████████████████████████████████████████████████| 1008/1008 [11:49<00:00,  1.42it/s]


Train loss: 0.046310543122152534


Val: 100%|█████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.54it/s]

Validation ROC-AUC: 0.999793217535153





In [15]:
torch.save(model.state_dict(), 'bert_99.pt')