In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

train = pd.read_csv('train_spam.csv', encoding='utf8')
train

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...
...,...,...
16273,spam,if you are interested in binary options tradin...
16274,spam,dirty pictureblyk on aircel thanks you for bei...
16275,ham,or you could do this g on mon 1635465 sep 1635...
16276,ham,insta reels par 80 गंद bhara pada hai 👀 kuch b...


In [2]:
train["text_type"] = train["text_type"].apply(lambda x: 1 if x=='spam' else 0)
X, y = train["text"], train["text_type"]

In [3]:
import torch

device = torch.device('cuda')

In [4]:
X_bart = X.copy()
y_bart = y.copy()

In [5]:
from transformers import BartTokenizer, BartConfig
import torch

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', do_lower_case=True)

In [6]:
from keras.preprocessing.sequence import pad_sequences

def preprocess_text(text):
    return tokenizer.encode_plus(text, add_special_tokens=True, max_length=1024, truncation='only_first') #[:350]

tokenized_texts = X_bart.apply(preprocess_text)

input_ids = [x['input_ids'] for x in tokenized_texts]
input_ids = pad_sequences(
    input_ids,
    maxlen=1024,
    dtype='long',
    truncating='post',
    padding='post'
)

att_mask = np.array([[float(i>0) for i in seq] for seq in input_ids])

In [7]:
from sklearn.model_selection import train_test_split

train_inputs, val_inputs, y_train, y_val = train_test_split(input_ids, y_bart, test_size=0.3, random_state=0)
train_masks, val_masks, _, _ = train_test_split(att_mask, input_ids, test_size=0.3, random_state=0)

In [8]:
train_inputs, val_inputs, y_train, y_val, train_masks, val_masks =\
torch.tensor(train_inputs), torch.tensor(val_inputs), torch.tensor(y_train.values).reshape(-1, 1),\
torch.tensor(y_val.values).reshape(-1, 1), torch.tensor(train_masks), torch.tensor(val_masks)

In [9]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 4

train_data = TensorDataset(train_inputs, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(val_inputs, val_masks, y_val)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [10]:
from transformers import AdamW, BartForSequenceClassification

model = BartForSequenceClassification.from_pretrained('facebook/bart-base', num_labels=2)
model.to(device)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.dense.bias', 'classification_head.out_proj.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps

In [11]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.1},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr = 1e-5)

In [12]:
from tqdm import tqdm, trange
from sklearn.metrics import roc_auc_score
from torch.optim.lr_scheduler import StepLR

train_loss_set = []
epochs = 2
scheduler = StepLR(optimizer, step_size=1, gamma=0.3)

for _ in range(epochs):
    model.train()

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(tqdm(train_dataloader, desc="Train")):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        train_loss_set.append(loss[0].item())
        loss[0].backward()
        optimizer.step()

        tr_loss += loss[0].item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    scheduler.step()
        
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    model.eval()

    preds, gts = [], []

    for batch in tqdm(validation_dataloader, desc="Val"):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, attention_mask=b_input_mask)

        logits = torch.sigmoid(logits[0])[:, 1].detach().cpu().tolist()
        label_ids = b_labels.to('cpu').tolist()

        preds += logits
        gts += label_ids

    print("Validation ROC-AUC: {}".format(roc_auc_score(gts, preds)))

Train: 100%|███████████████████████████████████████████████████████████████████████| 2849/2849 [24:34<00:00,  1.93it/s]


Train loss: 0.17995940977179162


Val: 100%|█████████████████████████████████████████████████████████████████████████| 1221/1221 [03:26<00:00,  5.90it/s]


Validation ROC-AUC: 0.9895083507360187


Train: 100%|███████████████████████████████████████████████████████████████████████| 2849/2849 [24:35<00:00,  1.93it/s]


Train loss: 0.06579237257093831


Val: 100%|█████████████████████████████████████████████████████████████████████████| 1221/1221 [03:26<00:00,  5.90it/s]

Validation ROC-AUC: 0.9945599460814598





In [14]:
torch.save(model.state_dict(), 'bart.pt')