In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Embedding(50259, 768)

In [6]:
emails = pd.read_csv('spam_email_.csv')['email']

In [7]:
max_length = 1024

In [8]:
class emailsDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
            

    def __len__(self):
        
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [9]:
dataset = emailsDataset(emails, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [10]:
import gc
gc.collect()

181

In [11]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

In [12]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 4662
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 4662


Step,Training Loss
100,6.6344
200,2.1039
300,1.8473
400,1.8673
500,1.8118
600,1.7689
700,1.7654
800,1.564
900,1.4934
1000,1.5455




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4662, training_loss=1.6108506070549833, metrics={'train_runtime': 1909.8272, 'train_samples_per_second': 2.441, 'train_steps_per_second': 2.441, 'total_flos': 2436286906368000.0, 'train_loss': 1.6108506070549833, 'epoch': 1.0})

In [13]:
generated = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()

In [14]:
sample_outputs = model.generate(generated, do_sample=True, top_k=50,no_repeat_ngram_size=2, 
                                max_length=100, top_p=0.95, temperature=1, num_return_sequences=20)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [15]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


0:    I name =2E I am Mr. David H. Johnson from Iraq and a former National who now works as the Head of the Iraqi Ministry of Agriculture, I was recently named the Regional Head=2C of Iraqi Agricultural Promotion and the first Head in this department in 2005.  My duty of First and Next Department in 2003 was to negotiate a contract for the purchase of up to 4.5 million pounds of agricultural products to Iraq by the Government of Saddam Hussein
1:    --<eos> next eos>" useros>< bytes name=5D " addr=3A" =5F<br><em>Hello,  <br><br /><b>Good day.<br />=A0 <eol>This letter is a medium to send because I am contacting you through the Internet=20as it is very important to notify you that you have an opportunity to invest your
2:   =0D/M; =01 MUGABE.   <eos> alphabet unicode. < eos>< es character string> < e English text size>=20<!-- _><!-- This _word__ word__. _http://_rss.yahoo.com.pl<eoshouse=3D<br>http:/ews.bbc.co.uk/1
3:    - http://www.cnn.com/2006/WORLD/africa/01/12/26/liberia.crt -http: