In [1]:
import pandas as pd
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings, BertEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
import re
import string
import nltk
# English stopword list and Porter stemmer used by the text-cleaning step.
# On a fresh environment the NLTK 'stopwords' corpus may not be installed yet;
# NLTK raises LookupError in that case, so fetch it once and retry.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

## Reading the Dataset

In [2]:
# Fixed seed so the shuffle -- and therefore the positional train/test/dev
# split written out further down -- is identical on every Restart & Run All.
RANDOM_SEED = 42

# Read the raw file, shuffle all rows, then drop exact duplicate rows
# (the first occurrence in shuffled order is kept).
data = (
    pd.read_csv("./spam.csv", encoding='latin-1')
    .sample(frac=1, random_state=RANDOM_SEED)
    .drop_duplicates()
)
# Keep only the label/message columns and give them descriptive names.
data = data[['v1', 'v2']].rename(columns={"v1": "label", "v2": "text"})
data.head()

Unnamed: 0,label,text
4044,ham,If You mean the website. Yes.
3849,ham,I to am looking forward to all the sex cuddlin...
1714,ham,Yeah I don't see why not
2241,ham,U buy newspapers already?
3582,ham,I sent your maga that money yesterday oh.


### Data Cleaning
#### Remove punctuation and stopwords, then stem (Porter stemmer)

In [3]:
def clean_text(text, stemmer=None, stop_words=None):
    """Normalize a message: strip punctuation, lowercase, drop stopwords, stem.

    Parameters
    ----------
    text : str
        Raw message text.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to every kept token. Defaults to the notebook-level
        ``ps``.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the notebook-level ``stopwords``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces, with no leading or
        trailing whitespace. Empty if nothing survives the filtering.
    """
    if stemmer is None:
        stemmer = ps
    if stop_words is None:
        stop_words = stopwords
    # Hash-based lookups instead of scanning a list / string for every token.
    stop_words = frozenset(stop_words)
    punctuation = frozenset(string.punctuation)

    # NOTE(review): punctuation is stripped *before* the stopword check, so a
    # contraction such as "don't" becomes "dont" and is only dropped if that
    # exact form is in the stopword list -- confirm this is intended.
    text = "".join(ch.lower() for ch in text if ch not in punctuation)

    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)

    # `tok` is falsy for the empty strings re.split yields at the edges of the
    # text; skipping them avoids stray leading/trailing spaces in the result.
    return " ".join(
        stemmer.stem(tok) for tok in tokens if tok and tok not in stop_words
    )

# Add a normalized copy of every message next to the raw text; the function is
# passed directly since wrapping it in a lambda adds nothing.
data['cleaned_text'] = data['text'].apply(clean_text)
data.head()

Unnamed: 0,label,text,cleaned_text
4044,ham,If You mean the website. Yes.,mean websit ye
3849,ham,I to am looking forward to all the sex cuddlin...,look forward sex cuddl two sleep
1714,ham,Yeah I don't see why not,yeah dont see
2241,ham,U buy newspapers already?,u buy newspap alreadi
3582,ham,I sent your maga that money yesterday oh.,sent maga money yesterday oh


In [4]:
#data=data.drop(["text"],axis=1)
#data.head()

## Train, Test and Dev Dataset Split

In [5]:
# Prefix every label with the '__label__' marker. Labels that already carry the
# prefix are left alone, so re-running this cell does not produce
# '__label____label__ham'.
LABEL_PREFIX = '__label__'
labels = data['label'].astype(str)
data['label'] = labels.where(labels.str.startswith(LABEL_PREFIX), LABEL_PREFIX + labels)

# Positional 80% / 10% / 10% split of the (already shuffled) rows into
# train / test / dev. Boundaries are computed once so the slices cannot drift.
n_rows = len(data)
train_end = int(n_rows * 0.8)
test_end = int(n_rows * 0.9)

# NOTE(review): every column of `data` is written, so each exported line carries
# both the raw `text` and `cleaned_text` fields -- confirm that is intended.
csv_kwargs = dict(sep='\t', index=False, header=False)
data.iloc[:train_end].to_csv('train.csv', **csv_kwargs)
data.iloc[train_end:test_end].to_csv('test.csv', **csv_kwargs)
data.iloc[test_end:].to_csv('dev.csv', **csv_kwargs)

## Use BERT + Flair Embeddings and Train the Model

In [6]:
# Load the three split files written by the previous cell from the working directory.
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('./'),
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
)

# Token-level embeddings: BERT stacked with the fast forward/backward Flair
# language models. (Earlier variant used WordEmbeddings('glove') in place of BERT.)
word_embeddings = [
    BertEmbeddings(),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]

# Document-level embedding: an LSTM over the (re-projected) token embeddings.
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Single-label classifier over the labels found in the corpus.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False,
)

# Train for at most 100 epochs, using the working directory as the base path;
# the returned result is the cell's displayed output.
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=100)

  """Entry point for launching an IPython kernel.


2020-02-05 12:12:34,689 Reading data from .
2020-02-05 12:12:34,690 Train: train.csv
2020-02-05 12:12:34,691 Dev: dev.csv
2020-02-05 12:12:34,692 Test: test.csv


  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,
  after removing the cwd from sys.path.


2020-02-05 12:12:44,526 Computing label dictionary. Progress:


100%|██████████████████████████████████████████████████████████████████████████| 4135/4135 [00:00<00:00, 318123.32it/s]


2020-02-05 12:12:44,543 [b'ham', b'spam']
2020-02-05 12:12:44,548 ----------------------------------------------------------------------------------------------------
2020-02-05 12:12:44,551 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): BertEmbeddings(
        (model): BertModel(
          (embeddings): BertEmbeddings(
            (word_embeddings): Embedding(30522, 768, padding_idx=0)
            (position_embeddings): Embedding(512, 768)
            (token_type_embeddings): Embedding(2, 768)
            (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1)
          )
          (encoder): BertEncoder(
            (layer): ModuleList(
              (0): BertLayer(
                (attention): BertAttention(
                  (self): BertSelfAttention(
                    (query): Linear(in_features=768, out_features=768, bias=True)
   

2020-02-05 12:12:44,552 ----------------------------------------------------------------------------------------------------
2020-02-05 12:12:44,553 Corpus: "Corpus: 4135 train + 517 dev + 517 test sentences"
2020-02-05 12:12:44,554 ----------------------------------------------------------------------------------------------------
2020-02-05 12:12:44,555 Parameters:
2020-02-05 12:12:44,556  - learning_rate: "0.1"
2020-02-05 12:12:44,557  - mini_batch_size: "32"
2020-02-05 12:12:44,558  - patience: "3"
2020-02-05 12:12:44,559  - anneal_factor: "0.5"
2020-02-05 12:12:44,560  - max_epochs: "100"
2020-02-05 12:12:44,561  - shuffle: "True"
2020-02-05 12:12:44,561  - train_with_dev: "False"
2020-02-05 12:12:44,563 ----------------------------------------------------------------------------------------------------
2020-02-05 12:12:44,563 Model training base path: "."
2020-02-05 12:12:44,564 ----------------------------------------------------------------------------------------------------
2

2020-02-05 13:24:14,552 epoch 6 - iter 65/130 - loss 0.02946567 - samples/sec: 165.94
2020-02-05 13:24:17,693 epoch 6 - iter 78/130 - loss 0.02858129 - samples/sec: 197.63
2020-02-05 13:24:20,924 epoch 6 - iter 91/130 - loss 0.02539814 - samples/sec: 186.81
2020-02-05 13:24:23,934 epoch 6 - iter 104/130 - loss 0.02670084 - samples/sec: 211.71
2020-02-05 13:24:27,804 epoch 6 - iter 117/130 - loss 0.02652813 - samples/sec: 172.84
2020-02-05 13:24:30,590 ----------------------------------------------------------------------------------------------------
2020-02-05 13:24:30,591 EPOCH 6 done: loss 0.0291 - lr 0.1000
2020-02-05 13:24:33,019 DEV : loss 0.030898086726665497 - score 0.9884
2020-02-05 13:24:33,072 BAD EPOCHS (no improvement): 3
2020-02-05 13:24:33,075 ----------------------------------------------------------------------------------------------------
2020-02-05 13:24:33,215 epoch 7 - iter 0/130 - loss 0.03034712 - samples/sec: 3036.63
2020-02-05 13:24:36,146 epoch 7 - iter 13/13

2020-02-05 13:28:05,676 epoch 12 - iter 117/130 - loss 0.00564355 - samples/sec: 174.57
2020-02-05 13:28:08,285 ----------------------------------------------------------------------------------------------------
2020-02-05 13:28:08,286 EPOCH 12 done: loss 0.0053 - lr 0.0500
2020-02-05 13:28:10,774 DEV : loss 0.03522000089287758 - score 0.9923
2020-02-05 13:28:10,827 BAD EPOCHS (no improvement): 1
2020-02-05 13:28:12,107 ----------------------------------------------------------------------------------------------------
2020-02-05 13:28:12,244 epoch 13 - iter 0/130 - loss 0.00019333 - samples/sec: 3081.60
2020-02-05 13:28:16,628 epoch 13 - iter 13/130 - loss 0.00547391 - samples/sec: 189.27
2020-02-05 13:28:19,973 epoch 13 - iter 26/130 - loss 0.00454156 - samples/sec: 178.35
2020-02-05 13:28:23,165 epoch 13 - iter 39/130 - loss 0.00389808 - samples/sec: 192.33
2020-02-05 13:28:26,203 epoch 13 - iter 52/130 - loss 0.00352896 - samples/sec: 206.77
2020-02-05 13:28:30,025 epoch 13 - iter

2020-02-05 13:31:52,113 DEV : loss 0.04422296583652496 - score 0.9923
2020-02-05 13:31:52,165 BAD EPOCHS (no improvement): 3
2020-02-05 13:31:54,536 ----------------------------------------------------------------------------------------------------
2020-02-05 13:31:54,692 epoch 19 - iter 0/130 - loss 0.00067902 - samples/sec: 2719.10
2020-02-05 13:31:58,001 epoch 19 - iter 13/130 - loss 0.00070256 - samples/sec: 197.44
2020-02-05 13:32:01,035 epoch 19 - iter 26/130 - loss 0.00091576 - samples/sec: 211.07
2020-02-05 13:32:04,250 epoch 19 - iter 39/130 - loss 0.00089156 - samples/sec: 190.57
2020-02-05 13:32:07,812 epoch 19 - iter 52/130 - loss 0.00098588 - samples/sec: 162.70
2020-02-05 13:32:11,421 epoch 19 - iter 65/130 - loss 0.00119559 - samples/sec: 159.88
2020-02-05 13:32:14,554 epoch 19 - iter 78/130 - loss 0.00117425 - samples/sec: 199.82
2020-02-05 13:32:17,516 epoch 19 - iter 91/130 - loss 0.00107266 - samples/sec: 212.36
2020-02-05 13:32:20,623 epoch 19 - iter 104/130 - loss

2020-02-05 13:35:35,619 epoch 25 - iter 0/130 - loss 0.00120317 - samples/sec: 2461.66
2020-02-05 13:35:38,972 epoch 25 - iter 13/130 - loss 0.00030946 - samples/sec: 181.35
2020-02-05 13:35:42,570 epoch 25 - iter 26/130 - loss 0.00070059 - samples/sec: 163.72
2020-02-05 13:35:45,409 epoch 25 - iter 39/130 - loss 0.00080749 - samples/sec: 235.04
2020-02-05 13:35:48,486 epoch 25 - iter 52/130 - loss 0.00087338 - samples/sec: 199.05
2020-02-05 13:35:51,378 epoch 25 - iter 65/130 - loss 0.00085638 - samples/sec: 221.40
2020-02-05 13:35:54,635 epoch 25 - iter 78/130 - loss 0.00110601 - samples/sec: 192.42
2020-02-05 13:35:57,611 epoch 25 - iter 91/130 - loss 0.00110712 - samples/sec: 215.33
2020-02-05 13:36:00,882 epoch 25 - iter 104/130 - loss 0.00114664 - samples/sec: 186.55
2020-02-05 13:36:04,083 epoch 25 - iter 117/130 - loss 0.00112733 - samples/sec: 189.44
2020-02-05 13:36:06,884 ----------------------------------------------------------------------------------------------------
202

2020-02-05 13:39:27,674 epoch 31 - iter 52/130 - loss 0.00112803 - samples/sec: 203.13
2020-02-05 13:39:31,525 epoch 31 - iter 65/130 - loss 0.00096145 - samples/sec: 196.61
2020-02-05 13:39:34,456 epoch 31 - iter 78/130 - loss 0.00089758 - samples/sec: 215.78
2020-02-05 13:39:37,442 epoch 31 - iter 91/130 - loss 0.00100512 - samples/sec: 210.43
2020-02-05 13:39:40,484 epoch 31 - iter 104/130 - loss 0.00100192 - samples/sec: 200.97
2020-02-05 13:39:43,579 epoch 31 - iter 117/130 - loss 0.00096092 - samples/sec: 202.54
2020-02-05 13:39:47,099 ----------------------------------------------------------------------------------------------------
2020-02-05 13:39:47,101 EPOCH 31 done: loss 0.0009 - lr 0.0031
2020-02-05 13:39:49,604 DEV : loss 0.04760785773396492 - score 0.9923
Epoch    30: reducing learning rate of group 0 to 1.5625e-03.
2020-02-05 13:39:49,659 BAD EPOCHS (no improvement): 4
2020-02-05 13:39:50,822 -----------------------------------------------------------------------------

2020-02-05 13:43:16,851 epoch 37 - iter 91/130 - loss 0.00099564 - samples/sec: 186.64
2020-02-05 13:43:19,981 epoch 37 - iter 104/130 - loss 0.00102099 - samples/sec: 201.95
2020-02-05 13:43:23,145 epoch 37 - iter 117/130 - loss 0.00101857 - samples/sec: 195.22
2020-02-05 13:43:26,525 ----------------------------------------------------------------------------------------------------
2020-02-05 13:43:26,527 EPOCH 37 done: loss 0.0011 - lr 0.0008
2020-02-05 13:43:28,947 DEV : loss 0.04785548523068428 - score 0.9923
2020-02-05 13:43:29,000 BAD EPOCHS (no improvement): 2
2020-02-05 13:43:30,568 ----------------------------------------------------------------------------------------------------
2020-02-05 13:43:30,701 epoch 38 - iter 0/130 - loss 0.00005391 - samples/sec: 3200.84
2020-02-05 13:43:35,394 epoch 38 - iter 13/130 - loss 0.00135510 - samples/sec: 187.34
2020-02-05 13:43:39,010 epoch 38 - iter 26/130 - loss 0.00135449 - samples/sec: 158.87
2020-02-05 13:43:44,267 epoch 38 - ite

2020-02-05 13:47:13,609 EPOCH 43 done: loss 0.0010 - lr 0.0004
2020-02-05 13:47:16,016 DEV : loss 0.047963835299015045 - score 0.9923
Epoch    42: reducing learning rate of group 0 to 1.9531e-04.
2020-02-05 13:47:16,073 BAD EPOCHS (no improvement): 4
2020-02-05 13:47:17,453 ----------------------------------------------------------------------------------------------------
2020-02-05 13:47:17,641 epoch 44 - iter 0/130 - loss 0.00218877 - samples/sec: 2248.75
2020-02-05 13:47:22,526 epoch 44 - iter 13/130 - loss 0.00077225 - samples/sec: 186.47
2020-02-05 13:47:25,637 epoch 44 - iter 26/130 - loss 0.00096878 - samples/sec: 194.49
2020-02-05 13:47:30,758 epoch 44 - iter 39/130 - loss 0.00115960 - samples/sec: 199.34
2020-02-05 13:47:33,506 epoch 44 - iter 52/130 - loss 0.00108847 - samples/sec: 234.38
2020-02-05 13:47:36,872 epoch 44 - iter 65/130 - loss 0.00106321 - samples/sec: 208.95
2020-02-05 13:47:40,085 epoch 44 - iter 78/130 - loss 0.00107350 - samples/sec: 190.66
2020-02-05 13:4

  result = unpickler.load()


2020-02-05 13:56:26,060 0.9787	0.9787	0.9787
2020-02-05 13:56:26,062 
MICRO_AVG: acc 0.9583 - f1-score 0.9787
MACRO_AVG: acc 0.9047 - f1-score 0.94855
ham        tp: 451 - fp: 8 - fn: 3 - tn: 55 - precision: 0.9826 - recall: 0.9934 - accuracy: 0.9762 - f1-score: 0.9880
spam       tp: 55 - fp: 3 - fn: 8 - tn: 451 - precision: 0.9483 - recall: 0.8730 - accuracy: 0.8333 - f1-score: 0.9091
2020-02-05 13:56:26,063 ----------------------------------------------------------------------------------------------------


{'test_score': 0.9787,
 'dev_score_history': [0.9787,
  0.9884,
  0.9903,
  0.9865,
  0.9903,
  0.9884,
  0.9903,
  0.9903,
  0.9903,
  0.9903,
  0.9923,
  0.9923,
  0.9903,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923,
  0.9923],
 'train_loss_history': [0.14099423999163824,
  0.06762746933829755,
  0.04583877539620376,
  0.04146887439486678,
  0.03401835021885255,
  0.029118352060994276,
  0.022270211470850673,
  0.013809197267087606,
  0.011601482939659036,
  0.008063341164961458,
  0.007215783825421778,
  0.005273395067510697,
  0.003954336989241151,
  0.0041253711638721425,
  0.0028007527158339283,
  0.0018558785946180042,
  0.001759871122025148,
  0.002248157912883751,
  0.0012722291891875033,
  0.0016