In [1]:
import pandas as pd
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings, BertEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
import re
import string
import nltk
# Shared text-normalisation resources used by clean_text: a Porter stemmer
# instance and NLTK's English stopword list.
ps = nltk.stem.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')

## Reading the Dataset

In [2]:
# Read spam.csv, shuffle all rows, drop rows that are exact duplicates, then
# keep only the label/text columns under friendlier names.
# The shuffle is seeded: the train/test/dev files written later are plain row
# slices of this frame, so an unseeded shuffle would give a different split
# (and different scores) on every Restart & Run All.
RANDOM_SEED = 42
data = (
    pd.read_csv("./spam.csv", encoding='latin-1')
    .sample(frac=1, random_state=RANDOM_SEED)
    .drop_duplicates()
)
data = data[['v1', 'v2']].rename(columns={"v1": "label", "v2": "text"})
data.head()

Unnamed: 0,label,text
1100,ham,You busy or can I come by at some point and fi...
5315,ham,Hahaha..use your brain dear
1236,ham,How much are we getting?
951,ham,Shb b ok lor... Thanx...
1085,ham,FR'NDSHIP is like a needle of a clock. Though ...


### Data Cleaning
#### Remove punctuation and stopwords, then stem each word (Porter stemmer)

In [3]:
def clean_text(text):
    """Normalise one message string for classification.

    Steps, in order: delete every character found in ``string.punctuation``,
    lower-case what remains, split on runs of non-word characters, discard
    tokens present in the module-level ``stopwords`` list, and stem each
    surviving token with the module-level stemmer ``ps``.

    Args:
        text: the raw message (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces, with no leading or
        trailing whitespace; ``""`` when no token survives.
    """
    # Punctuation is deleted, not replaced by a space, so "a..b" collapses
    # to "ab". NOTE(review): this merges words such as "Hahaha..use" into one
    # token; confirm whether that is intended before changing it.
    text = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    # Raw string: '\W' inside a normal literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split yields '' when the text starts or ends with a separator; those
    # empty tokens are skipped so they do not become stray spaces in the output.
    return " ".join(ps.stem(tok) for tok in tokens if tok and tok not in stopwords)

# Store the normalised form of every message in a new column next to the raw
# text, then preview the first rows.
data['cleaned_text'] = data['text'].apply(clean_text)
data.head()

Unnamed: 0,label,text,cleaned_text
1100,ham,You busy or can I come by at some point and fi...,busi come point figur tomorrow
5315,ham,Hahaha..use your brain dear,hahahaus brain dear
1236,ham,How much are we getting?,much get
951,ham,Shb b ok lor... Thanx...,shb b ok lor thanx
1085,ham,FR'NDSHIP is like a needle of a clock. Though ...,frndship like needl clock though v r clock v r...


In [4]:
# Keep only the label and the cleaned text from here on.
# NOTE: running this cell a second time raises KeyError, because the "text"
# column is already gone after the first run.
data = data.drop(columns=["text"])
data.head()

Unnamed: 0,label,cleaned_text
1100,ham,busi come point figur tomorrow
5315,ham,hahahaus brain dear
1236,ham,much get
951,ham,shb b ok lor thanx
1085,ham,frndship like needl clock though v r clock v r...


## Train / Test / Dev Dataset Split

In [5]:
# Prefix every label with '__label__', then write consecutive row slices of
# the frame to tab-separated files without index or header:
# first 80% -> train.csv, next 10% -> test.csv, remaining rows -> dev.csv.
data['label'] = '__label__' + data['label'].astype(str)

n_rows = len(data)
train_end = int(n_rows * 0.8)
test_end = int(n_rows * 0.9)

splits = {
    'train.csv': data.iloc[:train_end],
    'test.csv': data.iloc[train_end:test_end],
    'dev.csv': data.iloc[test_end:],
}
for filename, frame in splits.items():
    frame.to_csv(filename, sep='\t', index=False, header=False)

## Use GloVe and Flair Embeddings and Train the Model

In [6]:
# Load train.csv / test.csv / dev.csv from the current directory as a flair
# classification corpus.
# NOTE(review): the saved output of this cell contains warning fragments;
# presumably deprecation notices from flair. Confirm against the installed
# flair version.
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('./'),
    test_file='test.csv',
    dev_file='dev.csv',
    train_file='train.csv',
)

# Token-level embeddings: GloVe word vectors plus forward and backward Flair
# language-model embeddings.
# Alternative kept from the original cell (BERT in place of GloVe):
#word_embeddings = [BertEmbeddings(), FlairEmbeddings('news-forward-fast'), FlairEmbeddings('news-backward-fast')]
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]

# Document-level encoder built on top of the token embeddings.
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Single-label classifier over the labels found in the corpus.
label_dictionary = corpus.make_label_dictionary()
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=False,
)

# Train for at most 100 epochs, writing artefacts to the current directory;
# the value returned by train() is the cell's displayed result.
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=100)

  """Entry point for launching an IPython kernel.


2020-02-05 12:11:34,496 Reading data from .
2020-02-05 12:11:34,497 Train: train.csv
2020-02-05 12:11:34,498 Dev: dev.csv
2020-02-05 12:11:34,499 Test: test.csv


  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,
  after removing the cwd from sys.path.


2020-02-05 12:11:40,781 Computing label dictionary. Progress:


100%|██████████████████████████████████████████████████████████████████████████| 4130/4130 [00:00<00:00, 344198.45it/s]


2020-02-05 12:11:40,807 [b'ham', b'spam']
2020-02-05 12:11:40,811 ----------------------------------------------------------------------------------------------------
2020-02-05 12:11:40,813 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU

2020-02-05 12:12:55,340 epoch 5 - iter 78/130 - loss 0.07078366 - samples/sec: 520.02
2020-02-05 12:12:56,384 epoch 5 - iter 91/130 - loss 0.07353638 - samples/sec: 458.67
2020-02-05 12:12:57,284 epoch 5 - iter 104/130 - loss 0.06773369 - samples/sec: 520.02
2020-02-05 12:12:58,197 epoch 5 - iter 117/130 - loss 0.06554612 - samples/sec: 518.08
2020-02-05 12:12:59,003 ----------------------------------------------------------------------------------------------------
2020-02-05 12:12:59,004 EPOCH 5 done: loss 0.0638 - lr 0.1000
2020-02-05 12:12:59,828 DEV : loss 0.03729836270213127 - score 0.9865
2020-02-05 12:12:59,848 BAD EPOCHS (no improvement): 3
2020-02-05 12:12:59,849 ----------------------------------------------------------------------------------------------------
2020-02-05 12:12:59,917 epoch 6 - iter 0/130 - loss 0.00584695 - samples/sec: 6303.49
2020-02-05 12:13:00,866 epoch 6 - iter 13/130 - loss 0.06112595 - samples/sec: 495.26
2020-02-05 12:13:01,914 epoch 6 - iter 26/130

2020-02-05 12:13:59,592 epoch 11 - iter 117/130 - loss 0.03831725 - samples/sec: 482.62
2020-02-05 12:14:00,330 ----------------------------------------------------------------------------------------------------
2020-02-05 12:14:00,331 EPOCH 11 done: loss 0.0401 - lr 0.0250
2020-02-05 12:14:01,140 DEV : loss 0.038682375103235245 - score 0.9845
2020-02-05 12:14:01,159 BAD EPOCHS (no improvement): 1
2020-02-05 12:14:01,161 ----------------------------------------------------------------------------------------------------
2020-02-05 12:14:01,211 epoch 12 - iter 0/130 - loss 0.01665125 - samples/sec: 8667.25
2020-02-05 12:14:02,158 epoch 12 - iter 13/130 - loss 0.03941630 - samples/sec: 488.86
2020-02-05 12:14:03,063 epoch 12 - iter 26/130 - loss 0.05276949 - samples/sec: 516.79
2020-02-05 12:14:04,096 epoch 12 - iter 39/130 - loss 0.04524029 - samples/sec: 459.69
2020-02-05 12:14:04,962 epoch 12 - iter 52/130 - loss 0.03841266 - samples/sec: 543.81
2020-02-05 12:14:05,875 epoch 12 - ite

2020-02-05 12:15:02,945 DEV : loss 0.045219484716653824 - score 0.9845
2020-02-05 12:15:02,965 BAD EPOCHS (no improvement): 3
2020-02-05 12:15:02,967 ----------------------------------------------------------------------------------------------------
2020-02-05 12:15:03,023 epoch 18 - iter 0/130 - loss 0.01824516 - samples/sec: 7704.16
2020-02-05 12:15:03,965 epoch 18 - iter 13/130 - loss 0.02105071 - samples/sec: 501.83
2020-02-05 12:15:04,800 epoch 18 - iter 26/130 - loss 0.02830622 - samples/sec: 566.78
2020-02-05 12:15:05,806 epoch 18 - iter 39/130 - loss 0.02507509 - samples/sec: 459.18
2020-02-05 12:15:06,807 epoch 18 - iter 52/130 - loss 0.02393810 - samples/sec: 461.73
2020-02-05 12:15:07,703 epoch 18 - iter 65/130 - loss 0.02902818 - samples/sec: 523.29
2020-02-05 12:15:08,575 epoch 18 - iter 78/130 - loss 0.02703849 - samples/sec: 538.88
2020-02-05 12:15:09,476 epoch 18 - iter 91/130 - loss 0.03068408 - samples/sec: 520.67
2020-02-05 12:15:10,387 epoch 18 - iter 104/130 - los

2020-02-05 12:16:04,475 epoch 24 - iter 0/130 - loss 0.01913757 - samples/sec: 6933.64
2020-02-05 12:16:05,365 epoch 24 - iter 13/130 - loss 0.04099698 - samples/sec: 524.62
2020-02-05 12:16:06,255 epoch 24 - iter 26/130 - loss 0.03547525 - samples/sec: 525.27
2020-02-05 12:16:07,222 epoch 24 - iter 39/130 - loss 0.02913978 - samples/sec: 478.18
2020-02-05 12:16:08,188 epoch 24 - iter 52/130 - loss 0.02988509 - samples/sec: 479.83
2020-02-05 12:16:09,052 epoch 24 - iter 65/130 - loss 0.02770262 - samples/sec: 544.52
2020-02-05 12:16:10,048 epoch 24 - iter 78/130 - loss 0.03079952 - samples/sec: 464.30
2020-02-05 12:16:10,945 epoch 24 - iter 91/130 - loss 0.02906139 - samples/sec: 521.98
2020-02-05 12:16:11,815 epoch 24 - iter 104/130 - loss 0.02931496 - samples/sec: 540.28
2020-02-05 12:16:12,718 epoch 24 - iter 117/130 - loss 0.03222856 - samples/sec: 516.79
2020-02-05 12:16:13,597 ----------------------------------------------------------------------------------------------------
202

2020-02-05 12:17:09,876 epoch 30 - iter 52/130 - loss 0.03205360 - samples/sec: 366.86
2020-02-05 12:17:10,835 epoch 30 - iter 65/130 - loss 0.03539614 - samples/sec: 482.62
2020-02-05 12:17:11,819 epoch 30 - iter 78/130 - loss 0.03661937 - samples/sec: 490.01
2020-02-05 12:17:12,735 epoch 30 - iter 91/130 - loss 0.03641513 - samples/sec: 510.45
2020-02-05 12:17:13,655 epoch 30 - iter 104/130 - loss 0.03407164 - samples/sec: 505.49
2020-02-05 12:17:14,637 epoch 30 - iter 117/130 - loss 0.03281042 - samples/sec: 472.21
2020-02-05 12:17:15,430 ----------------------------------------------------------------------------------------------------
2020-02-05 12:17:15,432 EPOCH 30 done: loss 0.0310 - lr 0.0016
2020-02-05 12:17:16,249 DEV : loss 0.04347774758934975 - score 0.9826
Epoch    29: reducing learning rate of group 0 to 7.8125e-04.
2020-02-05 12:17:16,270 BAD EPOCHS (no improvement): 4
2020-02-05 12:17:16,272 -----------------------------------------------------------------------------

2020-02-05 12:18:14,507 epoch 36 - iter 91/130 - loss 0.02831085 - samples/sec: 516.15
2020-02-05 12:18:15,435 epoch 36 - iter 104/130 - loss 0.02845038 - samples/sec: 504.87
2020-02-05 12:18:16,405 epoch 36 - iter 117/130 - loss 0.03000542 - samples/sec: 479.28
2020-02-05 12:18:17,228 ----------------------------------------------------------------------------------------------------
2020-02-05 12:18:17,229 EPOCH 36 done: loss 0.0308 - lr 0.0004
2020-02-05 12:18:18,049 DEV : loss 0.04392469674348831 - score 0.9826
2020-02-05 12:18:18,073 BAD EPOCHS (no improvement): 2
2020-02-05 12:18:18,075 ----------------------------------------------------------------------------------------------------
2020-02-05 12:18:18,192 epoch 37 - iter 0/130 - loss 0.03444128 - samples/sec: 3617.59
2020-02-05 12:18:19,161 epoch 37 - iter 13/130 - loss 0.03070795 - samples/sec: 478.73
2020-02-05 12:18:20,080 epoch 37 - iter 26/130 - loss 0.02965951 - samples/sec: 507.95
2020-02-05 12:18:20,991 epoch 37 - ite

2020-02-05 12:19:19,650 EPOCH 42 done: loss 0.0300 - lr 0.0002
2020-02-05 12:19:20,468 DEV : loss 0.04379584640264511 - score 0.9826
Epoch    41: reducing learning rate of group 0 to 9.7656e-05.
2020-02-05 12:19:20,488 BAD EPOCHS (no improvement): 4
2020-02-05 12:19:20,490 ----------------------------------------------------------------------------------------------------
2020-02-05 12:19:20,491 ----------------------------------------------------------------------------------------------------
2020-02-05 12:19:20,492 learning rate too small - quitting training!
2020-02-05 12:19:20,493 ----------------------------------------------------------------------------------------------------
2020-02-05 12:19:24,579 ----------------------------------------------------------------------------------------------------
2020-02-05 12:19:24,582 Testing using best model ...
2020-02-05 12:19:24,583 loading file best-model.pt


  result = unpickler.load()


2020-02-05 12:19:27,941 0.9961	0.9961	0.9961
2020-02-05 12:19:27,943 
MICRO_AVG: acc 0.9923 - f1-score 0.9961
MACRO_AVG: acc 0.9809 - f1-score 0.9903
ham        tp: 458 - fp: 1 - fn: 1 - tn: 57 - precision: 0.9978 - recall: 0.9978 - accuracy: 0.9957 - f1-score: 0.9978
spam       tp: 57 - fp: 1 - fn: 1 - tn: 458 - precision: 0.9828 - recall: 0.9828 - accuracy: 0.9661 - f1-score: 0.9828
2020-02-05 12:19:27,944 ----------------------------------------------------------------------------------------------------


{'test_score': 0.9961,
 'dev_score_history': [0.9865,
  0.9884,
  0.9826,
  0.9845,
  0.9865,
  0.9845,
  0.9865,
  0.9865,
  0.9865,
  0.9845,
  0.9845,
  0.9865,
  0.9865,
  0.9826,
  0.9807,
  0.9807,
  0.9845,
  0.9865,
  0.9826,
  0.9865,
  0.9826,
  0.9826,
  0.9826,
  0.9845,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826,
  0.9826],
 'train_loss_history': [0.15148688282531042,
  0.0848426594470556,
  0.07535099587761439,
  0.07040915398930128,
  0.06383458686849246,
  0.059092580570051303,
  0.049271565632751355,
  0.04867712014283125,
  0.04706459092692687,
  0.04353121120769244,
  0.04010398012514298,
  0.03722284267871426,
  0.036603732765294036,
  0.03572221935655062,
  0.033504460537089754,
  0.03420523538325842,
  0.032365413540257855,
  0.03432311093291411,
  0.03301021218156586,
  0.032221247313114314,
  0.03131773202465131,
  0.03045689035923435,
  