In [1]:
import gensim.downloader as api
import numpy as np
import os
import tensorflow as tf

In [2]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# downloading data

In [4]:
url_address="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

In [5]:
file_load=tf.keras.utils.get_file(url_address.split('/')[-1],url_address,extract=True,cache_dir='.')

In [6]:
os.chdir(os.path.join(os.getcwd(),"datasets"))

In [7]:
os.getcwd()

'C:\\Users\\Piotrek\\TF2.0\\datasets'

In [8]:
os.listdir()

['readme', 'SMSSpamCollection', 'smsspamcollection.zip']

In [9]:
with open('SMSSpamCollection','r') as file:
    head=[next(file) for x in range(10)]
    file.close()

In [10]:
print(head)

['ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'ham\tOk lar... Joking wif u oni...\n', "spam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n", 'ham\tU dun say so early hor... U c already then say...\n', "ham\tNah I don't think he goes to usf, he lives around here though\n", "spam\tFreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv\n", 'ham\tEven my brother is not like to speak with me. They treat me like aids patent.\n', "ham\tAs per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\n", 'spam\tWINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 09061

In [11]:
# parsing data

In [12]:
head[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n'

In [13]:
lab0,text0=head[0].strip().split('\t')

In [14]:
print(f"label is : {lab0}, content is : {text0}")

label is : ham, content is : Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [15]:
labels,content=[],[]
with open('SMSSpamCollection','r') as file:
    for line in file:
        label,text=line.strip().split('\t')
        labels.append(1 if label=='spam' else 0)
        content.append(text)
    file.close()

In [16]:
for x,y in list(zip(labels,content))[:3]:
    print(x,y,'\n')

0 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... 

0 Ok lar... Joking wif u oni... 

1 Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's 



In [17]:
len(labels)

5574

In [18]:
# creating tokens for each unique word in text

In [19]:
tokenizer=tf.keras.preprocessing.text.Tokenizer()

In [20]:
tokenizer.fit_on_texts(content)

In [21]:
# converting text into a numerical representation

In [22]:
text_sequences=tokenizer.texts_to_sequences(content)

In [23]:
print(text_sequences[:4])

[[49, 472, 4436, 843, 756, 659, 64, 8, 1328, 87, 123, 352, 1329, 148, 2996, 1330, 67, 58, 4437, 144], [46, 337, 1500, 473, 6, 1941], [47, 490, 8, 19, 4, 798, 902, 2, 176, 1942, 1106, 660, 1943, 2331, 261, 2332, 71, 1942, 2, 1944, 2, 338, 490, 556, 961, 73, 392, 174, 661, 393, 2997], [6, 248, 150, 23, 383, 2998, 6, 139, 154, 57, 150]]


In [24]:
# padding sequences

In [25]:
text_sequences_padding=tf.keras.preprocessing.sequence.pad_sequences(text_sequences)

In [26]:
print(text_sequences_padding[:4])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0   49  472 4436  843  756  659   64    8 1328   87  123  352 1329
   148 2996 1330   67   58 4437  144]
 [   0    0    0    0    0

In [27]:
# creating labels; it is a classification task, so there are 2 different possible outputs

In [28]:
len(set(labels))

2

In [29]:
cat_labels=tf.keras.utils.to_categorical(labels,num_classes=len(set(labels)))

In [31]:
cat_labels[:5]

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [38]:
# creating inverse dictionary from relation: word->index to index->word

In [36]:
word2idx=tokenizer.word_index
print(list(word2idx.items())[:10])

[('i', 1), ('to', 2), ('you', 3), ('a', 4), ('the', 5), ('u', 6), ('and', 7), ('in', 8), ('is', 9), ('me', 10)]


In [37]:
idx2word={v:k for k,v in word2idx.items()}

In [39]:
word2idx['PAD']=0
idx2word[0]='PAD'

In [40]:
vocab_size=len(word2idx)
vocab_size

9013

In [41]:
# creating Dataset object

In [42]:
dataset=tf.data.Dataset.from_tensor_slices((text_sequences_padding,cat_labels))

In [45]:
# random shuffle

In [43]:
dataset=dataset.shuffle(10000)

In [46]:
# splitting data into train/test/val

In [47]:
num_of_records=len(text_sequences_padding)
num_of_records

5574

In [54]:
test_size=num_of_records // 4
val_size=(num_of_records-test_size) // 10

In [55]:
test_dataset=dataset.take(test_size)
val_dataset=dataset.skip(test_size).take(val_size)
train_dataset=dataset.skip(test_size+val_size)

In [58]:
batch_size=128
test_dataset=test_dataset.batch(batch_size,drop_remainder=True)
val_dataset=val_dataset.batch(batch_size,drop_remainder=True)
train_dataset=train_dataset.batch(batch_size,drop_remainder=True)

In [59]:
# building embedding matrix

In [61]:
import gensim.downloader as api

In [67]:
api.info("glove-wiki-gigaword-300")

{'num_records': 400000,
 'file_size': 394362229,
 'base_dataset': 'Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/glove-wiki-gigaword-300/__init__.py',
 'license': 'http://opendatacommons.org/licenses/pddl/',
 'parameters': {'dimension': 300},
 'description': 'Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).',
 'preprocessing': 'Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i <fname> -o glove-wiki-gigaword-300.txt`.',
 'read_more': ['https://nlp.stanford.edu/projects/glove/',
  'https://nlp.stanford.edu/pubs/glove.pdf'],
 'checksum': '29e9329ac2241937d55b852e8284e89b',
 'file_name': 'glove-wiki-gigaword-300.gz',
 'parts': 1}

In [71]:
def build_embedding_matrix(seq,word2idx,emb_dim,emb_file):
    if os.path.exists(emb_file):
        E=np.load(emb_file)
    else:
        vocab_size=len(word2idx)
        E=np.zeros((vocab_size,emb_dim))
        word_vectors=api.load(emb_model)
        for word,idx in word2idx.items():
            try:
                E[idx]=word_vectors.word_vec(word)
            except KeyError:
                pass
        np.save(emb_file,E)
    return E

In [69]:
emb_dim=300
data_dir="data"
emb_file=os.path.join(data_dir,"E.npy")
emb_model="glove-wiki-gigaword-300"

In [75]:
E=build_embedding_matrix(text_sequences_padding,word2idx,emb_dim,emb_file)

In [77]:
# comparing shapes

In [78]:
E.shape == (len(word2idx),emb_dim)

True

In [79]:
E.shape

(9013, 300)

In [80]:
# defining spam classifier with 1-dimensional convolutional neural network

In [92]:
class SpamClassifierModel(tf.keras.Model):
    
    def __init__(self,vocab_sz,embed_sz,input_length,num_filters,kernel_sz,output_sz,run_mode,embedding_weights,**kwargs):
        
        super(SpamClassifierModel, self).__init__(**kwargs)
        # in case of learning embeddings from scratch
        if run_mode=="scratch":
            self.embedding=tf.keras.layers.Embedding(vocab_sz
            ,embed_sz,input_length=input_length,trainable=True)
        # in case of transfer learning
        elif run_mode=="vectorizer":
            self.embedding=tf.keras.layers.Embedding(vocab_sz
            ,embed_sz,input_length=input_length,weights=[embedding_weights],trainable=False)
        # in case of fine-tuning parameters
        else:
            self.embedding=tf.keras.layers.Embedding(vocab_sz
            ,embed_sz,input_length=input_length,weights=[embedding_weights],trainable=True)
            
        self.conv=tf.keras.layers.Conv1D(filters=num_filters,kernel_size=kernel_sz,activation="relu")
        self.dropout=tf.keras.layers.SpatialDropout1D(.2)
        self.pool=tf.keras.layers.GlobalMaxPooling1D()
        self.dense=tf.keras.layers.Dense(output_sz,activation="softmax")

    def call(self,X):
        
        X=self.embedding(X)
        X=self.conv(X)
        X=self.dropout(X)
        X=self.pool(X)
        X=self.dense(X)
        return X

In [106]:
# defining hyperparameters, option: learning embeddings from scratch

In [93]:
conv_num_filters=256
conv_kernel_size=3
max_seq_len=text_sequences_padding.shape[1]
model=SpamClassifierModel(vocab_size,emb_dim,max_seq_len,conv_num_filters,conv_kernel_size
                         ,len(set(labels)),"vectorizer",E)

In [94]:
model.build(input_shape=(None,max_seq_len))

In [95]:
model.summary()

Model: "spam_classifier_model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      multiple                  2703900   
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  230656    
_________________________________________________________________
spatial_dropout1d_3 (Spatial multiple                  0         
_________________________________________________________________
global_max_pooling1d_3 (Glob multiple                  0         
_________________________________________________________________
dense_3 (Dense)              multiple                  514       
Total params: 2,935,070
Trainable params: 231,170
Non-trainable params: 2,703,900
_________________________________________________________________


In [96]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [104]:
# because number of documents described as spam is over 6 times higher than others, there is a need to
# fix weights accurately by multiplying class weights with value

In [102]:
cat_labels.sum(axis=0)

array([4827.,  747.], dtype=float32)

In [105]:
val=cat_labels.sum(axis=0)[0] / cat_labels.sum(axis=0)[1] 
val

6.4618473

In [107]:
num_epochs=3
class_weights={0:1,1:val}

In [108]:
# training model

In [110]:
r=model.fit(train_dataset,epochs=num_epochs,validation_data=val_dataset,class_weight=class_weights)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [115]:
r.history['accuracy']

[0.99595904, 0.9978448, 0.99838364]

In [116]:
# evaluating model

In [117]:
labels,predictions=[],[]
for Xtest,Ytest in test_dataset:
    Ytest_=model.predict_on_batch(Xtest)
    ytest=np.argmax(Ytest,axis=1)
    ytest_=np.argmax(Ytest_,axis=1)
    labels.extend(ytest.tolist())
    predictions.extend(ytest_.tolist())

In [118]:
print("test accuracy: {:.3f}".format(accuracy_score(labels,predictions)))
print("confusion matrix")
print(confusion_matrix(labels,predictions))

test accuracy: 0.997
confusion matrix
[[1115    0]
 [   4  161]]
