<a href="https://colab.research.google.com/github/onism/MyLearning/blob/master/NLP_basic_tf2_0_sms_spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import os 
import io

tf.__version__

'2.2.0'

In [2]:
path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip",
                  origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",
                  extract=True)

!unzip $path_to_zip -d data

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Archive:  /root/.keras/datasets/smsspamcollection.zip
  inflating: data/SMSSpamCollection  
  inflating: data/readme             


In [3]:
lines = io.open('data/SMSSpamCollection').read().strip().split('\n')
lines[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [4]:
# pre-process data

spam_dataset = []
count = 0
for line in lines:
    label, text = line.split('\t')
    if label.lower().strip() == 'spam':
        spam_dataset.append((1, text.strip()))
    else:
        spam_dataset.append((0, text.strip()))

print(spam_dataset[0])

(0, 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')


In [5]:
#data normalization

import pandas as pd 
df = pd.DataFrame(spam_dataset, columns=['Spam', 'Message'])
df.head()

Unnamed: 0,Spam,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
import re 

def message_length(x):
    return len(x)

def num_captitals(x):
    _, count = re.subn(r'[A-Z]','',x)
    return count 

def num_punctuation(x):
    _, count = re.subn(r'\W', '', x)
    return count 

df['Capitals'] = df['Message'].apply(num_captitals)
df['Punctuation'] = df['Message'].apply(num_punctuation)
df['Length'] = df['Message'].apply(message_length)

df.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488
std,0.340699,11.683233,14.825994,59.841746
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [7]:
train = df.sample(frac=0.8, random_state=42)
test = df.drop(train.index)

In [8]:
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439
std,0.339359,11.405424,14.602023,59.346407
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,35.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [9]:
test.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,1115.0,1115.0,1115.0,1115.0
mean,0.139013,6.030493,19.166816,80.95157
std,0.346116,12.731059,15.694599,61.807655
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,28.0,123.0
max,1.0,127.0,195.0,790.0


In [10]:
# model building 

def make_model(input_dims=3, num_units=12):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(num_units, input_dim=input_dims, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

x_train = train[['Length', 'Punctuation', 'Capitals']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals']]
y_test = test[['Spam']]

x_train

Unnamed: 0,Length,Punctuation,Capitals
3690,25,4,1
3527,161,48,107
724,40,7,1
3370,69,17,3
468,37,8,1
...,...,...,...
3280,444,114,44
3186,65,14,50
3953,81,23,2
2768,38,8,2


In [11]:
type(x_train)

pandas.core.frame.DataFrame

In [12]:
model = make_model()

model.fit(x_train, y_train, epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3a3e392fd0>

In [13]:
model.evaluate(x_test, y_test)



[0.2728753983974457, 0.8950672745704651]

In [14]:
y_train_pred = model.predict(x_train)

tf.math.confusion_matrix(tf.constant(y_train.Spam), y_train_pred)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[3867,    0],
       [ 592,    0]], dtype=int32)>

In [15]:
# tokenization and stop word remove

!pip install stanfordnlp

Collecting stanfordnlp
[?25l  Downloading https://files.pythonhosted.org/packages/41/bf/5d2898febb6e993fcccd90484cba3c46353658511a41430012e901824e94/stanfordnlp-0.2.0-py3-none-any.whl (158kB)
[K     |████████████████████████████████| 163kB 2.5MB/s 
Installing collected packages: stanfordnlp
Successfully installed stanfordnlp-0.2.0


In [16]:
import stanfordnlp as snlp

en = snlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
Y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: en_ewt
Download location: /root/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 235M/235M [00:23<00:00, 10.2MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


In [17]:
en = snlp.Pipeline(lang='en')

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand

In [18]:
sentence = 'Go until jurong point, crazy.. Available only in bugis n great world'
tokenized = en(sentence)



In [21]:
tokenized.sentences[0].tokens

[<Token index=1;words=[<Word index=1;text=Go;lemma=go;upos=VERB;xpos=VB;feats=Mood=Imp|VerbForm=Fin;governor=0;dependency_relation=root>]>,
 <Token index=2;words=[<Word index=2;text=until;lemma=until;upos=ADP;xpos=IN;feats=_;governor=4;dependency_relation=case>]>,
 <Token index=3;words=[<Word index=3;text=jurong;lemma=jurong;upos=PROPN;xpos=NNP;feats=Number=Sing;governor=4;dependency_relation=compound>]>,
 <Token index=4;words=[<Word index=4;text=point;lemma=point;upos=NOUN;xpos=NN;feats=Number=Sing;governor=1;dependency_relation=obl>]>,
 <Token index=5;words=[<Word index=5;text=,;lemma=,;upos=PUNCT;xpos=,;feats=_;governor=1;dependency_relation=punct>]>,
 <Token index=6;words=[<Word index=6;text=crazy;lemma=crazy;upos=ADJ;xpos=JJ;feats=Degree=Pos;governor=1;dependency_relation=parataxis>]>,
 <Token index=7;words=[<Word index=7;text=..;lemma=..;upos=PUNCT;xpos=.;feats=_;governor=1;dependency_relation=punct>]>]

In [22]:
for snt in tokenized.sentences:
    for word in snt.tokens:
        print(word.text)
    print('<End of Sentence>')

Go
until
jurong
point
,
crazy
..
<End of Sentence>
Available
only
in
bugis
n
great
world
<End of Sentence>


In [None]:
def word_counts(x, pipeline=en):
    doc = pipeline(x)
    count = sum(  [len(sentence.tokens) for sentence in doc.sentences]  )
    return count 

df['Words'] = df['Message'].apply(word_counts)

df.describe()



In [None]:
train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)

In [None]:
x_train = train[ ['Length', 'Punctuation', 'Capitals', 'Words']  ]
x_test = test[ ['Length', 'Punctuation', 'Capitals', 'Words'] ]
y_train = train['Spam']
y_test = test['Spam']

model = make_model(input_dims=4)

model.fit(x_train, y_train, epochs=10, batch_size=10)