In [None]:
# Install the required packages and import the libraries used by this notebook.
try:
  # Install the nightly TensorFlow build (shell escape) before importing it.
  !pip install tf-nightly
except Exception:
  # Any exception raised by the install step is ignored so the cell keeps running.
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
# tensorflow-datasets is installed right before its import below.
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

# Show which TensorFlow version was actually loaded.
print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

--2021-05-11 20:40:42--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2021-05-11 20:40:43 (1.77 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2021-05-11 20:40:43--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2021-05-11 20:40:43 (2.28 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [None]:
# Local file names of the training and validation TSV files.
train_file_path, test_file_path = "train-data.tsv", "valid-data.tsv"

In [None]:
# Read both tab-separated files with explicit column names: label first,
# message text second.
train_df, test_df = [
    pd.read_csv(path, sep='\t', names=['target', 'message'])
    for path in (train_file_path, test_file_path)
]

In [None]:
# Preview the first rows of the training frame to check the parsed columns.
train_df.head()

Unnamed: 0,target,message
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...


In [None]:
# Word-level vocabulary over the training and validation messages.
# Each message is split with text_to_word_sequence, which uses the same default
# rules as the Keras Tokenizer (lower-casing, punctuation filtered, split on
# spaces).  Taking set() of the joined text instead would collect single
# characters, making len(vocab) far too small to serve as a word count for the
# word-level tokenizer and the embedding layer.
vocab = sorted({
    word
    for message in train_df.message.values.tolist() + test_df.message.values.tolist()
    for word in keras.preprocessing.text.text_to_word_sequence(message)
})

In [None]:
# MAXLEN: sequence length used when padding messages.
# VOCAB_SIZE: number of entries in `vocab`; passed to the tokenizer and the
# embedding layer.
MAXLEN, VOCAB_SIZE = 150, len(vocab)

In [None]:
# Word-level tokenizer capped by num_words=VOCAB_SIZE; its word index is
# learned from the training messages only.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=False, num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_df["message"])

In [None]:
# Number of distinct words the tokenizer found in the training messages.
len(tokenizer.word_index)

7692

In [None]:
# Encode every message of both frames as a list of integer word indices
# (training messages first, then validation messages).
train_sequences, test_sequences = map(
    tokenizer.texts_to_sequences, (train_df.message, test_df.message)
)

In [None]:
# Binary targets: 0 for "ham", 1 for every other value in the target column.
train_labels = (train_df.target != "ham").to_numpy().astype(np.int64)
test_labels = (test_df.target != "ham").to_numpy().astype(np.int64)

In [None]:
# Bring both sets of sequences to a common length of MAXLEN so they form
# fixed-width integer matrices.
train_padded, test_padded = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAXLEN)
    for sequences in (train_sequences, test_sequences)
)

In [None]:
# Sanity check: display the shapes of both padded matrices.
train_padded.shape, test_padded.shape

((4179, 150), (1392, 150))

In [None]:
# Embedding -> LSTM -> single sigmoid unit, assembled layer by layer.
model = keras.Sequential()
model.add(keras.layers.Embedding(VOCAB_SIZE, 32, input_length=MAXLEN))
model.add(keras.layers.LSTM(32))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 32)           2784      
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 11,137
Trainable params: 11,137
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy ('acc') is tracked.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Train for 15 epochs, holding out 20% of the training data for validation.
history = model.fit(
    x=train_padded,
    y=train_labels,
    epochs=15,
    validation_split=0.2,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Evaluate on the data loaded from the validation file; the result lists the
# loss followed by the metric passed to compile().
model.evaluate(test_padded, test_labels)



[0.11905580013990402, 0.9583333134651184]

In [None]:
# Inspect how a sample message is encoded; words the tokenizer does not keep
# are dropped from the resulting sequence.
tokenizer.texts_to_sequences(["how are you doing today?"])

[[49, 22, 3]]

In [None]:
def predict_message(pred_text):
  """Classify a single SMS message as ham or spam.

  Args:
    pred_text: The message text to classify.

  Returns:
    A two-element list: the model's output score for the message, followed by
    "ham" when that score is below 0.5 and "spam" otherwise.
  """
  # Encode and pad the message the same way the training data was prepared.
  encoded = tokenizer.texts_to_sequences([pred_text])
  padded = keras.preprocessing.sequence.pad_sequences(encoded, maxlen=MAXLEN)

  scores = model.predict(padded)

  # One message goes in, so the comparison is on the single score returned.
  label = "ham" if scores < 0.5 else "spam"
  return [scores.flatten()[0], label]

In [None]:
pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[0.007132381, 'ham']


In [None]:
def test_predictions():
  """Run predict_message on a fixed set of messages and print whether every label matches."""
  cases = [
      ("how are you doing today", "ham"),
      ("sale today! to stop texts call 98912460324", "spam"),
      ("i dont want to go. can we try it a different day? available sat", "ham"),
      ("our new mobile video service is live. just install on your phone to start watching.", "spam"),
      ("you have won £1000 cash! call to claim your prize.", "spam"),
      ("i'll bring it tomorrow. don't forget the milk.", "ham"),
      ("wow, is your arm alright. that happened to me one time too", "ham"),
  ]

  # Built as a list (not a lazy generator) so every message is predicted even
  # after an earlier mismatch.
  outcomes = [predict_message(text)[1] == expected for text, expected in cases]

  if all(outcomes):
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

# Run the self-check defined above and print its verdict.
test_predictions()

You passed the challenge. Great job!
