In [2]:
import numpy as np
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, LSTM
import pandas as pd

In [None]:
df = pd.read_csv('/content/allMessages.csv', sep=';')
df = df.drop('Unnamed: 0', axis=1)
df = df.sample(frac=1)
df.head(10)

In [4]:
idx = int(0.8 * len(df))

train_texts = df.MESSAGE.to_list()
train_labels = df.CATEGORY_ID.to_list()

maxlen = 15 # shortening a messages to 15 words
num_words = 10000 # 10000 most frequent words
embedding_dim = 25 # embedding dimension

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)
sequences = tokenizer.texts_to_sequences(train_texts)

train_data = pad_sequences(sequences, maxlen=maxlen)
train_labels = np.asarray(train_labels)

X_train = train_data[:idx]
y_train = train_labels[:idx]
X_val = train_data[idx:]
y_val = train_labels[idx:]

In [5]:
model = Sequential()
model.add(Embedding(num_words, embedding_dim, input_length=maxlen))
model.add(LSTM(16))
# model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, batch_size=32, epochs=3, validation_data=(X_val, y_val))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [6]:
def plot_hist(history):
  import pandas as pd
  import plotly.graph_objects as go
  
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  fig = go.Figure()
  fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['accuracy'], name='accuracy', mode='markers+lines'))
  fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_accuracy'], name='val_accuracy', mode='markers+lines'))
  fig.update_layout(width=1000, height=500, title='accuracy vs val accuracy', xaxis_title='Epoki', yaxis_title='accuracy', yaxis_type='log')
  fig.show()

  fig = go.Figure()
  fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['loss'], name='loss', mode='markers+lines'))
  fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_loss'], name='val_loss', mode='markers+lines'))
  fig.update_layout(width=1000, height=500, title='loss vs val_loss', xaxis_title='Epoki', yaxis_title='loss', yaxis_type='log')
  fig.show()

plot_hist(history)

In [22]:
message_content = 'ham is a way better than anyone have a stream'

word_index = []
for word in message_content.split(' '):
  if not word in tokenizer.word_index.keys():
    tokenizer.word_index[word] = len(tokenizer.word_index) + 1

  word_index.append(tokenizer.word_index[word])

X = pad_sequences(sequences=[word_index], maxlen=15, padding='post')
print(model.predict(X))

[[0.00033335]]


In [13]:
word_index

[260, 2, 267, 1625, 282, 55]

In [25]:
type(tokenizer.word_index)

dict

In [28]:
import json

with open('json_data.json', 'w') as outfile:
    json.dump(tokenizer.word_index, outfile)

In [30]:
with open('json_data.json') as json_file:
    data = json.load(json_file)
    print(type(data))

<class 'dict'>


In [31]:
model.save('s.h5')