In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pickle

In [3]:
path = "./data/pii_labeled.csv"

df = pd.read_csv(path)

In [4]:
df.head()

Unnamed: 0,text,label
0,miracidial,0
1,603-23-1560,1
2,fineleaf,0
3,pansied,0
4,coapprentice,0


In [5]:
data = df.to_numpy()
np.random.shuffle(data)

separation_rate = 0.6
separation_index = int(data.shape[0] * separation_rate)

train_data = data[:separation_index]
test_data = data[separation_index:]

x_train = train_data[:, 0]
y_train = train_data[:, 1].astype(np.float64)

x_test = test_data[:, 0]
y_test = test_data[:, 1].astype(np.float64)

In [6]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((660,), (660,), (440,), (440,))

In [7]:
tokenizer = Tokenizer(num_words=1200, oov_token="<OOV>")

def set_tokenizer(x):
  tokenizer.fit_on_texts(x)

def to_tokens(x):
  sequences = tokenizer.texts_to_sequences(x)
  return pad_sequences(sequences, padding='post', maxlen=10).astype(np.float64)

In [8]:
model = Sequential([
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [9]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [10]:
set_tokenizer(x_train)
model.fit(to_tokens(x_train), y_train, epochs=100, batch_size=32)

Epoch 1/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 586us/step - accuracy: 0.7011 - loss: 5.9719
Epoch 2/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 556us/step - accuracy: 0.8248 - loss: 2.3098
Epoch 3/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 583us/step - accuracy: 0.8905 - loss: 0.4933
Epoch 4/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 492us/step - accuracy: 0.9012 - loss: 0.3687
Epoch 5/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391us/step - accuracy: 0.9073 - loss: 0.3550
Epoch 6/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388us/step - accuracy: 0.8942 - loss: 0.3835
Epoch 7/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398us/step - accuracy: 0.9080 - loss: 0.3221
Epoch 8/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 895us/step - accuracy: 0.9187 - loss: 0.3216
Epoch 9/100
[1m21/21[0m [32m━

<keras.src.callbacks.history.History at 0x177379dc0>

In [11]:
set_tokenizer(x_test)
model.evaluate(to_tokens(x_test), y_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 448us/step - accuracy: 0.8921 - loss: 0.3782


[0.3782564699649811, 0.8931818008422852]

In [13]:
# save model
model.save('./ml/pii_model.h5')

# save tokenizer
with open('./ml/tokenizer.pickle', 'wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

