In [175]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pickle

In [176]:
path = "./data/pii_labeled.csv"

df = pd.read_csv(path)

In [177]:
df.head()

Unnamed: 0,text,label
0,122-65-6616,1
1,EAC 492,1
2,dhoti,0
3,IHV-767,1
4,chest,0


In [178]:
data = df.to_numpy()
np.random.shuffle(data)

separation_rate = 0.6
separation_index = int(data.shape[0] * separation_rate)

train_data = data[:separation_index]
test_data = data[separation_index:]

x_train = train_data[:, 0]
y_train = train_data[:, 1].astype(np.float64)

x_test = test_data[:, 0]
y_test = test_data[:, 1].astype(np.float64)

In [179]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((90,), (90,), (60,), (60,))

In [180]:
tokenizer = Tokenizer(num_words=1200, oov_token="<OOV>")

def set_tokenizer(x):
  tokenizer.fit_on_texts(x)

def to_tokens(x):
  sequences = tokenizer.texts_to_sequences(x)
  return pad_sequences(sequences, padding='post', maxlen=10).astype(np.float64)

In [181]:
model = Sequential([
    Dense(24, activation='relu'),
    Dense(12, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [182]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [183]:
set_tokenizer(x_train)
model.fit(to_tokens(x_train), y_train, epochs=100, batch_size=32)

Epoch 1/100


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3357 - loss: 3.0010  
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3731 - loss: 2.6173 
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7271 - loss: 2.1979 
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7622 - loss: 1.2304 
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7544 - loss: 2.2540 
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7505 - loss: 2.0490 
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7818 - loss: 1.0385 
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7661 - loss: 1.1766 
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x28deb2b70>

In [184]:
set_tokenizer(x_test)
model.evaluate(to_tokens(x_test), y_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9132 - loss: 0.3117 


[0.30364787578582764, 0.9166666865348816]

In [185]:
# save model
model.save('./ml/pii_model.h5')

# save tokenizer
with open('./ml/tokenizer.pickle', 'wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

