In [1]:
import matplotlib as plt
import os, re, shutil, string
import tensorflow as tf
import pandas as pd
import numpy as np

from tensorflow.keras import losses
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.models import load_model
from tensorflow.keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dropout
from keras import regularizers
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

In [42]:
tf.__version__

'2.16.1'

In [43]:
train_dir = 'train/animals.csv'

In [44]:
words_train = pd.read_csv('train/animals.csv')
words_train = words_train.sample(frac=1, random_state=60).reset_index(drop=True)
words_train

Unnamed: 0,name,animal/not
0,Chameleon,animal
1,Suede,not animal
2,Gazelle,animal
3,Soil,not animal
4,Nitrogen,not animal
...,...,...
210,Desk,not animal
211,Shale,not animal
212,Crow,animal
213,Cockroach,animal


In [45]:
words_feature = words_train.copy()
words_feature.columns

Index(['name', 'animal/not'], dtype='object')

In [46]:
word_label = words_feature.pop('animal/not')
word_label = word_label.str.lstrip()


In [47]:
words_feature = words_feature.map(lambda s: s.lower())
words_feature

Unnamed: 0,name
0,chameleon
1,suede
2,gazelle
3,soil
4,nitrogen
...,...
210,desk
211,shale
212,crow
213,cockroach


In [48]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words_feature['name'])
tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x17a620f9490>

In [49]:
word_label.unique()

array(['animal', 'not animal'], dtype=object)

In [50]:
words_label_encoded = pd.get_dummies(word_label)
words_label_encoded = words_label_encoded.astype(int)
del words_label_encoded['not animal']
words_label_encoded_np = words_label_encoded.values
words_label_encoded_np

array([[1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
    

In [51]:

sequences = tokenizer.texts_to_sequences(words_feature['name'])
sequences_padded = pad_sequences(sequences)
sequences_padded = np.array(sequences_padded)

In [52]:
sequences_padded

array([[  6],
       [  7],
       [  8],
       [  9],
       [ 10],
       [ 11],
       [ 12],
       [ 13],
       [ 14],
       [ 15],
       [ 16],
       [ 17],
       [ 18],
       [ 19],
       [ 20],
       [ 21],
       [ 22],
       [ 23],
       [ 24],
       [ 25],
       [  1],
       [ 26],
       [ 27],
       [ 28],
       [ 29],
       [ 30],
       [ 31],
       [ 32],
       [ 33],
       [ 34],
       [ 35],
       [ 36],
       [ 37],
       [ 38],
       [  2],
       [ 39],
       [ 40],
       [ 41],
       [ 42],
       [  1],
       [ 43],
       [  3],
       [ 44],
       [ 45],
       [ 46],
       [ 47],
       [ 48],
       [ 49],
       [ 50],
       [ 51],
       [ 52],
       [ 53],
       [ 54],
       [ 55],
       [ 56],
       [ 57],
       [ 58],
       [  2],
       [ 59],
       [ 60],
       [ 61],
       [ 62],
       [ 63],
       [ 64],
       [ 65],
       [ 66],
       [  4],
       [ 67],
       [ 68],
       [ 69],
       [  3],
      

In [53]:
x_train, x_test, y_train, y_test = train_test_split(sequences_padded, words_label_encoded_np, test_size=0.1, random_state=50)

x_train.shape

(193, 1)

In [54]:
y_train.shape

(193, 1)

In [55]:
x_test.shape


(22, 1)

In [68]:

if not os.path.exists('classifier.keras'):
    model = Sequential()
    model.add(layers.Embedding(193, 16))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [73]:
if os.path.exists('classifier.keras'):
    model = load_model('classifier.keras', compile=False)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    

In [74]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model.fit(x_train, y_train, epochs=200, validation_data=(x_test, y_test), callbacks=early_stopping)


Epoch 1/200


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9784 - loss: 0.6299 - val_accuracy: 0.5000 - val_loss: 0.6953
Epoch 2/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9685 - loss: 0.6272 - val_accuracy: 0.5000 - val_loss: 0.6949
Epoch 3/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9709 - loss: 0.6178 - val_accuracy: 0.4545 - val_loss: 0.6947
Epoch 4/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9737 - loss: 0.6114 - val_accuracy: 0.4545 - val_loss: 0.6947
Epoch 5/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9688 - loss: 0.6036 - val_accuracy: 0.4091 - val_loss: 0.6946
Epoch 6/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9680 - loss: 0.5976 - val_accuracy: 0.4091 - val_loss: 0.6944
Epoch 7/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x17a6219a4b0>

In [77]:
model.save('classifier.keras')

In [79]:
loss = model.evaluate(x_train, y_train)
loss

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9812 - loss: 0.3960 


[0.4012526571750641, 0.9689119458198547]