In [1]:
import numpy as np

import tensorflow as tf


In [2]:
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras.layers import TextVectorization
import tensorflow_text as tf_text

In [3]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.config.list_physical_devices('GPU')

tf.test.is_gpu_available()

Num GPUs Available:  1
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Metal device set to: Apple M1 Max


True

In [4]:
import matplotlib as plt

def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

In [5]:
import json

dataset = json.load(open('dataset.json'))

In [6]:
dataset['names'][0]

'Ахметфаик Малянтович'

In [7]:
data = []
labels = []

In [8]:
data += dataset['names']
labels += ['fio'] * len(dataset['names'])

In [9]:
data += dataset['dates']
labels += ['date'] * len(dataset['dates'])

In [10]:
data += dataset['positions']
labels += ['position'] * len(dataset['positions'])

In [11]:
data += dataset['works']
labels += ['work'] * len(dataset['works'])

In [12]:
npdata = np.array(data)
nplabels = np.array(labels)

In [13]:
print(nplabels)

['fio' 'fio' 'fio' ... 'work' 'work' 'work']


In [14]:
import pandas as pd

In [44]:
df = pd.DataFrame(list(zip(data, labels)), columns=['text', 'label'])
df

Unnamed: 0,text,label
0,Ахметфаик Малянтович,fio
1,Дергачёв Вилора,fio
2,Москвилина Чжимин,fio
3,Тимургази Велиахмедова,fio
4,Ошманов Эдди,fio
...,...,...
3760,ООО Трест Рос Спец Энерго монтаж,work
3761,"""ФГУП """"Приборостроительный завод""""""",work
3762,Центральная научно-исследовательская лаборатория,work
3763,"""АО """"Опытное Конструкторское Бюро Машинострое...",work


In [46]:
df.to_csv('pzdc.csv', sep=',', index=False)


In [16]:
features_names = ['Text']
features = df[features_names]
features.head()

Unnamed: 0,Text
0,Ахметфаик Малянтович
1,Дергачёв Вилора
2,Москвилина Чжимин
3,Тимургази Велиахмедова
4,Ошманов Эдди


In [17]:
labels_names = ['Labels']
label = df[labels_names]
label.head()

Unnamed: 0,Labels
0,fio
1,fio
2,fio
3,fio
4,fio


In [18]:
tf.convert_to_tensor(features)

<tf.Tensor: shape=(3765, 1), dtype=string, numpy=
array([[b'\xd0\x90\xd1\x85\xd0\xbc\xd0\xb5\xd1\x82\xd1\x84\xd0\xb0\xd0\xb8\xd0\xba \xd0\x9c\xd0\xb0\xd0\xbb\xd1\x8f\xd0\xbd\xd1\x82\xd0\xbe\xd0\xb2\xd0\xb8\xd1\x87'],
       [b'\xd0\x94\xd0\xb5\xd1\x80\xd0\xb3\xd0\xb0\xd1\x87\xd1\x91\xd0\xb2 \xd0\x92\xd0\xb8\xd0\xbb\xd0\xbe\xd1\x80\xd0\xb0'],
       [b'\xd0\x9c\xd0\xbe\xd1\x81\xd0\xba\xd0\xb2\xd0\xb8\xd0\xbb\xd0\xb8\xd0\xbd\xd0\xb0 \xd0\xa7\xd0\xb6\xd0\xb8\xd0\xbc\xd0\xb8\xd0\xbd'],
       ...,
       [b'\xd0\xa6\xd0\xb5\xd0\xbd\xd1\x82\xd1\x80\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xbd\xd0\xb0\xd1\x8f \xd0\xbd\xd0\xb0\xd1\x83\xd1\x87\xd0\xbd\xd0\xbe-\xd0\xb8\xd1\x81\xd1\x81\xd0\xbb\xd0\xb5\xd0\xb4\xd0\xbe\xd0\xb2\xd0\xb0\xd1\x82\xd0\xb5\xd0\xbb\xd1\x8c\xd1\x81\xd0\xba\xd0\xb0\xd1\x8f \xd0\xbb\xd0\xb0\xd0\xb1\xd0\xbe\xd1\x80\xd0\xb0\xd1\x82\xd0\xbe\xd1\x80\xd0\xb8\xd1\x8f'],
       [b'"\xd0\x90\xd0\x9e ""\xd0\x9e\xd0\xbf\xd1\x8b\xd1\x82\xd0\xbd\xd0\xbe\xd0\xb5 \xd0\x9a\xd0\xbe\xd0\xbd\xd1\x81\xd1\

In [19]:
label

Unnamed: 0,Labels
0,fio
1,fio
2,fio
3,fio
4,fio
...,...
3760,work
3761,work
3762,work
3763,work


In [20]:
train_dataset = tf.data.Dataset.from_tensor_slices((features, label))
train_dataset

<_TensorSliceDataset element_spec=(TensorSpec(shape=(1,), dtype=tf.string, name=None), TensorSpec(shape=(1,), dtype=tf.string, name=None))>

In [21]:
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=True, shuffle_size=10000):
    assert (train_split + test_split + val_split) == 1
    
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        ds = ds.shuffle(shuffle_size, seed=12)
    
    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    
    train_ds = ds.take(train_size)    
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    
    return train_ds, val_ds, test_ds

In [22]:
tf.data.experimental.cardinality(train_dataset)

<tf.Tensor: shape=(), dtype=int64, numpy=3765>

In [23]:
train, val, test = get_dataset_partitions_tf(train_dataset, 3765)

In [29]:
VOCAB_SIZE = 1000

binary_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='binary')

In [30]:
train_text = train.map(lambda text, labels: text)
val_text = val.map(lambda text, labels: text)

with tf.device('/CPU:0'):
    binary_vectorize_layer.adapt(train_text)

In [31]:
def binary_vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return binary_vectorize_layer(text), label

In [32]:
text_batch, label_batch = next(iter(train))
print(text_batch, label_batch)

tf.Tensor([b'\xd0\x97\xd0\xb0\xd1\x85\xd0\xb0\xd1\x80\xd0\xbe\xd0\xb2\xd0\xb0 \xd0\x95\xd0\xbb\xd0\xb5\xd0\xbd\xd0\xb0'], shape=(1,), dtype=string) tf.Tensor([b'fio'], shape=(1,), dtype=string)


In [33]:
print(train)
first_question, first_label = text_batch[0], label_batch[0]
print("Question", first_question)
print("Label", first_label)

<_TakeDataset element_spec=(TensorSpec(shape=(1,), dtype=tf.string, name=None), TensorSpec(shape=(1,), dtype=tf.string, name=None))>
Question tf.Tensor(b'\xd0\x97\xd0\xb0\xd1\x85\xd0\xb0\xd1\x80\xd0\xbe\xd0\xb2\xd0\xb0 \xd0\x95\xd0\xbb\xd0\xb5\xd0\xbd\xd0\xb0', shape=(), dtype=string)
Label tf.Tensor(b'fio', shape=(), dtype=string)


In [34]:
print("'binary' vectorized question:",
      binary_vectorize_text(first_question, first_label)[0])

'binary' vectorized question: tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

In [36]:
binary_train_ds = train.map(binary_vectorize_text)
binary_val_ds = val.map(binary_vectorize_text)
binary_test_ds = test.map(binary_vectorize_text)

In [37]:
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
  return dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [38]:
binary_train_ds = configure_dataset(binary_train_ds)
binary_val_ds = configure_dataset(binary_val_ds)
binary_test_ds = configure_dataset(binary_test_ds)

In [39]:
binary_model = tf.keras.Sequential([layers.Dense(4)])

binary_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

history = binary_model.fit(
    binary_train_ds, validation_data=binary_val_ds, epochs=10)

Epoch 1/10


2023-05-20 16:13:54.604723: W tensorflow/core/framework/op_kernel.cc:1807] OP_REQUIRES failed at cast_op.cc:121 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node 'Cast_1' defined at (most recent call last):
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 725, in start
      self.io_loop.start()
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/w2/qgvkx7vs15s8f4q7md3zqvv00000gn/T/ipykernel_81698/2290416683.py", line 8, in <module>
      history = binary_model.fit(
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/engine/training.py", line 1055, in train_step
      return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/engine/training.py", line 1149, in compute_metrics
      self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/engine/compile_utils.py", line 605, in update_state
      metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/utils/metrics_utils.py", line 77, in decorated
      update_op = update_state_fn(*args, **kwargs)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/metrics/base_metric.py", line 140, in update_state_fn
      return ag_update_state(*args, **kwargs)
    File "/Users/skifry/mambaforge/envs/mlp/lib/python3.8/site-packages/keras/metrics/base_metric.py", line 676, in update_state
      y_true = tf.cast(y_true, self._dtype)
Node: 'Cast_1'
2 root error(s) found.
  (0) UNIMPLEMENTED:  Cast string to float is not supported
	 [[{{node Cast_1}}]]
  (1) CANCELLED:  Function was cancelled before it was started
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_25238]