# Reconocedor de lenguaje de señas Argentino entrenado solo con el dataset argentino.

In [3]:
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input, decode_predictions
from tensorflow.keras.preprocessing import image
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model

import numpy as np
import matplotlib.pyplot as plt

Importamos los datasets y hacemos un split.
Comencemos por el dataset mas populado, el de lenguaje de señas americano, que utilizaremos para entrenar las capas intermedias.


In [4]:
from tensorflow.keras.preprocessing import image_dataset_from_directory

# Definimos los parametros
image_size = (299, 299)
batch_size = 32
asl_dir = "asl_dataset/"
train_val_seed = 42        # Es importante que sea la misma para ambos llamados

# Y creamos los conjuntos de entrenamiento y validacion. 
# Esto es medio raro, porque invocamos dos veces a image_dataset_from_directory para hacer el split,
# pero es la manera que indica la documentacion
asl_train_ds = image_dataset_from_directory(
    asl_dir,
    validation_split=0.2,
    subset="training",
    seed=train_val_seed, 
    image_size=image_size,
    batch_size=batch_size,
    label_mode='int'   # or 'categorical' if you want one-hot
)

asl_val_ds = image_dataset_from_directory(
    asl_dir,
    validation_split=0.2,
    subset="validation",
    seed=train_val_seed, 
    image_size=image_size,
    batch_size=batch_size,
    label_mode='int'
)

class_names = asl_train_ds.class_names

Found 2515 files belonging to 36 classes.
Using 2012 files for training.


2025-08-01 19:10:51.575782: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Found 2515 files belonging to 36 classes.
Using 503 files for validation.


In [5]:
print(class_names)

def count_elements(dataset):
    count = 0
    for batch in dataset:
        images, labels = batch
        count += images.shape[0]
    return count

print("Entradas en entrenamiento:", count_elements(asl_train_ds))
print("Entradas en validación:", count_elements(asl_val_ds))
print("Entradas en test:", count_elements(asl_test_ds))

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


2025-08-01 19:10:58.748535: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34329984 exceeds 10% of free system memory.
2025-08-01 19:10:58.767188: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34329984 exceeds 10% of free system memory.
2025-08-01 19:10:58.784858: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34329984 exceeds 10% of free system memory.
2025-08-01 19:10:58.812616: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34329984 exceeds 10% of free system memory.
2025-08-01 19:10:58.838555: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34329984 exceeds 10% of free system memory.
2025-08-01 19:11:00.128875: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Entradas en entrenamiento: 2012
Entradas en validación: 503


2025-08-01 19:11:00.533292: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


NameError: name 'asl_test_ds' is not defined

Procesamos ahora las imagenes para adecuarlas al formato de *InceptionV3*,

In [6]:
from tensorflow.keras.applications.inception_v3 import preprocess_input

def preprocess_img(image, label):
    image = preprocess_input(image) 
    return image, label

asl_train_ds = asl_train_ds.map(preprocess_img).prefetch(tf.data.AUTOTUNE)
asl_val_ds   = asl_val_ds.map(preprocess_img).prefetch(tf.data.AUTOTUNE)

Chequeemos que obtuvimos las clases correctas,

In [7]:
print(class_names)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


Continuemos con la carga del dataset de lenguaje de señas argentino que definira las clases sobre la que predecirá el modelo.

In [8]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Extraemos los nombres de todas las imagenes que vamos a utilizar
lsa_dir = 'lsa16_segmented/'
filenames = [f for f in os.listdir(lsa_dir)]

# Y de cada una extraemos su clase, que viene dada por el primer numero del nombre
labels = [int(f.split('_')[0]) - 1 for f in filenames]   # Le restamos 1 a los labels para que esten en rango [0, 16) en vez de [1, 16]

# Y creamos un dataframe que asocia a cada nombre de archivo su clase.
lsa_df = pd.DataFrame({'filename': filenames, 'class': labels})

Preprocesamos las imagenes para adecuarlas al formato de ImageNet

In [9]:
from sklearn.model_selection import train_test_split

# Separaramos el conjunto de test (20%)
lsa_temp_df, lsa_test_df = train_test_split(
    lsa_df,
    test_size=0.2,
    stratify=lsa_df['class'],
    random_state=42
)

# Y separamos lo restante en train y val (80% train -> 64% total, 20% val -> 16% total)
lsa_train_df, lsa_val_df = train_test_split(
    lsa_temp_df,
    test_size=0.2,  # 20% de 80% = 16% del total
    stratify=lsa_temp_df['class'],
    random_state=42
)

Creamos un *pipeline* de datos de *TensorFlow*. La idea es aprovechar la paralelización del *map* para procesar los datos mas rápido.

In [10]:
# Definimos una funcion que dado un filename devuelve su imagen y su clase o label
def load_and_preprocess(image_path, label):

    # Leemos el archivo y lo decodificamos en RGB
    img = tf.io.read_file(lsa_dir + image_path)
    img = tf.image.decode_jpeg(img, channels=3) 
    
    # Lo preprocesamos para InceptionV3
    img = tf.image.resize(img, [299, 299])
    img = preprocess_input(img)  # Obs. que preprocess_input es una funcion de inception_v3 en particular
    
    return img, label

# Usamos un batch_size de TensorFlow estandar
batch_size = 32

# 1. Cargamos el dataframe
lsa_ds = tf.data.Dataset.from_tensor_slices((lsa_train_df['filename'].values, lsa_train_df['class'].values))

# 2. Le mappeamos el preprocesamiento a cada entrada, paralelizando
lsa_ds = lsa_ds.map(load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE)

# 3. Mezclamos para randomizar el orden de las muestras
lsa_ds = lsa_ds.shuffle(buffer_size=len(lsa_train_df))

# 4. Usamos el batch_size estandar
lsa_ds = lsa_ds.batch(batch_size)

# 5. Permitimos el prefetching del proximo batch
lsa_train_ds = lsa_ds.prefetch(tf.data.AUTOTUNE)                                                       

# Y repetimos lo mismo para el conjunto de validacion
lsa_val_ds = tf.data.Dataset.from_tensor_slices((lsa_val_df['filename'].values, lsa_val_df['class'].values))\
           .map(load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE) \
           .batch(batch_size) \
           .prefetch(tf.data.AUTOTUNE)

Entrenemos ahora las capas intermedias del modelo con el *dataset* de ASL,

Cargamos el modelo de manera que sea entrenable. No incluímos su última capa para poder establecer nuestras propias clases.

In [11]:
# Cargamos InceptionV3
base_model = InceptionV3(weights = 'imagenet',       # Pre-entrenado con ImageNet
                         include_top = False,        # Sin incluir su capa de clasificacion con 1000 clases para poder hacer fine-tuning 
                         input_shape = (299, 299, 3) # Necesario cuando no incluimos la ultima capa
                        )

# Inicialmente descongelamos todas las capas, despues congelamos las que no queremos que se entrenen
base_model.trainable = True

# Descongelamos desde la capa llamada mixed7, lo que descongela las ultimas ~50 capas.
set_trainable = False
for layer in base_model.layers:
    if layer.name == "mixed7":
        set_trainable = True
    layer.trainable = set_trainable

# Construimos la cabeza de clasificacion para las 36 clases de ASL
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(36, activation='softmax')(x)  # 26 letras + 10 digitos

model = Model(inputs=base_model.input, outputs=predictions)

# Compilamos el modelo usando un learning_rate bajo.
tuning_learning_rate = 1e-5
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=tuning_learning_rate),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Lo entrenamos con esos datos
model.fit(asl_train_ds, validation_data=asl_val_ds, epochs=5)

# Y nos guardamos los pesos del modelo del cual luego usaremos todo menos la cabeza de clasificacion.
model.save_weights("inceptionv3_hand_features.weights.h5")

Epoch 1/5


2025-08-01 19:12:04.280045: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 34330112 bytes after encountering the first element of size 34330112 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 3s/step - accuracy: 0.0934 - loss: 3.4406 - val_accuracy: 0.1829 - val_loss: 3.3185
Epoch 2/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 3s/step - accuracy: 0.3941 - loss: 2.7724 - val_accuracy: 0.6223 - val_loss: 2.6404
Epoch 3/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 3s/step - accuracy: 0.6481 - loss: 2.1396 - val_accuracy: 0.8509 - val_loss: 1.8960
Epoch 4/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 3s/step - accuracy: 0.7987 - loss: 1.5833 - val_accuracy: 0.8986 - val_loss: 1.2799
Epoch 5/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 3s/step - accuracy: 0.8693 - loss: 1.1402 - val_accuracy: 0.9245 - val_loss: 0.8667


Ahora cargamos ese modelo que entrenamos pero le sacamos la cabeza y colocamos la clasificadora de LSA.

In [12]:
# Reconstruimos el modelo, nuevamente sin incluir el top.
base_model = InceptionV3(weights=None, include_top=False, input_shape=(299, 299, 3))
base_model.trainable = False  # Y en este caso freezamos todas las capas pues solo queremos entrenar la que agregaremos

# Le agregamos la ultima capa
num_classes = lsa_train_df['class'].nunique()

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

# Cargamos solo las capas compartidas con el modelo que entrenamos antes
model.load_weights("inceptionv3_hand_features.weights.h5", skip_mismatch=True)

# Lo compilamos, ahora con un learning rate un poco mas alto.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Y entrenamos
model.fit(lsa_train_ds, validation_data=lsa_val_ds, epochs=10)

Epoch 1/10



The shape of the target variable and the shape of the target value in `variable.assign(value)` must match. variable.shape=(1024, 16), Received: value.shape=(1024, 36). Target variable: <Variable path=dense_3/kernel, shape=(1024, 16), dtype=float32, value=[[ 0.04093459  0.04401912 -0.04039394 ...  0.0559013  -0.01975827
  -0.0489627 ]
 [-0.03034027 -0.03789191 -0.07005539 ...  0.01434596 -0.03185568
  -0.02784391]
 [-0.07448781 -0.03328026  0.02478389 ...  0.04706521 -0.04614044
   0.00913257]
 ...
 [-0.02381406  0.03114007  0.01424659 ...  0.00447538 -0.03607813
   0.02731498]
 [ 0.0759483   0.04060939  0.07290486 ...  0.01610276 -0.07496329
  -0.06972116]
 [ 0.06632379  0.03647552  0.04533759 ... -0.04951165 -0.03469245
   0.02361427]]>

List of objects that could not be loaded:
[<Dense name=dense_3, built=True>]


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2s/step - accuracy: 0.1426 - loss: 3.0731 - val_accuracy: 0.4141 - val_loss: 2.0438
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - accuracy: 0.4355 - loss: 1.7786 - val_accuracy: 0.5781 - val_loss: 1.4771
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - accuracy: 0.5801 - loss: 1.3073 - val_accuracy: 0.6094 - val_loss: 1.1665
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 2s/step - accuracy: 0.6934 - loss: 0.9759 - val_accuracy: 0.7344 - val_loss: 0.9432
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2s/step - accuracy: 0.7598 - loss: 0.7740 - val_accuracy: 0.7500 - val_loss: 0.8252
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - accuracy: 0.8359 - loss: 0.5942 - val_accuracy: 0.7656 - val_loss: 0.7744
Epoch 7/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d84243ce720>

Y guardamos los pesos de este modelo final,

In [16]:
model.save_weights("combined_interpreter_final.weights.h5")

Esto que sigue creo que no funciona.

In [17]:
img_path = 'lsa16_segmented/1_1_1.png'  
img = image.load_img(img_path, target_size=(299, 299)) # La carga en img y le hace resize a 299x299
img_array = image.img_to_array(img)                    # La convierte a array de NumPy con dimensiones (299, 299, 3)
img_array = np.expand_dims(img_array, axis=0)          # Agrega una dimension mas al array, haciendolo (1, 299, 299, 3) para batching
img_array = preprocess_input(img_array)                # Matchea la representacion de la imagen a como la espera ImageNet (ej. mappea 0-255 a -1,1, cambia de RGB a BGR)

# Predict
predictions = model.predict(img_array)
decoded_predictions = decode_predictions(predictions, top=5)[0]

# Display results
plt.imshow(img)
plt.axis('off')
plt.show()

print("Top 5 Predictions:")
for i, (_, label, prob) in enumerate(decoded_predictions):
    print(f"{i + 1}: {label} ({prob * 100:.2f}%)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step


ValueError: `decode_predictions` expects a batch of predictions (i.e. a 2D array of shape (samples, 1000)). Received array with shape: (1, 16)