In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/val.txt
/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/test.txt
/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/sampleSubmission.csv
/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/train.txt
/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/all.txt
/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/Challenge_Human_Motion_Generation_Notebook.ipynb
/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/texts/M009621.txt
/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/texts/M001922.txt
/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/texts/M000979.txt
/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/texts/M0024

In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from os.path import join
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [None]:
class BLEUCallback(tf.keras.callbacks.Callback):
    """Report sentence-level BLEU on a validation set at the end of every epoch.

    For each validation motion a caption is decoded greedily with the model
    being trained and scored against the reference token sequence. The
    per-sample scores and the epoch mean are printed, and the mean is also
    written to ``logs['val_bleu']`` so it appears in the training history.

    The decoding assumes the trained model contains one ``Embedding`` layer,
    an encoder ``LSTM`` with ``return_sequences=False`` and
    ``return_state=True``, a decoder ``LSTM`` with ``return_sequences=True``
    and ``return_state=True``, and a final ``Dense`` projection onto the
    vocabulary. Layers are located by type rather than by position in
    ``model.layers``.

    Args:
        X_val: iterable of motion arrays (one sample each, no batch axis).
        Y_val: iterable of reference token-id sequences aligned with ``X_val``.
        tokenizer: fitted Keras ``Tokenizer`` whose vocabulary contains
            ``'startseq'``.
        max_len: maximum number of words generated per caption (default 20,
            the value that used to be hard-coded).
    """

    # Sequence markers that must not be scored as caption words.
    _MARKERS = ('startseq', 'endseq')

    def __init__(self, X_val, Y_val, tokenizer, max_len=20):
        super().__init__()
        self.X_val = X_val
        self.Y_val = Y_val
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.smoothie = SmoothingFunction().method1  # avoids BLEU = 0 on short sentences

    def on_epoch_end(self, epoch, logs=None):
        """Decode every validation motion, print each BLEU score and the epoch mean."""
        bleu_scores = []
        # zip() iterates by position, so a pandas Series with a non-default
        # index works just like a list or an array.
        for motion, ref_ids in zip(self.X_val, self.Y_val):
            pred_sentence = self.generate_text(motion)
            ref_text = self.tokenizer.sequences_to_texts([ref_ids])[0]
            # The prediction never contains the markers, so drop them from the
            # reference as well before scoring.
            ref_tokens = [w for w in ref_text.split() if w not in self._MARKERS]
            ref_sentence = ' '.join(ref_tokens)

            bleu = sentence_bleu([ref_tokens], pred_sentence.split(), smoothing_function=self.smoothie)
            bleu_scores.append(bleu)

            # Per-sample report
            print(f"🎯 Référence: {ref_sentence}")
            print(f"🤖 Prédiction: {pred_sentence}")
            print(f"🔵 BLEU Score: {bleu:.4f}\n{'-'*40}")

        # np.mean([]) would return nan with a RuntimeWarning.
        avg_bleu = float(np.mean(bleu_scores)) if bleu_scores else 0.0
        if logs is not None:
            logs['val_bleu'] = avg_bleu
        print(f"🔹 BLEU moyen après epoch {epoch+1}: {avg_bleu:.4f}")

    def _decoding_layers(self):
        """Return (embedding, encoder_lstm, decoder_lstm, dense) found in ``self.model``.

        Raises:
            ValueError: if the model lacks one of the four expected layers.
        """
        embedding = encoder_lstm = decoder_lstm = dense = None
        for layer in self.model.layers:
            if isinstance(layer, Embedding):
                if embedding is None:
                    embedding = layer
            elif isinstance(layer, LSTM):
                if layer.return_sequences:
                    if decoder_lstm is None:
                        decoder_lstm = layer
                elif encoder_lstm is None:
                    encoder_lstm = layer
            elif isinstance(layer, Dense):
                dense = layer  # keep the last Dense: the vocabulary projection
        if None in (embedding, encoder_lstm, decoder_lstm, dense):
            raise ValueError(
                "BLEUCallback expects a model with an Embedding, an encoder LSTM, "
                "a decoder LSTM (return_sequences=True) and a Dense output layer"
            )
        return embedding, encoder_lstm, decoder_lstm, dense

    def generate_text(self, motion):
        """Greedily decode a caption for one motion using the model's own layers.

        Args:
            motion: array whose first axis is time; any trailing axes are
                flattened into a single feature axis before encoding.

        Returns:
            The decoded words joined by single spaces (without markers).
        """
        embedding, encoder_lstm, decoder_lstm, dense = self._decoding_layers()

        # Add the batch axis and flatten every frame into one feature vector.
        frames = np.asarray(motion, dtype='float32')
        batch = tf.convert_to_tensor(frames.reshape(1, frames.shape[0], -1))

        # Encoder pass: only the final hidden and cell states are needed.
        _, state_h, state_c = encoder_lstm(batch)

        # The decoder starts from the start-of-sequence token.
        target_seq = np.zeros((1, 1), dtype='int32')
        target_seq[0, 0] = self.tokenizer.word_index['startseq']

        decoded_sentence = []
        for _ in range(self.max_len):
            # One decoding step: token id -> embedding -> LSTM -> vocabulary probabilities.
            embedded = embedding(tf.convert_to_tensor(target_seq))
            step_output, state_h, state_c = decoder_lstm(embedded, initial_state=[state_h, state_c])
            probabilities = np.asarray(dense(step_output))

            # Greedy choice: most probable word.
            word_idx = int(np.argmax(probabilities[0, -1, :]))
            word = self.tokenizer.index_word.get(word_idx, '')

            # Stop on 'endseq' or on an index with no word (e.g. padding index 0).
            if word == 'endseq' or word == '':
                break

            decoded_sentence.append(word)
            target_seq[0, 0] = word_idx  # feed the chosen word back in

        return ' '.join(decoded_sentence)

In [8]:
def load_data(file_list, motion_data_dir, text_data_dir, limit=10):
    """Load the (description, motion) pairs listed in a split file.

    Args:
        file_list: path to a text file holding one sample id per line.
        motion_data_dir: directory containing the ``<id>.npy`` motion arrays.
        text_data_dir: directory containing the ``<id>.txt`` caption files
            (a trailing slash is optional).
        limit: maximum number of samples to load; ``None`` loads every id.
            Defaults to 10, the cap that used to be hard-coded.

    Returns:
        pandas.DataFrame with one row per sample id, in file order, and two
        columns: ``Description`` (text before the first ``#`` on the first
        line of the caption file, capitalised) and ``Motion`` (the array
        returned by ``np.load``).
    """
    with open(file_list, 'r') as f:
        # Skip blank lines so they are not treated as (empty) sample ids.
        titles = [line.strip() for line in f if line.strip()]
    if limit is not None:
        titles = titles[:limit]

    # Parallel lists instead of a dict keyed by description: two samples that
    # share a caption must both be kept.
    descriptions, motions = [], []
    for t in titles:
        motion_data = np.load(join(motion_data_dir, f"{t}.npy"))

        # Read and normalise the description.
        with open(join(text_data_dir, f"{t}.txt"), 'r', encoding='utf-8') as m:
            desc = m.readline().split('#')[0].capitalize()

        descriptions.append(desc)
        motions.append(motion_data)

    return pd.DataFrame({'Description': descriptions, 'Motion': motions})

def pad_motion_sequences(motions, max_length):
    """Stack variable-length motions into one zero-padded array.

    Args:
        motions: sequence (list, array or pandas Series) of arrays shaped
            ``(frames, N, d)`` that all share the same ``N`` and ``d``.
        max_length: number of frames in the output. Shorter motions are
            zero-padded at the end; longer ones keep only their first
            ``max_length`` frames.

    Returns:
        float ndarray of shape ``(len(motions), max_length, N, d)``.

    Raises:
        ValueError: if ``motions`` is empty.
    """
    # A list gives positional access even for a Series whose index is not 0..n-1.
    motions = list(motions)
    if not motions:
        raise ValueError("pad_motion_sequences() needs at least one motion")

    n = motions[0].shape[1]
    d = motions[0].shape[2]

    # Start from zeros so the untouched tail of each sample is the padding.
    padded = np.zeros((len(motions), max_length, n, d))

    for i, motion in enumerate(motions):
        # Clip to max_length: a motion longer than the target would otherwise
        # fail to broadcast into its slot.
        frames = min(motion.shape[0], max_length)
        padded[i, :frames, :, :] = motion[:frames]

    return padded

def predict_sequence(encoder_model, decoder_model, input_seq, tknz, max_seq):
    """Greedily decode one caption with separate encoder and decoder models.

    Args:
        encoder_model: model whose ``predict(input_seq)`` returns the two
            initial decoder states (hidden state, cell state).
        decoder_model: model whose ``predict([token, h, c])`` returns
            ``(token_probabilities, h, c)`` for a single decoding step.
        input_seq: batch of one encoder input.
        tknz: fitted tokenizer exposing ``word_index`` (must contain
            ``'startseq'``) and ``index_word``.
        max_seq: maximum number of decoding steps.

    Returns:
        The generated words joined by single spaces.
    """
    # Encoder: get the initial states. list() keeps the concatenation below
    # valid whether predict() returns a list or a tuple.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # The decoder starts from the start-of-sequence token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tknz.word_index['startseq']

    decoded_sentence = []

    for _ in range(max_seq):
        # One decoding step; verbose=0 avoids one progress bar per word.
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: most probable word.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tknz.index_word.get(sampled_token_index, '')

        # Stop on "endseq", or on an index with no word (e.g. padding index 0),
        # the same stopping rule BLEUCallback.generate_text uses.
        if sampled_word in ('endseq', ''):
            break

        decoded_sentence.append(sampled_word)

        # Feed the chosen word and the updated states into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)

In [14]:
# Dataset locations on Kaggle
direct = '/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/'
motion_data_dir = "/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/motions/"
text_data_dir = "/kaggle/input/generative-ai-for-3-d-human-motion-description/HumanML3D - Challenge/texts/"

# Read the training and validation splits
train_list = join(direct, 'train.txt')
val_list = join(direct, 'val.txt')
traindf = load_data(train_list, motion_data_dir, text_data_dir)
valdf = load_data(val_list, motion_data_dir, text_data_dir)

# Report the size of each split
for split_label, split_frame in (('Train:', traindf), ('Validation:', valdf)):
    print(split_label, split_frame.shape)

# Model inputs (motions) and targets (descriptions) for each split
xtrain = traindf['Motion']
ytrain = traindf['Description']
xval = valdf['Motion']
yval = valdf['Description']

Train: (10, 2)
Validation: (10, 2)


In [None]:
# Hyper-parameters
hidden_units = 256
emb_dim = 128
max_motion = max(x.shape[0] for x in xtrain)  # longest training motion, in frames

# Vocabulary: training descriptions plus the two sequence markers
tknz = Tokenizer()
tknz.fit_on_texts(ytrain.tolist() + ['startseq', 'endseq'])
vocab_size = len(tknz.word_index) + 1  # +1: Tokenizer indices start at 1, index 0 is left for padding

# Descriptions -> integer sequences
ytrainseq, yvalseq = (tknz.texts_to_sequences(texts) for texts in (ytrain, yval))
max_seq = max(map(len, ytrainseq))

# Pad every motion to max_motion frames, then flatten each frame into a
# single feature vector (the original note gives the shape as (x, x, 66))
xtrainpad = pad_motion_sequences(xtrain, max_motion)
xvalpad = pad_motion_sequences(xval, max_motion)
xtrainpad = xtrainpad.reshape(xtrainpad.shape[0], xtrainpad.shape[1], -1)
xvalpad = xvalpad.reshape(xvalpad.shape[0], xvalpad.shape[1], -1)
print("Forme des données de mouvement (train) :", xtrainpad.shape)
print("Forme des données de mouvement (validation) :", xvalpad.shape)

# Pad the token sequences to the longest training description
ytrainpad = pad_sequences(ytrainseq, maxlen=max_seq)
yvalpad = pad_sequences(yvalseq, maxlen=max_seq)
print("Forme des données de texte (train) :", ytrainpad.shape)
print("Forme des données de texte (validation) :", yvalpad.shape)

In [None]:
# Encoder: an LSTM over the motion frames.
# NOTE: layers are created in a fixed order on purpose - Keras auto-generated
# layer names and the order of `model.layers` depend on it.
motion_input = Input(shape=(max_motion, 22, 3))  # (frames, 22, 3) - presumably 22 joints x 3 coordinates, TODO confirm
reshape = tf.keras.layers.Reshape((max_motion, 66))(motion_input)

# return_state=True makes the layer return (output, hidden state, cell state);
# only the two final states are kept and passed to the decoder.
encoder_lstm = LSTM(hidden_units, return_state=True)
_, state_h, state_c = encoder_lstm(reshape)
encoder_states = [state_h, state_c]

# Stand-alone encoder model (raw 4-D motion in, final states out), used at inference time
encoder_model = Model(motion_input, encoder_states)

# Decoder for the text: embedded tokens -> LSTM initialised with the encoder states
text_input = Input(shape=(max_seq,))
embedding = Embedding(input_dim=vocab_size, output_dim=emb_dim, mask_zero=True)(text_input)

decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(embedding, initial_state=encoder_states)

# Per-step probability distribution over the vocabulary
decoder_dense = Dense(vocab_size, activation='softmax')
output = decoder_dense(decoder_output)

In [None]:
# Full training model: (motion, caption tokens) -> per-step vocabulary probabilities.
# The motion input here is the intermediate `reshape` tensor, not `motion_input`,
# so this model takes motions already flattened to (max_motion, 66) per sample.
model = Model(inputs=[reshape, text_input], outputs=output)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Inputs of the step-by-step inference decoder
decoder_input = Input(shape=(1,))  # one token per call
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Embed the current token with the embedding layer that lives inside `model`.
# Creating a new Embedding here would give the inference decoder its own,
# never-trained weights, so its predictions would not reflect training.
trained_embedding = next(layer for layer in model.layers if isinstance(layer, Embedding))
decoder_embedding = trained_embedding(decoder_input)

# Decoder LSTM (shared with the training graph), driven by externally supplied states
decoder_output, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

# Output layer (shared with the training graph)
decoder_output = decoder_dense(decoder_output)

# Inference decoder: (token, h, c) -> (vocabulary probabilities, new h, new c)
decoder_model = Model([decoder_input] + decoder_states_inputs, [decoder_output] + decoder_states)

In [None]:
# Training with teacher forcing.
# The decoder must learn to predict the NEXT token, so its input is the caption
# shifted right behind 'startseq' and its target is the caption followed by
# 'endseq'. Feeding the same sequence as input and target only teaches the
# model to copy, and the markers used at inference time would never be seen.
start_id = tknz.word_index['startseq']
end_id = tknz.word_index['endseq']
framed = [[start_id] + list(seq) + [end_id] for seq in ytrainseq]

# Both arrays keep the length max_seq expected by `text_input`. Post-padding
# keeps input and target aligned; a caption of full length max_seq loses its
# final 'endseq' target to truncation.
decoder_in = pad_sequences([seq[:-1] for seq in framed], maxlen=max_seq,
                           padding='post', truncating='post')
decoder_target = pad_sequences([seq[1:] for seq in framed], maxlen=max_seq,
                               padding='post', truncating='post')

model.fit([xtrainpad, decoder_in], decoder_target,
          batch_size=32, epochs=20)

In [None]:
# Show the first validation predictions (at most 5, fewer if the split is smaller).
# `xvalpad` was flattened to one feature vector per frame, while `encoder_model`
# was built on the un-flattened motion input, so each sample is reshaped back to
# the per-sample shape the encoder declares.
encoder_sample_shape = tuple(encoder_model.input_shape[1:])
for i in range(min(5, len(xvalpad))):
    input_seq = xvalpad[i:i+1].reshape((1,) + encoder_sample_shape)  # batch of one
    predicted_sentence = predict_sequence(encoder_model, decoder_model, input_seq, tknz, max_seq)
    print(f"Phrase prédite {i+1}: {predicted_sentence}")