In this code we try and load the data pre process it and train it on models that we made from scratch to test their effeciency and what works best.

In [12]:
# ==========================
# IMPORTS
# ==========================
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

# ==========================
# MOUNT GOOGLE DRIVE
# ==========================
from google.colab import drive
drive.mount('/content/drive')

# ==========================
# LOAD DATA
# ==========================
script_dir = "/content/drive/MyDrive/data"
file_name = "voice_commands_dataset_en_with_no_meaning.csv"
file_path = os.path.join(script_dir, file_name)

df = pd.read_csv(file_path)
print(df.head())
print(df['label'].value_counts())

# ==========================
# TRAIN/VAL/TEST SPLIT
# ==========================
X = df['text']
y = df['label']

# Split into train + temp
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
# Split temp into validation + test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                               text        label language
0                listen will you now give me a show   do_a_trick       en
1          hey friend can you instantly flip around  turn_around       en
2                  go1 could you quickly posture up        stand       en
3              listen go1 can you for me about face  turn_around       en
4  listen friend could you now go back about 6 step      go_back       en
label
do_a_trick     6250
turn_around    6250
stand          6250
go_back        6250
go_right       6250
go_forward     6250
sit            6250
go_left        6250
no_meaning     6250
Name: count, dtype: int64
Training samples: 39375
Validation samples: 8437
Test samples: 8438


This code implements a deep Bi-directional LSTM model for intent classification:

Embedding layer: Converts words into 64-dimensional dense vectors.

Three BiLSTM layers:

128 units, returns sequences → followed by 30% dropout

64 units, returns sequences → 30% dropout

32 units, returns last output → 30% dropout

Dense layers:

64-unit fully connected layer with ReLU → 30% dropout

Output layer with softmax activation matching the number of intent classes.

It is trained using categorical cross-entropy and Adam optimizer, taking sequences of max length 20 as input, and predicts the probability for each intent class.

The accuracy it gave was very high from the begining so we change the architecture and retrain since we think it did overfit on the data.

In [13]:
# ==========================
# TOKENIZATION & PADDING
# ==========================
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1
max_len = 20  # max words per command, adjust if needed

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# ==========================
# LABEL ENCODING
# ==========================
le = LabelEncoder()
y_train_enc = to_categorical(le.fit_transform(y_train))
y_val_enc = to_categorical(le.transform(y_val))
y_test_enc = to_categorical(le.transform(y_test))
num_classes = y_train_enc.shape[1]

# ==========================
# BUILD MULTI-LAYER MODEL
# ==========================
embedding_dim = 64

model = Sequential()
# Embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
# First LSTM layer
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))
# Second LSTM layer
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
# Third LSTM layer
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.3))
# Dense layers
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# ==========================
# TRAIN MODEL
# ==========================
history = model.fit(
    X_train_pad, y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=15,
    batch_size=32
)

# ==========================
# EVALUATE
# ==========================
y_pred = model.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test_enc.argmax(axis=1)

print(classification_report(y_test_classes, y_pred_classes, target_names=le.classes_))

# ==========================
# OPTIONAL: SAVE MODEL
# ==========================
model.save("/content/drive/MyDrive/voice_intent_model.h5")




Epoch 1/15
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.6438 - loss: 0.9005 - val_accuracy: 0.9967 - val_loss: 0.0225
Epoch 2/15
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 18ms/step - accuracy: 0.9920 - loss: 0.0440 - val_accuracy: 0.9998 - val_loss: 0.0027
Epoch 3/15
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - accuracy: 0.9987 - loss: 0.0065 - val_accuracy: 0.9986 - val_loss: 0.0118
Epoch 4/15
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 17ms/step - accuracy: 0.9965 - loss: 0.0175 - val_accuracy: 0.9998 - val_loss: 4.8702e-04
Epoch 5/15
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - accuracy: 0.9998 - loss: 0.0015 - val_accuracy: 1.0000 - val_loss: 2.4189e-05
Epoch 6/15
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 17ms/step - accuracy: 0.9988 - loss: 0.0076 - val_accuracy: 1.0000 - val_loss: 4.1



              precision    recall  f1-score   support

  do_a_trick       1.00      1.00      1.00       937
     go_back       1.00      1.00      1.00       937
  go_forward       1.00      1.00      1.00       938
     go_left       1.00      1.00      1.00       938
    go_right       1.00      1.00      1.00       938
  no_meaning       1.00      1.00      1.00       937
         sit       1.00      1.00      1.00       938
       stand       1.00      1.00      1.00       937
 turn_around       1.00      1.00      1.00       938

    accuracy                           1.00      8438
   macro avg       1.00      1.00      1.00      8438
weighted avg       1.00      1.00      1.00      8438



This is the second model which with a first look is better but still may be overfitting its architecture is as follows:
This model is a deep Bi-directional LSTM network for intent classification. It has an embedding layer (64-dim), two BiLSTM layers (64 and 32 units), each followed by dropout, a dense layer with 64 neurons, another dropout, and a softmax output layer that predicts probabilities for each intent class. It uses heavy dropout and spatial dropout for regularization to prevent overfitting.

In [15]:
# ==========================
# IMPORTS
# ==========================
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import random

# ==========================
# MOUNT DRIVE
# ==========================
from google.colab import drive
drive.mount('/content/drive')

# ==========================
# LOAD DATA
# ==========================
script_dir = "/content/drive/MyDrive/data"
file_name = "voice_commands_dataset_en_with_no_meaning.csv"
file_path = os.path.join(script_dir, file_name)

df = pd.read_csv(file_path)
print(df.head())
print(df['label'].value_counts())

# ==========================
# SIMPLE DATA AUGMENTATION
# ==========================
# Replace some words with synonyms (very basic example)
synonyms = {
    "stand": ["stand up", "rise", "get up"],
    "sit": ["sit down", "take a seat"],
    "go_forward": ["move forward", "advance", "go ahead"],
    "go_back": ["move back", "retreat", "go backward"],
    "go_left": ["turn left", "move left"],
    "go_right": ["turn right", "move right"],
    "turn_around": ["spin", "rotate 180", "turn"],
    "do_a_trick": ["perform a trick", "show a trick"],
    "no_meaning": []  # no synonyms (skip)
}

def augment_command(command, intent):
    if intent in synonyms and synonyms[intent]:
        if random.random() < 0.5:
            return random.choice(synonyms[intent])
    return command

df['command_aug'] = df.apply(lambda x: augment_command(x['text'], x['label']), axis=1)

# ==========================
# TRAIN/VAL/TEST SPLIT
# ==========================

X = df['text']
y = df['label']


X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# ==========================
# TOKENIZATION & PADDING
# ==========================
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1
max_len = 20

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# ==========================
# LABEL ENCODING
# ==========================
le = LabelEncoder()
y_train_enc = to_categorical(le.fit_transform(y_train))
y_val_enc = to_categorical(le.transform(y_val))
y_test_enc = to_categorical(le.transform(y_test))
num_classes = y_train_enc.shape[1]

# ==========================
# BUILD REGULARIZED MULTI-LAYER MODEL
# ==========================
embedding_dim = 64

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(SpatialDropout1D(0.5))  # embedding-level dropout
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# ==========================
# TRAIN MODEL
# ==========================
history = model.fit(
    X_train_pad, y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=32
)

# ==========================
# EVALUATE
# ==========================
y_pred = model.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test_enc.argmax(axis=1)

print(classification_report(y_test_classes, y_pred_classes, target_names=le.classes_))

# ==========================
# SAVE MODEL
# ==========================
model.save("/content/drive/MyDrive/voice_intent_model.keras")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                               text        label language
0                listen will you now give me a show   do_a_trick       en
1          hey friend can you instantly flip around  turn_around       en
2                  go1 could you quickly posture up        stand       en
3              listen go1 can you for me about face  turn_around       en
4  listen friend could you now go back about 6 step      go_back       en
label
do_a_trick     6250
turn_around    6250
stand          6250
go_back        6250
go_right       6250
go_forward     6250
sit            6250
go_left        6250
no_meaning     6250
Name: count, dtype: int64


Epoch 1/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - accuracy: 0.5600 - loss: 1.1078 - val_accuracy: 0.9976 - val_loss: 0.0119
Epoch 2/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.9882 - loss: 0.0587 - val_accuracy: 0.9976 - val_loss: 0.0086
Epoch 3/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 14ms/step - accuracy: 0.9935 - loss: 0.0315 - val_accuracy: 0.9988 - val_loss: 0.0034
Epoch 4/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 13ms/step - accuracy: 0.9951 - loss: 0.0226 - val_accuracy: 0.9998 - val_loss: 7.3513e-04
Epoch 5/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.9968 - loss: 0.0148 - val_accuracy: 0.9998 - val_loss: 8.6675e-04
Epoch 6/20
[1m1231/1231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 14ms/step - accuracy: 0.9986 - loss: 0.0078 - val_accuracy: 1.0000 - val_loss: 1.6

we save the model to use

In [16]:
import pickle

with open("/content/drive/MyDrive/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

with open("/content/drive/MyDrive/label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

we create an interavtive module to input text and it gives the probability corresponding for each intent.

In [17]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# ==========================
# LOAD MODEL
# ==========================
model_path = "/content/drive/MyDrive/voice_intent_model.keras"
model = load_model(model_path)

# ==========================
# LOAD TOKENIZER & LABEL ENCODER
# ==========================
# Make sure you saved them after training
tokenizer_path = "/content/drive/MyDrive/tokenizer.pkl"
label_encoder_path = "/content/drive/MyDrive/label_encoder.pkl"

with open(tokenizer_path, "rb") as f:
    tokenizer = pickle.load(f)

with open(label_encoder_path, "rb") as f:
    le = pickle.load(f)

# ==========================
# FUNCTION TO PREDICT INTENT PROBABILITIES
# ==========================
max_len = 20  # same as used during training

def predict_intent(command):
    seq = tokenizer.texts_to_sequences([command])
    pad = pad_sequences(seq, maxlen=max_len, padding='post')
    probs = model.predict(pad)[0]  # output probabilities
    intent_probs = dict(zip(le.classes_, probs))
    sorted_intents = dict(sorted(intent_probs.items(), key=lambda x: x[1], reverse=True))
    return sorted_intents

# ==========================
# INTERACTIVE LOOP
# ==========================
while True:
    command = input("Enter command (or 'quit' to exit): ")
    if command.lower() == 'quit':
        break
    predictions = predict_intent(command)
    print("\nIntent probabilities:")
    for intent, prob in predictions.items():
        print(f"{intent}: {prob:.4f}")
    print("\nMost likely intent:", max(predictions, key=predictions.get))
    print("="*40)


Enter command (or 'quit' to exit): sleep
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step

Intent probabilities:
no_meaning: 1.0000
do_a_trick: 0.0000
turn_around: 0.0000
stand: 0.0000
go_right: 0.0000
go_left: 0.0000
sit: 0.0000
go_back: 0.0000
go_forward: 0.0000

Most likely intent: no_meaning
Enter command (or 'quit' to exit): rest
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step

Intent probabilities:
no_meaning: 1.0000
sit: 0.0000
go_right: 0.0000
do_a_trick: 0.0000
stand: 0.0000
go_back: 0.0000
turn_around: 0.0000
go_left: 0.0000
go_forward: 0.0000

Most likely intent: no_meaning
Enter command (or 'quit' to exit): lets walk
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step

Intent probabilities:
go_forward: 0.7068
turn_around: 0.2686
go_left: 0.0114
go_right: 0.0077
do_a_trick: 0.0034
go_back: 0.0018
stand: 0.0002
no_meaning: 0.0000
sit: 0.0000

Most likely intent: go_forward
Enter command (or 'quit' to exit): r

The below code uses softmax algorithim to train on the data. we tried playing with the hyper parameters to get the best results possible.

In [21]:
# ==========================
# IMPORTS
# ==========================
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# ==========================
# LOAD DATA
# ==========================
file_path = "/content/drive/MyDrive/data/voice_commands_dataset_en_with_no_meaning.csv"

df = pd.read_csv(file_path)
print("Dataset sample:")
print(df.head())

# Ensure correct column names
df = df.rename(columns={"text": "command", "label": "intent"})

# Drop missing rows (just in case)
df = df.dropna(subset=["command", "intent"])

# ==========================
# TRAIN / VALIDATION / TEST SPLIT
# ==========================
X = df["command"]
y = df["intent"]

# Split into train, validation, and test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print(f"Training samples: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

# ==========================
# TEXT VECTORIZATION (TF–IDF)
# ==========================
vectorizer = TfidfVectorizer(
    max_features=30000,      # Limit vocab size
    ngram_range=(1, 2),     # Unigrams + bigrams
    stop_words='english'    # Remove common English words
)

X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)

print("Vectorized shape:", X_train_vec.shape)

# ==========================
# LABEL ENCODING
# ==========================
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)
y_test_enc = le.transform(y_test)

num_classes = len(le.classes_)
print("Classes:", list(le.classes_))

# ==========================
# MODEL: MULTINOMIAL LOGISTIC REGRESSION (SOFTMAX)
# ==========================
clf = LogisticRegression(
    max_iter=10000,
    solver='lbfgs',
    multi_class='multinomial',  # enables softmax regression
    C=2.0,                      # regularization strength (higher = less regularization)
)

# ==========================
# TRAIN MODEL
# ==========================
clf.fit(X_train_vec, y_train_enc)

# ==========================
# VALIDATE MODEL
# ==========================
y_val_pred = clf.predict(X_val_vec)
val_acc = accuracy_score(y_val_enc, y_val_pred)
print("\nValidation Accuracy:", val_acc)
print("\nValidation Classification Report:\n")
print(classification_report(y_val_enc, y_val_pred, target_names=le.classes_))

# ==========================
# TEST MODEL
# ==========================
y_test_pred = clf.predict(X_test_vec)
test_acc = accuracy_score(y_test_enc, y_test_pred)
print("\n==========================")
print("📊 FINAL TEST PERFORMANCE")
print("==========================")
print("Test Accuracy:", test_acc)
print("\nClassification Report:\n")
print(classification_report(y_test_enc, y_test_pred, target_names=le.classes_))

# ==========================
# SAVE MODEL & VECTORIZER
# ==========================
import joblib
joblib.dump(clf, "/content/drive/MyDrive/voice_intent_softmax_model.pkl")
joblib.dump(vectorizer, "/content/drive/MyDrive/voice_intent_tfidf_vectorizer.pkl")
joblib.dump(le, "/content/drive/MyDrive/voice_intent_label_encoder.pkl")

print("\n✅ Model, vectorizer, and label encoder saved successfully.")


Dataset sample:
                                               text        label language
0                listen will you now give me a show   do_a_trick       en
1          hey friend can you instantly flip around  turn_around       en
2                  go1 could you quickly posture up        stand       en
3              listen go1 can you for me about face  turn_around       en
4  listen friend could you now go back about 6 step      go_back       en
Training samples: 39375, Validation: 8437, Test: 8438
Vectorized shape: (39375, 1936)
Classes: ['do_a_trick', 'go_back', 'go_forward', 'go_left', 'go_right', 'no_meaning', 'sit', 'stand', 'turn_around']





Validation Accuracy: 0.968353680218087

Validation Classification Report:

              precision    recall  f1-score   support

  do_a_trick       0.90      0.89      0.90       938
     go_back       0.96      0.94      0.95       938
  go_forward       1.00      1.00      1.00       937
     go_left       1.00      1.00      1.00       937
    go_right       1.00      1.00      1.00       937
  no_meaning       1.00      1.00      1.00       938
         sit       0.99      0.96      0.98       937
       stand       0.90      0.95      0.92       938
 turn_around       0.96      0.98      0.97       937

    accuracy                           0.97      8437
   macro avg       0.97      0.97      0.97      8437
weighted avg       0.97      0.97      0.97      8437


📊 FINAL TEST PERFORMANCE
Test Accuracy: 0.9677648731926997

Classification Report:

              precision    recall  f1-score   support

  do_a_trick       0.91      0.89      0.90       937
     go_back       0.95  

Its the same interactive module but to test the softmax algorithim.

In [22]:
# ==========================
# INTERACTIVE INFERENCE
# ==========================
import joblib
from google.colab import output

# Load saved model, vectorizer, and label encoder
model_path = "/content/drive/MyDrive/voice_intent_softmax_model.pkl"
vectorizer_path = "/content/drive/MyDrive/voice_intent_tfidf_vectorizer.pkl"
label_encoder_path = "/content/drive/MyDrive/voice_intent_label_encoder.pkl"

clf = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)
le = joblib.load(label_encoder_path)

# Interactive prediction loop
def predict_intent():
    while True:
        text = input("\n🎙️ Enter a command (or type 'exit' to quit): ").strip()
        if text.lower() == 'exit':
            print("👋 Exiting interactive mode.")
            break
        if text == "":
            print("⚠️ Please enter some text.")
            continue

        # Transform text
        text_vec = vectorizer.transform([text])
        probs = clf.predict_proba(text_vec)[0]
        pred_class = clf.predict(text_vec)[0]
        intent = le.inverse_transform([pred_class])[0]

        # Get top 3 probable intents
        top_idx = probs.argsort()[-3:][::-1]
        top_labels = le.inverse_transform(top_idx)
        top_probs = probs[top_idx]

        print(f"\n🔍 Predicted intent: **{intent.upper()}**")
        print("Top predictions:")
        for lbl, p in zip(top_labels, top_probs):
            print(f"  - {lbl:15s}: {p*100:.2f}%")

# Run interactive loop
predict_intent()



🎙️ Enter a command (or type 'exit' to quit): dance

🔍 Predicted intent: **DO_A_TRICK**
Top predictions:
  - do_a_trick     : 99.32%
  - no_meaning     : 0.59%
  - turn_around    : 0.02%

🎙️ Enter a command (or type 'exit' to quit): flip

🔍 Predicted intent: **TURN_AROUND**
Top predictions:
  - turn_around    : 95.90%
  - do_a_trick     : 3.97%
  - no_meaning     : 0.12%

🎙️ Enter a command (or type 'exit' to quit): backflip 

🔍 Predicted intent: **NO_MEANING**
Top predictions:
  - no_meaning     : 85.99%
  - go_back        : 3.05%
  - do_a_trick     : 2.94%

🎙️ Enter a command (or type 'exit' to quit): perform

🔍 Predicted intent: **DO_A_TRICK**
Top predictions:
  - do_a_trick     : 89.52%
  - no_meaning     : 7.42%
  - go_back        : 0.69%

🎙️ Enter a command (or type 'exit' to quit): lets walk

🔍 Predicted intent: **NO_MEANING**
Top predictions:
  - no_meaning     : 65.80%
  - go_back        : 22.23%
  - go_forward     : 3.16%

🎙️ Enter a command (or type 'exit' to quit): advance
