# Imports

In [72]:
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_experimental.llms.ollama_functions import OllamaFunctions

import os
import csv
import speech_recognition as sr
from tqdm import tqdm

from dataclasses import dataclass

from djitellopy import Tello

from sklearn.metrics import classification_report
import numpy as np


# Collect Audio

In [73]:
import pyaudio

p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    info = p.get_device_info_by_index(i)
    print(f"Device {i}: {info['name']}")
p.terminate()

Device 0: iPhone de Lucca Microphone
Device 1: External Microphone
Device 2: External Headphones
Device 3: MacBook Pro Microphone
Device 4: MacBook Pro Speakers


In [74]:
import pyaudio
import wave

class RecordVoice:
    def __init__(self, 
                 chunk=1024, 
                 format=pyaudio.paInt16, 
                 channels=1, 
                 rate=44100, 
                 record_seconds=2.5,
                 output_filename="voice.wav", 
                 device_index=1):
        
        self.chunk = chunk
        self.format = format
        self.channels = channels
        self.rate = rate
        self.record_seconds = record_seconds
        self.output_filename = output_filename
        self.device_index = device_index
        self.frames = []
        self.p = pyaudio.PyAudio()

    def record(self):
        stream = self.p.open(format=self.format,
                             channels=self.channels,
                             rate=self.rate,
                             input=True,
                             input_device_index=self.device_index,
                             frames_per_buffer=self.chunk)

        print("* recording")

        for i in range(0, int(self.rate / self.chunk * self.record_seconds)):
            data = stream.read(self.chunk)
            self.frames.append(data)

        print("* done recording")

        stream.stop_stream()
        stream.close()
        self.p.terminate()

        wf = wave.open(self.output_filename, 'wb')
        wf.setnchannels(self.channels)
        wf.setsampwidth(self.p.get_sample_size(self.format))
        wf.setframerate(self.rate)
        wf.writeframes(b''.join(self.frames))
        wf.close()

# STT

In [54]:
# Download the model and processor

#from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# import torch

# model_name = "facebook/wav2vec2-large-xlsr-53-portuguese"

# print("Starting to download the processor...")
# # Download and save the processor locally
# processor = Wav2Vec2Processor.from_pretrained(model_name)
# processor.save_pretrained("./wav2vec2-large-xlsr-53-portuguese")
# print("Processor downloaded and saved.")

# print("Starting to download the model...")
# # Download and save the model locally
# model = Wav2Vec2ForCTC.from_pretrained(model_name)
# model.save_pretrained("./wav2vec2-large-xlsr-53-portuguese")
# print("Model downloaded and saved.")


In [75]:
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import warnings
import librosa

class STT:
    def __init__(self,processor,model):
        # Suppress warnings
        warnings.filterwarnings("ignore", category=FutureWarning)
        warnings.filterwarnings("ignore", category=UserWarning)

        # Load the model and processor from the local directory
        self.processor = processor
        self.model = model

    def transcribe_audio(self, audio_path):
        def read_audio(file_path, target_sr=16000):
            audio_input, sr = librosa.load(file_path, sr=target_sr)
            return audio_input

        audio_input = read_audio(audio_path)

        # Process the audio
        input_values = self.processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values

        # Perform inference
        with torch.no_grad():
            logits = self.model(input_values).logits

        # Decode the predicted IDs to text
        predicted_ids = logits.argmax(dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)

        return transcription[0]


In [77]:
processor = Wav2Vec2Processor.from_pretrained("./wav2vec2-large-xlsr-53-portuguese")
model = Wav2Vec2ForCTC.from_pretrained("./wav2vec2-large-xlsr-53-portuguese")

stt = STT(processor,model)
text = stt.transcribe_audio('/Users/luccaemmanuel/Desktop/BIA/FunctionCalling/Dados/Dados_wav/DESCER/00000002-AUDIO-2024-06-15-20-17-42.wav')
print(text)


vai para baixo


# LLM Processor 

In [44]:
class Command(BaseModel):
    direction: str = Field(description="A direção que o drone deve ir.", required=True)


class LLMProcessor:
    def __init__(self):
        self.prompt = PromptTemplate.from_template(
            """system
            You are a smart assistant. Take the following context and question below and return your answer in JSON.
            user
            QUESTION: {question} \n
            CONTEXT: {context} \n
            JSON:
            
            assistant
            """
        )
        
        self.llm = OllamaFunctions(model="llama3", format="json", temperature=0)
        self.structured_llm = self.llm.with_structured_output(Command)
        self.chain = self.prompt | self.structured_llm
    
    def process(self, text):
        self.context = f'''Uma pessoa está controlando um drone por voz. As possíveis direções do drone são: SUBIR, DESCER, FRENTE, TRAS, DIREITA, ESQUERDA. Segue a transcrição do áudio do comando da pessoa: '{text}' '''
      
        try:
            response = self.chain.invoke({
                "question": "Qual direção a pessoa mandou o drone ir?",
                "context": self.context
            })
        except Exception as e:
            response = {"direction": "DESCONHECIDO"}
   
        return response
    

# Drone 

In [40]:
class DroneController:
    def __init__(self,drone):
        self.drone = drone
        
    def control_drone(self, command):
        if command == "SUBIR":
            self.drone.takeoff()
            self.drone.move_up(70)
        elif command == "DESCER":
            self.drone.move_down(70)
            self.drone.land()
        elif command == "ESQUERDA":
            self.drone.move_left(70)
        elif command == "DIREITA":
            self.drone.move_right(70)
        elif command == "FRENTE":
            self.drone.move_forward(70)
        elif command == "TRAS":
            self.drone.move_back(70)
        else:
            print("Unknown command")

        self.drone.end()

# Pipeline

In [7]:
processor = Wav2Vec2Processor.from_pretrained("/Users/luccaemmanuel/Desktop/BIA/FunctionCalling/asr_processor")
model = Wav2Vec2ForCTC.from_pretrained("/Users/luccaemmanuel/Desktop/BIA/FunctionCalling/asr_model")



In [141]:
drone = Tello()
drone.connect()

[INFO] tello.py - 129 - Tello instance was initialized. Host: '192.168.10.1'. Port: '8889'.
[INFO] tello.py - 438 - Send command: 'command'
[ERROR] tello.py - 458 - 'utf-8' codec can't decode byte 0xcc in position 0: invalid continuation byte
[INFO] tello.py - 438 - Send command: 'command'
[INFO] tello.py - 462 - Response command: 'ok'


In [92]:
drone.takeoff()

[INFO] tello.py - 438 - Send command: 'takeoff'
[INFO] tello.py - 462 - Response takeoff: 'ok'


In [93]:
recorder = RecordVoice()
stt = STT(processor,model)
llm = LLMProcessor()
controller = DroneController(drone)

path = '/Users/luccaemmanuel/Desktop/BIA/FunctionCalling/voice.wav'
recorder.record()
stt_text = stt.transcribe_audio(path)
llm_response = llm.process(stt_text)
controller.control_drone(llm_response.direction)


* recording
* done recording
e covai para a frente


[INFO] tello.py - 438 - Send command: 'forward 40'


FRENTE


[INFO] tello.py - 462 - Response forward 40: 'ok'
[INFO] tello.py - 438 - Send command: 'land'
[INFO] tello.py - 462 - Response land: 'ok'


- TESTE

In [118]:
stt = STT(processor,model)
llm = LLMProcessor()
controller = DroneController(drone)
path = '/Users/luccaemmanuel/Desktop/BIA/FunctionCalling/voice.wav'

In [142]:
drone.takeoff()

[INFO] tello.py - 438 - Send command: 'takeoff'
[INFO] tello.py - 462 - Response takeoff: 'ok'


In [143]:
RecordVoice().record()

stt_text = stt.transcribe_audio(path)
print('Texto: ',stt_text)
llm_response = llm.process(stt_text)
print(llm_response.direction)

controller.control_drone(llm_response.direction)



* recording
* done recording
Texto:  vai para trás por favor


[INFO] tello.py - 438 - Send command: 'back 40'
[INFO] tello.py - 462 - Response back 40: 'ok'


TRAS


# Metrics

In [None]:
processor = Wav2Vec2Processor.from_pretrained("/Users/luccaemmanuel/Desktop/BIA/FunctionCalling/asr_processor")
model = Wav2Vec2ForCTC.from_pretrained("/Users/luccaemmanuel/Desktop/BIA/FunctionCalling/asr_model")

In [45]:
stt = STT(processor,model)
llm = LLMProcessor()

In [46]:
import glob
import time
base_directory = '/Users/luccaemmanuel/Desktop/BIA/FunctionCalling/Dados/Dados_wav_augmented'
files = []
labels = []

for label in os.listdir(base_directory):
    folder_path = os.path.join(base_directory, label)
    if os.path.isdir(folder_path):
        for audio_file in tqdm(glob.glob(os.path.join(folder_path, '*.wav'))):
            files.append(audio_file)
            labels.append(label)


100%|██████████| 780/780 [00:00<00:00, 4084340.97it/s]
100%|██████████| 845/845 [00:00<00:00, 3790574.20it/s]
100%|██████████| 808/808 [00:00<00:00, 5050667.11it/s]
100%|██████████| 809/809 [00:00<00:00, 5627183.97it/s]
100%|██████████| 731/731 [00:00<00:00, 5369590.58it/s]
100%|██████████| 822/822 [00:00<00:00, 5873454.66it/s]


In [47]:
import random 

combined = list(zip(files, labels))
random.shuffle(combined)
files, labels = zip(*combined)

files = list(files)[:200]
labels = list(labels)[:200]



In [69]:
predictions = []
inference_times = []
texts = []
for file in tqdm(files):
    #calculate time of inference
    start = time.time()

    stt_text = stt.transcribe_audio(file)
    prediction = llm.process(stt_text)


    end = time.time()
    predictions.append(prediction)
    texts.append(stt_text)
    inference_times.append(end - start)

100%|██████████| 200/200 [04:16<00:00,  1.28s/it]


In [54]:
predictions_cleaned = []
for pred in predictions:
  
    if isinstance(pred,dict):
        predictions_cleaned.append('DESCONHECIDO')
    else:
        predictions_cleaned.append(pred.direction)

In [60]:
# Substitute anything other then "SUBIR", "DESCER", "ESQUERDA", "DIREITA", "FRENTE", "TRAS" with "DESCONHECIDO"

predictions_cleaned = [p if p in ["SUBIR", "DESCER", "ESQUERDA", "DIREITA", "FRENTE", "TRAS"] else "DESCONHECIDO" for p in predictions_cleaned]

In [61]:
predictions_cleaned

['SUBIR',
 'DESCER',
 'DIREITA',
 'ESQUERDA',
 'SUBIR',
 'FRENTE',
 'SUBIR',
 'ESQUERDA',
 'DIREITA',
 'FRENTE',
 'SUBIR',
 'ESQUERDA',
 'TRAS',
 'ESQUERDA',
 'DESCER',
 'TRAS',
 'TRAS',
 'DESCONHECIDO',
 'TRAS',
 'DIREITA',
 'DESCER',
 'SUBIR',
 'FRENTE',
 'FRENTE',
 'TRAS',
 'TRAS',
 'FRENTE',
 'FRENTE',
 'SUBIR',
 'FRENTE',
 'SUBIR',
 'TRAS',
 'SUBIR',
 'SUBIR',
 'FRENTE',
 'DIREITA',
 'DIREITA',
 'SUBIR',
 'DIREITA',
 'TRAS',
 'DESCER',
 'DESCONHECIDO',
 'SUBIR',
 'DESCONHECIDO',
 'ESQUERDA',
 'FRENTE',
 'SUBIR',
 'FRENTE',
 'FRENTE',
 'DESCONHECIDO',
 'DESCER',
 'FRENTE',
 'TRAS',
 'TRAS',
 'ESQUERDA',
 'FRENTE',
 'TRAS',
 'DESCER',
 'FRENTE',
 'ESQUERDA',
 'FRENTE',
 'TRAS',
 'DIREITA',
 'DESCER',
 'DIREITA',
 'ESQUERDA',
 'ESQUERDA',
 'FRENTE',
 'ESQUERDA',
 'SUBIR',
 'TRAS',
 'DESCONHECIDO',
 'DESCER',
 'TRAS',
 'FRENTE',
 'FRENTE',
 'DESCER',
 'TRAS',
 'DIREITA',
 'SUBIR',
 'DIREITA',
 'SUBIR',
 'TRAS',
 'TRAS',
 'FRENTE',
 'ESQUERDA',
 'SUBIR',
 'DESCONHECIDO',
 'DESCER',
 'F

In [68]:
report = classification_report(labels, predictions_cleaned)
print("Classification Report:")
print(report)

print('Mean inference time: ', np.mean(inference_times))
print('Std inference time: ', np.std(inference_times))

# Count how many 'desconhidos' there are
desconhecidos_count = predictions_cleaned.count('DESCONHECIDO')
print('Unknown commands: {:.2f}%'.format(100*desconhecidos_count/len(predictions_cleaned)))


Classification Report:
              precision    recall  f1-score   support

      DESCER       1.00      0.86      0.93        29
DESCONHECIDO       0.00      0.00      0.00         0
     DIREITA       1.00      0.93      0.96        29
    ESQUERDA       0.96      0.77      0.85        30
      FRENTE       0.60      0.91      0.72        33
       SUBIR       1.00      0.64      0.78        42
        TRAS       0.91      0.78      0.84        37

    accuracy                           0.81       200
   macro avg       0.78      0.70      0.73       200
weighted avg       0.91      0.81      0.84       200

Mean inference time:  1.2337602221965789
Std inference time:  0.0923626631249002
Unknown commands: 7.50%


In [24]:
predictions_t = [p.direction for p in predictions]
predictions_t


AttributeError: 'dict' object has no attribute 'direction'

In [32]:
predictions[0].direction

for i in range(len(predictions)):
    print(predictions[i].direction)
  

TRAS


AttributeError: 'dict' object has no attribute 'direction'

In [38]:
predictions


[Command(direction='TRAS'),
 {'direction': 'DESCONHECIDO'},
 Command(direction='TRAS'),
 Command(direction='DIREITA'),
 Command(direction='FRENTE'),
 Command(direction='TRAS'),
 Command(direction='DESCER'),
 Command(direction='ESQUERDA'),
 Command(direction='TRAS'),
 Command(direction='FRENTE'),
 {'direction': 'DESCONHECIDO'},
 Command(direction='TRAS'),
 Command(direction='DIREITA'),
 Command(direction='FRENTE'),
 {'direction': 'DESCONHECIDO'},
 Command(direction='FRENTE'),
 Command(direction='ESQUERDA'),
 Command(direction='TRAS'),
 Command(direction='ESQUERDA'),
 Command(direction='TRAS'),
 Command(direction='DIREITA'),
 Command(direction='ESQUERDA'),
 Command(direction='FRENTE'),
 Command(direction='FRENTE'),
 Command(direction='DIREITA'),
 Command(direction='ESQUERDA'),
 Command(direction='SUBIR'),
 Command(direction='SUBIR'),
 Command(direction='TRAS'),
 Command(direction='FRENTE'),
 Command(direction='FRENTE'),
 Command(direction='TRAS'),
 Command(direction='FRENTE'),
 {'directi