In [1]:
import torch
import cv2
import torchaudio
import torchaudio.transforms as transforms
import numpy as np
from transformers import pipeline, Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
from pathlib import Path
import onnxruntime as ort
import streamlit as st
import faiss
import langchain
import subprocess

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# YOLOv5 configuration
# Nothing is loaded into memory in this cell: it only records where the
# YOLOv5 command-line detection script and the custom-trained weights live.
yolo_detect_script = "yolov5/detect.py"
yolo_model_path = "best.pt"
print(f"Using YOLOv5 detect.py script with model {yolo_model_path}...")

Using YOLOv5 detect.py script with model best.pt...


In [3]:
# Working directories: `uploads` and `outputs`, both relative to the current
# working directory.
uploads_dir = Path("uploads")
outputs_dir = Path("outputs")
for working_dir in (uploads_dir, outputs_dir):
    # exist_ok=True keeps this cell re-runnable when the folder already exists.
    working_dir.mkdir(exist_ok=True)

In [4]:
def detect_fall(image_path):
    """Run the YOLOv5 ``detect.py`` script on one image.

    The script is started as a child process with the custom weights in
    ``yolo_model_path`` and is asked to write its results under
    ``<outputs_dir>/fall-detection`` (``--project`` / ``--name`` / ``--exist-ok``).

    Args:
        image_path: Location of the input image, as ``str`` or ``os.PathLike``.

    Returns:
        str: ``<outputs_dir>/fall-detection/<input file name>``. This is where
        the annotated image is expected to be; the function does not check
        that the file was actually written.

    Raises:
        subprocess.CalledProcessError: If ``detect.py`` exits with a non-zero
            status (``check=True``).
    """
    import sys  # local import: only this function needs the interpreter path

    run_name = "fall-detection"
    command = [
        # Use the interpreter that runs this kernel so detect.py sees the same
        # installed packages; a bare "python" is looked up on PATH and may be
        # a different environment or missing altogether.
        sys.executable, yolo_detect_script,
        "--weights", yolo_model_path,
        # str() so Path inputs work both for the log line below and the call.
        "--source", str(image_path),
        "--save-txt",
        # Derived from outputs_dir so it cannot drift from the returned path.
        "--project", str(outputs_dir),
        "--name", run_name,
        "--exist-ok",
    ]
    print(f"Running YOLOv5 detection: {' '.join(command)}")
    subprocess.run(command, check=True)
    return str(outputs_dir / run_name / Path(image_path).name)

In [5]:
# Speech emotion recognition: load the pre-trained checkpoint and its
# matching feature extractor.
emotion_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
print(f"Loading Speech Emotion Detection model: {emotion_model_name}...")
# NOTE(review): the load log printed by this cell reports the checkpoint's
# classifier.dense.* / classifier.output.* weights as unused and
# classifier.weight / classifier.bias (among others) as newly initialized.
# That means the classification head is not restored from the checkpoint, so
# predictions should not be trusted until this mismatch is resolved.
emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained(emotion_model_name)
# Explicitly keep every floating-point weight in float32.
emotion_model = emotion_model.to(dtype=torch.float32)
feature_extractor = AutoFeatureExtractor.from_pretrained(emotion_model_name)
print("Speech Emotion Detection model loaded successfully!")

Loading Speech Emotion Detection model: ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition...


Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

Speech Emotion Detection model loaded successfully!


In [6]:
# Emotion labels mapping (class index -> label name).
# The order has to be the one the checkpoint was trained with, so it is read
# from the model config instead of being typed by hand: a hand-written list in
# a different order silently mislabels every prediction.
# NOTE(review): the previous hand-written order is kept only as a fallback;
# verify it against the model card before relying on it.
_FALLBACK_EMOTIONS = ['neutral', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprise', 'calm']
_id2label = getattr(emotion_model.config, "id2label", None) or {}
# key=int tolerates configs whose keys are still strings ("0", "1", ...).
_config_labels = [str(_id2label[idx]) for idx in sorted(_id2label, key=int)]
# Configs without real label names expose placeholders such as "LABEL_0";
# those carry no information, so the fallback list is used in that case.
_has_real_labels = bool(_config_labels) and not all(
    label.upper().startswith("LABEL_") for label in _config_labels
)
emotions = _config_labels if _has_real_labels else _FALLBACK_EMOTIONS

In [7]:
def predict_emotion(audio_path):
    """Predict the emotion label for one audio file.

    The file is loaded with torchaudio, down-mixed to a single channel,
    resampled to 16 kHz when its native rate differs, passed through
    ``feature_extractor`` and ``emotion_model``, and the highest-scoring class
    index is translated to a name through the module-level ``emotions`` list.

    Args:
        audio_path: Location of the audio file (anything ``torchaudio.load``
            accepts).

    Returns:
        str: The predicted emotion name, or ``"Unknown"`` when the predicted
        class index has no entry in ``emotions``.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    print(f"Processing audio file for emotion detection: {audio_path}")
    waveform, sample_rate = torchaudio.load(audio_path)
    if waveform.numel() == 0:
        raise ValueError(f"Audio file contains no samples: {audio_path}")

    # torchaudio returns (channels, frames) by default. Averaging over the
    # channel axis gives a 1-D signal for any channel count; squeeze(0) only
    # did that for mono files and left stereo input 2-D.
    waveform = waveform.to(dtype=torch.float32)
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Resample audio if needed
    target_sample_rate = 16000
    if sample_rate != target_sample_rate:
        resampler = transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)

    # Passing sampling_rate lets the extractor verify the audio matches the
    # rate it was configured for instead of silently accepting any input.
    inputs = feature_extractor(
        waveform.numpy(),
        sampling_rate=target_sample_rate,
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        logits = emotion_model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    predicted_emotion = emotions[predicted_class] if predicted_class < len(emotions) else "Unknown"
    print(f"Emotion prediction complete! Predicted class: {predicted_class} ({predicted_emotion})")
    return predicted_emotion


In [8]:
# Model optimization helpers (ONNX export and quantization; no pruning is
# implemented in this cell).
print("Applying model optimizations...")
def quantize_yolo():
    """Export the custom YOLOv5 model to ONNX and write an 8-bit quantized copy.

    Steps:
        1. Load the weights in ``yolo_model_path`` through ``torch.hub``.
        2. Export the model to ``yolo_fp32.onnx`` using a random
           ``(1, 3, 640, 640)`` input and opset 11.
        3. Apply ONNX Runtime dynamic quantization and write the result to
           ``yolo_quantized.onnx``.

    Previously the float32 export itself was saved as ``yolo_quantized.onnx``
    and no quantization step ran.

    Both files are written to the current working directory. Returns ``None``.

    NOTE(review): ONNX Runtime's documentation recommends static quantization
    (with calibration data) for convolutional models; dynamic quantization is
    used here because no calibration set is available in this notebook. Check
    accuracy and latency of the quantized file before deploying it.
    """
    # Local import: the quantization tooling is only needed by this function.
    from onnxruntime.quantization import QuantType, quantize_dynamic

    print("Quantizing YOLOv5 model using ONNX...")
    fp32_onnx_path = "yolo_fp32.onnx"
    quantized_onnx_path = "yolo_quantized.onnx"

    detector = torch.hub.load('ultralytics/yolov5', 'custom', path=yolo_model_path)
    dummy_input = torch.randn(1, 3, 640, 640)
    torch.onnx.export(detector, dummy_input, fp32_onnx_path, opset_version=11)

    # Weights are stored as unsigned 8-bit integers.
    quantize_dynamic(fp32_onnx_path, quantized_onnx_path, weight_type=QuantType.QUInt8)
    print("YOLOv5 quantization completed!")

def quantize_wav2vec():
    """Cast ``emotion_model`` to float32 and log progress.

    Despite the name and the printed messages, no quantization is applied:
    the only operation on the model is an in-place dtype cast of the
    module-level ``emotion_model``. Returns ``None``.
    """
    print("Quantizing Wav2Vec2 model...")
    # float32 throughout, to avoid dtype mismatches between weights and inputs.
    target_dtype = torch.float32
    emotion_model.to(dtype=target_dtype)
    print("Wav2Vec2 quantization completed!")

Applying model optimizations...


In [9]:
# Run both optimization helpers defined in the previous cell.
# quantize_yolo() writes yolo_quantized.onnx to the working directory;
# quantize_wav2vec() casts the already-loaded emotion_model to float32.
quantize_yolo()
quantize_wav2vec()

Quantizing YOLOv5 model using ONNX...


Using cache found in /Users/nanxuan/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-3-5 Python-3.9.21 torch-2.6.0 CPU

Fusing layers... 
Model summary: 322 layers, 86180143 parameters, 0 gradients, 203.8 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im)
  if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:


YOLOv5 quantization completed!
Quantizing Wav2Vec2 model...
Wav2Vec2 quantization completed!


In [11]:
# Launch the Streamlit UI defined in app.py through the shell.
# NOTE(review): `streamlit run` starts a web server, so this cell presumably
# keeps running until the process is stopped; confirm before using "Run All".
!streamlit run app.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.31.99.212:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
Using YOLOv5 detect.py script with model best.pt...
Loading Speech Emotion Detection model: ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition...
Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining 