##### Start

In [1]:
import os
import cv2
import torch
import time
import queue
import threading
import shutil
import sounddevice as sd
import face_recognition
import numpy as np
import streamlit as st
import scipy.io.wavfile as wav
from scipy.io.wavfile import write
import torch.nn.functional as F
import torchaudio
from datetime import datetime
from playsound import playsound
from speechbrain.inference.speaker import SpeakerRecognition

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import streamlit as st
import face_recognition
import cv2
import numpy as np
import os
import time
from datetime import datetime
import sounddevice as sd
import scipy.io.wavfile as wav
from speechbrain.pretrained import SpeakerRecognition

# --- CONFIGURATIONS ---
FACE_MATCH_THRESHOLD = 0.6
VOICE_MATCH_THRESHOLD = 0.75
KNOWN_FACES_DIR = "faces"
VOICE_EMBEDDINGS_DIR = "voice_embeddings"
ACCESS_LOG_FILE = "access_log.txt"

# --- INITIALIZE SESSION STATE ---
for key in ["face_verified", "user_name", "stored_voice_embedding"]:
    if key not in st.session_state:
        st.session_state[key] = None

# --- HELPER FUNCTIONS ---
def load_face_encodings():
    known_encodings = []
    known_names = []
    for name in os.listdir(KNOWN_FACES_DIR):
        img_path = os.path.join(KNOWN_FACES_DIR, name)
        img = face_recognition.load_image_file(img_path)
        encodings = face_recognition.face_encodings(img)
        if encodings:
            known_encodings.append(encodings[0])
            known_names.append(os.path.splitext(name)[0])
    return known_encodings, known_names

def verify_face_live(known_encodings, known_names):
    st.markdown("### Step 1: Face Verification")
    st.info("Please align your face with the camera and click the 'Capture' button.")
    run = st.button("Capture")
    if run:
        cap = cv2.VideoCapture(0)
        time.sleep(2)
        ret, frame = cap.read()
        cap.release()

        if not ret:
            st.error("Failed to capture image.")
            return None

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        face_locations = face_recognition.face_locations(rgb_frame)
        face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

        if len(face_encodings) == 0:
            st.warning("No face detected.")
            return None

        face_encoding = face_encodings[0]
        distances = face_recognition.face_distance(known_encodings, face_encoding)
        best_match_index = np.argmin(distances)

        if distances[best_match_index] < FACE_MATCH_THRESHOLD:
            name = known_names[best_match_index]
            st.success(f"Face verified as **{name}**.")
            return name
        else:
            st.error("Face not recognized.")
            return None

def load_voice_embedding_for_user(name):
    path = os.path.join(VOICE_EMBEDDINGS_DIR, f"{name}.wav")
    if not os.path.exists(path):
        return None
    return path

def record_voice(duration=4, fs=16000):
    st.info("Recording for 4 seconds...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()
    file_path = "temp_user_audio.wav"
    wav.write(file_path, fs, audio)
    return file_path

def verify_voice(user_audio_path, stored_audio_path):
    model = SpeakerRecognition.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/voice"
    )
    score, _ = model.verify_files(user_audio_path, stored_audio_path)
    return (score > VOICE_MATCH_THRESHOLD), score

def log_access(user, status, reason=None):
    with open(ACCESS_LOG_FILE, "a") as f:
        line = f"{datetime.now()}, {user}, {status}"
        if reason:
            line += f", {reason}"
        f.write(line + "\n")

# --- MAIN APP UI ---
st.title("Multi-modal Biometric Access Control")
st.markdown("Secure access using **Face + Voice Recognition**.")

known_face_encodings, known_face_names = load_face_encodings()

# Step 1: Face Verification
if st.session_state.face_verified is None:
    if st.button("Start Verification"):
        user = verify_face_live(known_face_encodings, known_face_names)
        if user:
            st.session_state.face_verified = True
            st.session_state.user_name = user
            voice_path = load_voice_embedding_for_user(user)
            if voice_path:
                st.session_state.stored_voice_embedding = voice_path
            else:
                st.warning("⚠️ No stored voice embedding found.")
                log_access(user, "denied", "Missing voice embedding")
                st.session_state.face_verified = None  # reset

# Step 2: Voice Verification
if st.session_state.face_verified and st.session_state.stored_voice_embedding:
    st.markdown("### Step 2: Voice Verification")
    if st.button("Record Voice"):
        user_voice = record_voice()
        verified, score = verify_voice(user_voice, st.session_state.stored_voice_embedding)
        if verified:
            st.success(f"✅ Voice verified. Cosine similarity: {score:.2f}")
            st.balloons()
            log_access(st.session_state.user_name, "access granted")

            # New Verification Button
            if st.button("Start New Verification"):
                for key in ["user_name", "stored_voice_embedding", "face_verified"]:
                    st.session_state[key] = None
                st.rerun()
        else:
            st.error(f"❌ Voice mismatch. Cosine similarity: {score:.2f}")
            log_access(st.session_state.user_name, "denied", "Voice mismatch")


In [8]:
%pip install subprocess

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement subprocess (from versions: none)
ERROR: No matching distribution found for subprocess

[notice] A new release of pip available: 22.2.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import pyttsx3

# Initialize the TTS engine
engine = pyttsx3.init()

# Set properties before adding things to say
engine.setProperty('rate', 150)    # Speed percent (can go over 100)
engine.setProperty('volume', 0.9)  # Volume 0-1

# Optionally, set a specific voice (male/female) if available
voices = engine.getProperty('voices')
# For male voice
engine.setProperty('voice', voices[1].id)
# For female voice, uncomment the following line
# engine.setProperty('voice', voices[1].id)

# Define the welcome message
welcome_text = "Authentication successful. Welcome to the conference."

# Save the speech to a file
engine.save_to_file(welcome_text, 'welcome.wav')

# Run the speech engine
engine.runAndWait()


##### Video Embedding

In [2]:
import face_recognition
import os
import numpy as np
import cv2

In [4]:
# Folder containing videos
video_folder = '../data/video'

# Directory to save extracted frames
frame_output_folder = 'frames_output'
os.makedirs(frame_output_folder, exist_ok=True)

# Directory to save embeddings
embeddings_folder = 'embeddings/video_new'
os.makedirs(embeddings_folder, exist_ok=True)

In [5]:
def extract_face_embeddings_from_video(video_path, video_name):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_count = 0
    embeddings = []
    frames_to_extract = 10
    frame_indices = np.linspace(0, total_frames - 1, frames_to_extract, dtype=int)

    while True:
        success, frame = cap.read()
        if not success or frame is None:
            break

        if frame_count in frame_indices:
            # Convert frame to RGB directly in memory
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Detect face locations and embeddings
            face_locations = face_recognition.face_locations(rgb_frame)
            if face_locations:
                face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)
                if face_encodings:
                    embeddings.append(face_encodings[0])
            else:
                print(f"No face detected at frame {frame_count}. Skipping...")

        frame_count += 1
        if len(embeddings) >= frames_to_extract:
            break

    cap.release()
    print(f"Extracted {len(embeddings)} embeddings from {video_name}.")

    # Save embeddings if any were found
    if embeddings:
        embeddings_array = np.array(embeddings)
        np.save(os.path.join(embeddings_folder, f"{video_name}_face_embeddings.npy"), embeddings_array)
        print(f"Embeddings saved for {video_name}.")
    else:
        print(f"[!] No embeddings found for {video_name}.")


In [6]:
# Loop through all videos in the folder
for filename in os.listdir(video_folder):
    print(filename)

abdullah.mp4
aminah.mp4
dembele.mp4
gentle.mp4
haypen.mp4
maimunah.mp4
meedo.mp4
mustapha.mp4
rukoyah.mp4


In [7]:
# Loop through all videos in the folder
for filename in os.listdir(video_folder):
    if filename.endswith('.mp4'):  # or any other video format you have
        video_path = os.path.join(video_folder, filename)
        video_name = os.path.splitext(filename)[0]  # Remove extension
        print(f"Processing video: {video_name}")
        extract_face_embeddings_from_video(video_path, video_name)

Processing video: abdullah
No face detected at frame 613. Skipping...
Extracted 9 embeddings from abdullah.
Embeddings saved for abdullah.
Processing video: aminah
Extracted 10 embeddings from aminah.
Embeddings saved for aminah.
Processing video: dembele
No face detected at frame 452. Skipping...
Extracted 9 embeddings from dembele.
Embeddings saved for dembele.
Processing video: gentle
No face detected at frame 0. Skipping...
No face detected at frame 100. Skipping...
No face detected at frame 200. Skipping...
No face detected at frame 301. Skipping...
No face detected at frame 502. Skipping...
No face detected at frame 602. Skipping...
Extracted 4 embeddings from gentle.
Embeddings saved for gentle.
Processing video: haypen
No face detected at frame 1427. Skipping...
Extracted 9 embeddings from haypen.
Embeddings saved for haypen.
Processing video: maimunah
Extracted 10 embeddings from maimunah.
Embeddings saved for maimunah.
Processing video: meedo
No face detected at frame 55. Ski

##### Audio Embedding

In [1]:
import os
import torch
import torchaudio
from torchaudio.transforms import Resample
from speechbrain.inference.speaker import SpeakerRecognition
from torchaudio.functional import vad
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths
AUDIO_DIR = '../data/audio'  # Directory where your audio files are located
EMBEDDING_DIR = 'embeddings/audio_new'  # Directory to save embeddings
MODEL_PATH = os.path.join("pretrained_models", "spkrec")  # Path to the pre-trained model

# Ensure the embedding directory exists
os.makedirs(EMBEDDING_DIR, exist_ok=True)

In [3]:
# === Configuration ===
BATCH_SIZE = 4
TARGET_SR = 16000  # Required by most models
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# === Load Model ===
speaker_model = SpeakerRecognition.from_hparams(
    source=MODEL_PATH,
    savedir=MODEL_PATH,
    run_opts={"device": DEVICE},
    use_auth_token=False
)

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


In [5]:
from torchaudio.transforms import Vad

# Define global VAD transform
vad_transform = Vad(sample_rate=16000)

def apply_vad(waveform, sample_rate):
    if sample_rate != 16000:
        waveform = Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return vad_transform(waveform)


In [6]:
# === Preprocess Audio (resample, VAD) ===
def preprocess_audio(file_path):
    waveform, sr = torchaudio.load(file_path)

    # Convert stereo to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Apply VAD to remove silence
    waveform = apply_vad(waveform, sr)

    # Resample if needed
    if sr != TARGET_SR:
        waveform = Resample(sr, TARGET_SR)(waveform)

    return waveform.squeeze(0)

In [None]:
# # Function to save audio embedding
# def save_audio_embedding(name, embedding):
#     embedding_filename = os.path.join(EMBEDDING_DIR, f"{name}_voice.pt")
#     torch.save(embedding, embedding_filename)  # Save the embedding
#     print(f"Embedding saved for {name} as {embedding_filename}")

In [7]:
import torch.nn.functional as F

def l2_normalize(embedding):
    return F.normalize(embedding, p=2, dim=0)

# === Extract embedding ===
def extract_embedding(waveform):
    embedding = speaker_model.encode_batch(waveform.unsqueeze(0)).squeeze().detach()
    embedding = l2_normalize(embedding)
    return embedding

# === Save embedding ===
def save_embedding(name, embedding):
    file_path = os.path.join(EMBEDDING_DIR, f"{name}_voice.pt")
    torch.save(embedding, file_path)
    print(f"✅ Saved: {file_path}")

In [8]:
audio_files = [
    f for f in os.listdir(AUDIO_DIR)
    if f.endswith('.wav')
]

print(f"📂 Found {len(audio_files)} audio files.")

for i in tqdm(range(0, len(audio_files), BATCH_SIZE)):
    batch_files = audio_files[i:i + BATCH_SIZE]
    for filename in batch_files:
        name = os.path.splitext(filename)[0]
        path = os.path.join(AUDIO_DIR, filename)

        try:
            waveform = preprocess_audio(path)
            embedding = extract_embedding(waveform)
            save_embedding(name, embedding)
        except Exception as e:
            print(f"[!] Failed for {name}: audio too short or noisy")
            if "size should be less than the corresponding input dimension" in str(e).lower():
                print(f"Audio has very low volume or silence .\n"
            f"Possible reasons: microphone off, silence, or heavy noise cancellation.\n"
            f"✔️ Please re-record with clear speech and minimal background noise."  
            )


📂 Found 9 audio files.


  0%|          | 0/3 [00:00<?, ?it/s]

✅ Saved: embeddings/audio_new\abdullah_voice.pt
✅ Saved: embeddings/audio_new\aminah_voice.pt
✅ Saved: embeddings/audio_new\dembele_voice.pt


 33%|███▎      | 1/3 [00:17<00:35, 17.66s/it]

✅ Saved: embeddings/audio_new\gentle_voice.pt
✅ Saved: embeddings/audio_new\haypen_voice.pt
[!] Failed for maimunah: audio too short or noisy
Audio has very low volume or silence .
Possible reasons: microphone off, silence, or heavy noise cancellation.
✔️ Please re-record with clear speech and minimal background noise.
[!] Failed for meedo: audio too short or noisy
Audio has very low volume or silence .
Possible reasons: microphone off, silence, or heavy noise cancellation.
✔️ Please re-record with clear speech and minimal background noise.


 67%|██████▋   | 2/3 [00:26<00:12, 12.47s/it]

✅ Saved: embeddings/audio_new\mustapha_voice.pt


100%|██████████| 3/3 [00:29<00:00,  9.98s/it]

✅ Saved: embeddings/audio_new\rukoyah_voice.pt





In [22]:
for filename in batch_files:
    name = os.path.splitext(filename)[0]
    path = os.path.join(AUDIO_DIR, filename)
    print(path)

../data/audio\rukoyah.wav


In [35]:
waveform, _ = torchaudio.load('..\data\\audio\maimunah.wav')
print(waveform.abs().mean())  # If it's near zero → silent


tensor(0.0014)


In [25]:
waveform, _ = torchaudio.load('mustapha.wav')
print(waveform.abs().mean())  # If it's near zero → silent


tensor(0.0294)


##### Audio Reg

In [2]:
import os
import cv2
import torch
import time
import queue
import sounddevice as sd
import face_recognition
import numpy as np
import streamlit as st
import scipy.io.wavfile as wav
from scipy.io.wavfile import write
import torch.nn.functional as F
import torchaudio
from datetime import datetime
from speechbrain.inference.speaker import SpeakerRecognition

In [4]:
# --- Constants ---
FACE_EMBED_DIR = "embeddings/video_new"
VOICE_EMBED_DIR = "embeddings/audio_new"
MODEL_PATH = os.path.join("pretrained_models", "spkrec")
AUDIO_TMP_PATH = "temp_user_audio.wav"
LOG_FILE = "access_logs.log"
FACE_MATCH_THRESHOLD = 0.4
VOICE_MATCH_THRESHOLD = 0.60
RECORD_DURATION = 10
SAMPLE_RATE = 16000

In [11]:
# === Configuration ===
BATCH_SIZE = 4
TARGET_SR = 16000  # Required by most models
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


In [12]:
# === Load Model ===
speaker_model = SpeakerRecognition.from_hparams(
    source=MODEL_PATH,
    savedir=MODEL_PATH,
    run_opts={"device": DEVICE},
    use_auth_token=False
)


AttributeError: module 'streamlit' has no attribute 'text'

In [None]:
def load_voice_embedding_for_user(name):
    path = os.path.join(VOICE_EMBED_DIR, f"{name}_voice.pt")
    if not os.path.exists(path):
        return None
    return path

def record_voice(seconds=10, fs=16000):
    st.info("🎤 Please speak after clicking the record button")
    audio_q = queue.Queue()

    def callback(indata, frames, time, status):
        audio_q.put(indata.copy())

    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        audio_data = []
        with st.spinner("Recording..."):
            for _ in range(int(fs / 1024 * seconds)):
                audio_data.append(audio_q.get())
        audio_np = np.concatenate(audio_data, axis=0)
        wav.write(AUDIO_TMP_PATH, fs, audio_np)
    st.success("✅ Recording complete")
    return AUDIO_TMP_PATH

def process_audio(file_path):
    waveform, sr = torchaudio.load(file_path)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze(0)  # Make sure shape is [time] not [1, time]
    embedding = speaker_model.encode_batch(waveform.unsqueeze(0)).squeeze().detach()
    return embedding

def verify_voice(processed_audio, stored_embedding):
    score = F.cosine_similarity(processed_audio, stored_embedding, dim=0).item()
    return score > VOICE_MATCH_THRESHOLD, score

In [8]:
def process_audio(file_path):
    waveform, sr = torchaudio.load(file_path)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze(0)  # Make sure shape is [time] not [1, time]
    embedding = speaker_model.encode_batch(waveform.unsqueeze(0)).squeeze().detach()
    return embedding

In [9]:
def verify_voice(processed_audio, stored_embedding):
    score = F.cosine_similarity(processed_audio, stored_embedding, dim=0).item()
    return score > VOICE_MATCH_THRESHOLD, score

In [None]:

# --- HELPER FUNCTIONS ---


def load_voice_embedding_for_user(name):
    path = os.path.join(VOICE_EMBEDDINGS_DIR, f"{name}.wav")
    if not os.path.exists(path):
        return None
    return path

def record_voice(duration=4, fs=16000):
    st.info("Recording for 4 seconds...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()
    file_path = "temp_user_audio.wav"
    wav.write(file_path, fs, audio)
    return file_path

def verify_voice(user_audio_path, stored_audio_path):
    model = SpeakerRecognition.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/voice"
    )
    score, _ = model.verify_files(user_audio_path, stored_audio_path)
    return (score > VOICE_MATCH_THRESHOLD), score

def log_access(user, status, reason=None):
    with open(ACCESS_LOG_FILE, "a") as f:
        line = f"{datetime.now()}, {user}, {status}"
        if reason:
            line += f", {reason}"
        f.write(line + "\n")

# --- MAIN APP UI ---
st.title("Multi-modal Biometric Access Control")
st.markdown("Secure access using **Face + Voice Recognition**.")

known_face_encodings, known_face_names = load_face_encodings()

# Step 1: Face Verification
if st.session_state.face_verified is None:
    if st.button("Start Verification"):
        user = verify_face_live(known_face_encodings, known_face_names)
        if user:
            st.session_state.face_verified = True
            st.session_state.user_name = user
            voice_path = load_voice_embedding_for_user(user)
            if voice_path:
                st.session_state.stored_voice_embedding = voice_path
            else:
                st.warning("⚠️ No stored voice embedding found.")
                log_access(user, "denied", "Missing voice embedding")
                st.session_state.face_verified = None  # reset

# Step 2: Voice Verification
if st.session_state.face_verified and st.session_state.stored_voice_embedding:
    st.markdown("### Step 2: Voice Verification")
    if st.button("Record Voice"):
        user_voice = record_voice()
        verified, score = verify_voice(user_voice, st.session_state.stored_voice_embedding)
        if verified:
            st.success(f"✅ Voice verified. Cosine similarity: {score:.2f}")
            st.balloons()
            log_access(st.session_state.user_name, "access granted")

            # New Verification Button
            if st.button("Start New Verification"):
                for key in ["user_name", "stored_voice_embedding", "face_verified"]:
                    st.session_state[key] = None
                st.rerun()
        else:
            st.error(f"❌ Voice mismatch. Cosine similarity: {score:.2f}")
            log_access(st.session_state.user_name, "denied", "Voice mismatch")


##### Frames Exraction

In [32]:
import cv2
import os

# Path to the input video
video_dir = '../data/video'

# Folder to save extracted frames
output_folder = 'frames'
os.makedirs(output_folder, exist_ok=True)



def extract_frames(video_path, video_name):
    # Open the video file
    cap = cv2.VideoCapture(video_path)
    frames_dir = os.path.join(output_folder, video_name)
    os.makedirs(frames_dir, exist_ok=True)

    frame_count = 0

    while True:
        success, frame = cap.read()

        if not success:
            print("Finished reading the video.")
            break

        if frame is None:
            print(f"[!] Frame {frame_count} is None, skipping...")
            continue

        
        # Convert from BGR (OpenCV default) to RGB
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Save frame as image (optional)
        frame_filename = os.path.join(frames_dir, f"{video_name}_{frame_count:04d}.jpg")
        cv2.imwrite(frame_filename, cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR))  # Save back as BGR for normal viewing

        frame_count += 1

    cap.release()
    print(f"Extracted {frame_count} frames.")


In [33]:
# Path to the input video
video_dir ='../data/video'
video_files = [f for f in os.listdir(video_dir) if f.endswith('.mp4')]  # Get all .wav files
for video_filename in video_files:
    video_path = os.path.join(video_dir, video_filename)
    video_name = os.path.splitext(video_filename)[0]  # Get the base name without extension
    print(f"Processing: {video_name}")
    extract_frames(video_path, video_name)

Processing: abdullah
Finished reading the video.
Extracted 1106 frames.
Processing: aminah
Finished reading the video.
Extracted 524 frames.
Processing: dembele
Finished reading the video.
Extracted 510 frames.
Processing: gentle
Finished reading the video.
Extracted 905 frames.
Processing: haypen
Finished reading the video.
Extracted 1836 frames.
Processing: maimunah
Finished reading the video.
Extracted 555 frames.
Processing: meedo
Finished reading the video.
Extracted 168 frames.
Processing: rukoyah
Finished reading the video.
Extracted 597 frames.


##### Face Recognition

In [1]:
import os
import numpy as np
import cv2
import face_recognition
from sklearn.metrics.pairwise import cosine_similarity


In [15]:
# Paths
known_embeddings_dir = "embeddings/video/"
new_faces_dir = "new_f/"

In [3]:
known_embeddings = []
known_names = []

for filename in os.listdir(known_embeddings_dir):
    if filename.endswith(".npy"):
        name = os.path.splitext(filename)[0]
        embeddings = np.load(os.path.join(known_embeddings_dir, filename))

        # Skip if embeddings are empty
        if embeddings.ndim != 2 or embeddings.shape[1] != 128:
            print(f"⚠️ Skipping {filename} — unexpected shape: {embeddings.shape}")
            continue

        # Mean pooling: average over all embeddings
        mean_embedding = np.mean(embeddings, axis=0)

        known_embeddings.append(mean_embedding)
        known_names.append(name)

known_embeddings = np.array(known_embeddings)


In [4]:
known_embeddings.shape

(8, 128)

In [16]:
# Loop through new face images
for img_file in os.listdir(new_faces_dir):
    if img_file.lower().endswith(('.jpg', '.png', '.jpeg')):
        img_path = os.path.join(new_faces_dir, img_file)
        image = face_recognition.load_image_file(img_path)

        face_locations = face_recognition.face_locations(image)
        face_encodings = face_recognition.face_encodings(image, face_locations)

        print(f"\nProcessing: {img_file}")
        if len(face_encodings) == 0:
            print("❌ No face found.")
            continue

        for encoding in face_encodings:
            # Compute cosine similarity with each known face
            encoding = encoding.reshape(1, -1)
            similarities = cosine_similarity(encoding, known_embeddings)[0]
            best_match_index = np.argmax(similarities)
            best_score = similarities[best_match_index]

            if best_score > 0.6:  # threshold can be adjusted
                print(f"✅ Access Granted: {known_names[best_match_index]} (Score: {best_score:.2f})")
            else:
                print(f"❌ Access Denied: Unknown face (Score: {best_score:.2f})")



Processing: user_01.jpg


  if ismodule(module) and hasattr(module, '__file__'):


ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 128 while Y.shape[1] == 192

##### Voice Recognition

In [1]:
import os
import torch
import numpy as np
from speechbrain.inference.speaker import SpeakerRecognition
import torchaudio
from pathlib import Path



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Paths
EMBEDDING_DIR = 'embeddings/audio'  # Folder containing enrolled .pt files
MODEL_PATH = os.path.join("pretrained_models", "spkrec")  # Local path to SpeechBrain model
AUDIO_PATH = 'new_voices/user01.wav'  # New audio input for recognition

# Load speaker recognition model
speaker_model = SpeakerRecognition.from_hparams(
    source=MODEL_PATH,
    savedir=MODEL_PATH,
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    use_auth_token=False
)

# Step 1: Load known voice embeddings
known_embeddings = []
known_names = []

for filename in os.listdir(EMBEDDING_DIR):
    if filename.endswith('_voice.pt'):
        name = filename.replace('_voice.pt', '')
        embedding = torch.load(os.path.join(EMBEDDING_DIR, filename))
        known_embeddings.append(embedding)
        known_names.append(name)



In [5]:
# Step 2: Process new audio
def process_audio(file_path):
    waveform, sr = torchaudio.load(file_path)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze(0)
    embedding = speaker_model.encode_batch(waveform.unsqueeze(0)).squeeze().detach()
    return embedding

new_embedding = process_audio(AUDIO_PATH)

# Step 3: Compare with known embeddings using cosine similarity
def cosine_similarity(a, b):
    return torch.nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).item()

scores = [cosine_similarity(new_embedding, known_emb) for known_emb in known_embeddings]
best_index = int(np.argmax(scores))
best_score = scores[best_index]

# Step 4: Threshold and result
threshold = 0.75  # Adjust this based on testing
if best_score >= threshold:
    print(f"Match found: {known_names[best_index]} (score: {best_score:.2f})")
else:
    print(f"No match found. Closest match: {known_names[best_index]} (score: {best_score:.2f})")


Match found: dembele (score: 0.76)


##### Realtime Voice Recognition

In [5]:
import os
import torch
import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write
from speechbrain.inference.speaker import SpeakerRecognition
import torchaudio
import tempfile

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Paths
EMBEDDING_DIR = 'embeddings/audio'
MODEL_PATH = os.path.join("pretrained_models", "spkrec")

# Initialize model
speaker_model = SpeakerRecognition.from_hparams(
    source=MODEL_PATH,
    savedir=MODEL_PATH,
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    use_auth_token=False
)

# Load enrolled embeddings
known_embeddings = []
known_names = []

for filename in os.listdir(EMBEDDING_DIR):
    if filename.endswith('_voice.pt'):
        name = filename.replace('_voice.pt', '')
        embedding = torch.load(os.path.join(EMBEDDING_DIR, filename))
        known_embeddings.append(embedding)
        known_names.append(name)

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


In [7]:
known_names

['abdullah',
 'aminah',
 'dembele',
 'gentle',
 'haypen',
 'maimunah',
 'meedo',
 'rukoyah']

In [None]:
for filename in os.listdir(EMBEDDING_DIR):
    if filename.endswith('_voice.pt'):
        name = filename.replace('_voice.pt', '')
        if name == verified_user:
            print(name)
        

abdullah
aminah
dembele
gentle
haypen
maimunah
meedo
rukoyah


In [15]:
# Record voice
def record_voice(filename='temp.wav', duration=15, fs=16000):
    print(f"\n🎤 Recording for {duration} seconds...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='int16')
    sd.wait()
    write(filename, fs, audio)
    print("✅ Recording complete.")
    return filename

# Process audio to embedding
def process_audio(file_path):
    waveform, sr = torchaudio.load(file_path)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze(0)
    embedding = speaker_model.encode_batch(waveform.unsqueeze(0)).squeeze().detach()
    return embedding

# Cosine similarity
def cosine_similarity(a, b):
    return torch.nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).item()


In [21]:
# Main logic
def recognize_from_microphone():
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmpfile:
        wav_path = record_voice(tmpfile.name)
    
    new_embedding = process_audio(wav_path)
    scores = [cosine_similarity(new_embedding, emb) for emb in known_embeddings]
    
    best_index = int(np.argmax(scores))
    best_score = scores[best_index]

    threshold = 0.75
    if best_score >= threshold:
        print(f"\n✅ Match: {known_names[best_index]} (score: {best_score:.2f})")
    else:
        print(f"\n❌ Unknown speaker. Closest match: {known_names[best_index]} (score: {best_score:.2f})")

In [20]:
recognize_from_microphone()


🎤 Recording for 15 seconds...
✅ Recording complete.

✅ Match: aminah (score: 0.61)


##### Testing Webcam

In [31]:
import cv2

# Open the webcam (0 = default camera)
video_capture = cv2.VideoCapture(0)

# Check if camera opened successfully
if not video_capture.isOpened():
    print("[ERROR] Could not open webcam.")
    exit()

In [32]:
# Set resolution (optional)
video_capture.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
video_capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# Define the codec and create VideoWriter object for .mp4
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # 'mp4v' for .mp4
out = cv2.VideoWriter('output.mp4', fourcc, 20.0, (640, 480))

print("[INFO] Press 'q' to stop recording...")

while True:
    ret, frame = video_capture.read()
    if not ret:
        print("[ERROR] Failed to read frame.")
        break

    # Write the frame to output file
    out.write(frame)

    # Display the resulting frame (can remove if you want speed)
    cv2.imshow('Recording...', frame)

    # Press 'q' to quit
    if cv2.waitKey(1) & 0xFF == ord('q'):
        print("[INFO] Stopping capture...")
        break

# Release everything
video_capture.release()
out.release()
cv2.destroyAllWindows()


[INFO] Press 'q' to stop recording...
[ERROR] Failed to read frame.


##### Realtime Face Recognition

In [22]:
import face_recognition
import cv2
import numpy as np
import os
import time



In [35]:
# === Load known face encodings ===
known_embeddings_dir = "embeddings/video"
known_encodings = []
known_names = []

for filename in os.listdir(known_embeddings_dir):
    if filename.endswith(".npy"):
        name = os.path.splitext(filename)[0]
        embedding = np.load(os.path.join(known_embeddings_dir, filename))
        known_encodings.append(embedding)
        known_names.append(name)

print(f"[INFO] Loaded {len(known_encodings)} known face embeddings.")


[INFO] Loaded 8 known face embeddings.


In [36]:

# === Start webcam capture ===
video_capture = cv2.VideoCapture(0, cv2.CAP_DSHOW)

if not video_capture.isOpened():
    print("[ERROR] Could not open webcam.")
    exit()

# === Prepare video writer to save the capture ===
frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = 10  # You can adjust this

# Use mp4v codec for .mp4
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter("output.mp4", fourcc, fps, (frame_width, frame_height))

print("[INFO] Starting real-time recognition. Press 'Ctrl+C' to stop.")

try:
    while True:
        ret, frame = video_capture.read()
        if not ret:
            print("[!] Failed to grab frame.")
            break

        # Save the frame to the video file
        out.write(frame)

        # Resize and convert to RGB for processing
        small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
        rgb_small_frame = small_frame[:, :, ::-1]

        # Detect face locations and encodings
        face_locations = face_recognition.face_locations(rgb_small_frame)
        face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)

        for face_encoding in face_encodings:
            name = "Unknown"
            best_score = float('inf')
            best_match = None

            for known_embedding, known_name in zip(known_encodings, known_names):
                distances = face_recognition.face_distance(known_embedding, face_encoding)
                min_distance = np.min(distances)
                if min_distance < best_score:
                    best_score = min_distance
                    best_match = known_name

            if best_score < 0.6:
                print(f"✅ Face recognized: {best_match}")
            else:
                print("❌ Unknown face detected")

        time.sleep(0.5)  # Control recognition speed

except KeyboardInterrupt:
    print("\n[INFO] Recognition stopped by user.")

# === Cleanup ===
video_capture.release()
out.release()


[INFO] Starting real-time recognition. Press 'Ctrl+C' to stop.
[!] Failed to grab frame.


In [33]:
import cv2

# === Start webcam capture ===
video_capture = cv2.VideoCapture(0, cv2.CAP_DSHOW)

if not video_capture.isOpened():
    print("[ERROR] Could not open webcam.")
    exit()

print("[INFO] Starting real-time recognition. Press 'Ctrl+C' to stop.")


[INFO] Starting real-time recognition. Press 'Ctrl+C' to stop.


In [34]:
# Save to 'output.avi' using MJPG codec at 20 FPS, 640x480 resolution
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('outputww.mp4', fourcc, 20.0, (640, 480))


In [27]:

try:
    while True:
        ret, frame = video_capture.read()
        if not ret:
            print("[!] Failed to grab frame.")
            break

        # Resize and convert to RGB for processing
        small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
        rgb_small_frame = small_frame[:, :, ::-1]

        # Detect face locations and encodings
        face_locations = face_recognition.face_locations(rgb_small_frame)
        face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)

        for face_encoding in face_encodings:
            name = "Unknown"
            best_score = float('inf')
            best_match = None

            for known_embedding, known_name in zip(known_encodings, known_names):
                distances = face_recognition.face_distance(known_embedding, face_encoding)
                min_distance = np.min(distances)
                if min_distance < best_score:
                    best_score = min_distance
                    best_match = known_name

            # Threshold for recognition
            if best_score < 0.6:
                print(f"✅ Face recognized: {best_match}")
            else:
                print("❌ Unknown face detected")

        time.sleep(0.5)  # Adjust as needed

except KeyboardInterrupt:
    print("\n[INFO] Recognition stopped by user.")

video_capture.release()

[!] Failed to grab frame.


##### Annotated Recognition Script

In [None]:
# import os
# import cv2
# import numpy as np
# import face_recognition
# from sklearn.metrics.pairwise import cosine_similarity

# # Paths
# known_embeddings_dir = "embeddings/video"
# new_faces_dir = "new_faces/"

# # Load known face embeddings
# known_embeddings = []
# known_names = []

# for filename in os.listdir(known_embeddings_dir):
#     if filename.endswith(".npy"):
#         name = os.path.splitext(filename)[0]
#         embeddings = np.load(os.path.join(known_embeddings_dir, filename))

#         if embeddings.ndim != 2 or embeddings.shape[1] != 128:
#             print(f"⚠️ Skipping {filename} — unexpected shape: {embeddings.shape}")
#             continue

#         mean_embedding = np.mean(embeddings, axis=0)
#         known_embeddings.append(mean_embedding)
#         known_names.append(name)

# known_embeddings = np.array(known_embeddings)



In [None]:
# # Process images in new_faces directory
# for img_file in os.listdir(new_faces_dir):
#     if img_file.lower().endswith((".jpg", ".jpeg", ".png")):
#         img_path = os.path.join(new_faces_dir, img_file)
#         image = face_recognition.load_image_file(img_path)

#         face_locations = face_recognition.face_locations(image)
#         face_encodings = face_recognition.face_encodings(image, face_locations)

#         # Convert to BGR for OpenCV display
#         image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

#         for (top, right, bottom, left), face_encoding in zip(face_locations, face_encodings):
#             encoding = face_encoding.reshape(1, -1)
#             similarities = cosine_similarity(encoding, known_embeddings)[0]

#             best_match_index = np.argmax(similarities)
#             best_score = similarities[best_match_index]

#             threshold = 0.5  # cosine similarity threshold
#             if best_score >= threshold:
#                 name = known_names[best_match_index]
#             else:
#                 name = "Unknown"

#             # Draw box and label
#             cv2.rectangle(image_bgr, (left, top), (right, bottom), (0, 255, 0), 2)
#             cv2.putText(image_bgr, name, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

#         # Show the annotated image
#         cv2.imshow("Recognition", image_bgr)
#         cv2.waitKey(0)

# cv2.destroyAllWindows()


##### Others

In [24]:
def clean_name(raw_name):
    name = raw_name.replace('_face_embeddings', '').replace('_voice', '')
    return name.replace('_', ' ').title()


In [26]:
clean_name('nhdfhjhHHFGfjhd').lower()

'nhdfhjhhhfgfjhd'

In [27]:
'Nae'.lower()

'nae'

In [28]:
a = 'Mgahegiih'
a.lower()

'mgahegiih'

In [34]:
for filename in os.listdir(EMBEDDING_DIR):
    if filename.endswith('_voice.pt'):
        name = filename.replace('_voice.pt', '')
        if name == 'gentle':
            embedding = torch.load(os.path.join(EMBEDDING_DIR, filename))
            print(name)
            

gentle


In [37]:
known_voice_embeddings = []
known_voice_names = []

for filename in os.listdir(EMBEDDING_DIR):
    if filename.endswith('_voice.pt'):
        name = filename.replace('_voice.pt', '')
        if name == 'gentle':
            embedding = torch.load(os.path.join(EMBEDDING_DIR, filename))
            known_voice_embeddings.append(embedding)
            known_voice_names.append(name)

In [43]:
known_voice_embeddings = None
known_voice_names = []

for filename in os.listdir(EMBEDDING_DIR):
    if filename.endswith('_voice.pt'):
        name = filename.replace('_voice.pt', '')
        embedding = torch.load(os.path.join(EMBEDDING_DIR, filename))
        known_voice_embeddings = embedding
        known_voice_names.append(name)

In [44]:
len(known_voice_embeddings)

192

In [45]:
type(known_voice_embeddings)

torch.Tensor

In [46]:
print(known_voice_embeddings)

tensor([ 1.3625e+00,  9.8698e+00,  4.0903e+01,  3.8804e+00,  2.4257e+01,
         2.9141e+01,  5.5740e+01,  4.1073e+01, -7.5835e+00,  5.2313e+00,
         2.4843e+01,  2.2655e+01,  4.4369e+01,  2.0786e+01,  5.7986e+00,
        -3.8210e+01, -1.2403e+01,  2.1943e+01,  1.8472e+01,  1.7310e+01,
        -4.5351e+01, -2.6754e+01, -2.8166e+00, -6.5284e+00,  4.3536e+01,
        -1.1374e+01, -1.5355e+01,  1.7753e+01,  1.7495e+01, -5.2646e+01,
        -2.8624e+01,  1.2043e+01,  1.1832e+01,  4.1213e+01, -2.0512e+01,
         2.4462e+01,  1.0415e+01,  5.1659e+00, -2.0074e+01, -1.7107e+00,
        -1.7746e+01, -3.1406e+00, -3.9071e+01, -2.1866e+01,  1.5681e+01,
         9.5509e-01,  6.3962e+00, -5.5488e+00, -7.3807e+00,  7.1852e+00,
         2.4002e+01, -4.0397e+01, -1.2042e+01,  3.8726e+01, -2.6279e+01,
         1.4765e+01,  4.1586e+00,  2.5375e+01,  8.0072e+00,  1.9197e+01,
        -2.0759e+01,  3.0002e+01, -1.6630e+01, -3.3087e+01,  5.6518e+01,
        -1.0584e+01, -1.7382e+01, -7.3311e+00, -4.5