In [2]:
from flask import Flask, send_from_directory
from flask_cors import CORS
import logging
from flask_socketio import SocketIO
import numpy as np
import wave
import io
import threading
import queue
import time
import os
from datetime import datetime
import base64
import asyncio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import torchaudio
from flask import render_template
import librosa
import csv
from datetime import datetime

In [32]:
TIMING_CSV = "processing_times.csv"
if not os.path.exists(TIMING_CSV):
    with open(TIMING_CSV, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Timestamp", "Chunk Duration (sec)", "Processing Time (sec)"])

In [33]:
# Initialize Hugging Face model
processor = AutoProcessor.from_pretrained("MohamedRashad/Arabic-Whisper-CodeSwitching-Edition")
model = AutoModelForSpeechSeq2Seq.from_pretrained("MohamedRashad/Arabic-Whisper-CodeSwitching-Edition")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Add this after app initialization
VOICE_DIR = "voice"
if not os.path.exists(VOICE_DIR):
    os.makedirs(VOICE_DIR)


class AudioProcessor:
    def __init__(self):
        self.audio_queue = queue.Queue()
        self.processing_thread = None
        self.is_running = False
        self.voice_dir = VOICE_DIR
        self.last_process_time = time.time()
        self.buffer = np.array([], dtype=np.float32)
        self.loop = asyncio.new_event_loop()

    def stop_processing(self):
        """Stop the audio processing thread"""
        self.is_running = False
        if self.processing_thread is not None:
            self.processing_thread.join()
        if self.loop.is_running():
            self.loop.call_soon_threadsafe(self.loop.stop)
        logger.info("Audio processing thread stopped")

    async def process_with_huggingface(self, audio_file_path: str, chunk_duration: int = None) -> str:
        """Process audio file with Hugging Face model, log timing."""
        if not os.path.isfile(audio_file_path):
            raise ValueError(f"Invalid audio file path: {audio_file_path}")

        try:
            start_time = time.time()  # Start timing

            # Load and process the audio file
            audio_data, sample_rate = librosa.load(audio_file_path, sr=None)

            # Resample audio data to 16000 Hz if it's not already
            if sample_rate != 16000:
                logger.info(f"Resampling audio from {sample_rate} Hz to 16000 Hz")
                audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

            # Prepare the input for the model
            inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt")

            # Generate transcription with the model
            with torch.no_grad():
                generated_ids = model.generate(inputs["input_features"])

            # Decode the generated ids to text
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
            result = " ".join(transcription)

            # Measure processing time
            end_time = time.time()
            processing_time = end_time - start_time

            # Log timing to CSV
            with open(TIMING_CSV, 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow([datetime.now().isoformat(), chunk_duration, processing_time])

            logger.info(f"Processing Time: {processing_time:.2f} seconds for duration: {chunk_duration} sec")
            logger.info(f"Transcription result: {result}")

            return result
        except Exception as e:
            logger.exception(f"Unexpected error processing audio: {e}")
            return ""

    def process_audio_chunk(self, audio_data: np.ndarray, timestamp: int, mode: str):
        try:
            self.buffer = np.append(self.buffer, audio_data)
            chunk_samples = 44100 * 3  # 3 seconds worth of audio at 44.1kHz

            while len(self.buffer) >= chunk_samples:
                # Extract a chunk of audio data
                chunk = self.buffer[:chunk_samples]
                self.buffer = self.buffer[chunk_samples:]

                logger.info(f"Processing chunk of {len(chunk)} samples")

                # Save the chunk to a temporary file
                timestamp_str = datetime.fromtimestamp(timestamp / 1000).strftime('%Y%m%d_%H%M%S')
                filename = f"{timestamp_str}_{mode}.wav"
                filepath = os.path.join(self.voice_dir, filename)

                with wave.open(filepath, 'wb') as wav_file:
                    wav_file.setnchannels(1)
                    wav_file.setsampwidth(2)
                    wav_file.setframerate(44100)
                    audio_16bit = (chunk * 32767).astype(np.int16)
                    wav_file.writeframes(audio_16bit.tobytes())

                logger.info(f"Saved audio chunk: {filename}")

                # Transcribe the chunk
                future = asyncio.run_coroutine_threadsafe(
                    self.process_with_huggingface(filepath),
                    self.loop
                )
                transcription = future.result()

                # Emit the transcription
                if transcription:
                    socketio.emit('transcription', {
                        'text': transcription,
                        'timestamp': timestamp,
                        'mode': mode
                    })
                    logger.info(f"Chunk Transcription: {transcription}")

            # Update the last process time to throttle processing
            self.last_process_time = time.time()

        except Exception as e:
            logger.error(f"Error processing audio chunk: {e}", exc_info=True)
            self.buffer = np.array([], dtype=np.float32)


# Create audio processor instance
audio_processor = AudioProcessor()

2024-12-10 15:30:13,347 - urllib3.connectionpool - DEBUG - Resetting dropped connection: huggingface.co
2024-12-10 15:30:14,011 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /MohamedRashad/Arabic-Whisper-CodeSwitching-Edition/resolve/main/processor_config.json HTTP/11" 404 0
2024-12-10 15:30:14,200 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /MohamedRashad/Arabic-Whisper-CodeSwitching-Edition/resolve/main/preprocessor_config.json HTTP/11" 200 0
2024-12-10 15:30:14,376 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /MohamedRashad/Arabic-Whisper-CodeSwitching-Edition/resolve/main/preprocessor_config.json HTTP/11" 200 0
2024-12-10 15:30:14,536 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /MohamedRashad/Arabic-Whisper-CodeSwitching-Edition/resolve/main/preprocessor_config.json HTTP/11" 200 0
2024-12-10 15:30:14,796 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /MohamedRashad/Arab

In [35]:
app = Flask(__name__)
CORS(app)
socketio = SocketIO(app, cors_allowed_origins="*", ping_timeout=120)

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('audio_server.log')
    ]
)
logger = logging.getLogger(__name__)

# Add socket.io logging
logging.getLogger('socketio').setLevel(logging.DEBUG)
logging.getLogger('engineio').setLevel(logging.DEBUG)

@app.route('/voice/<path:filename>')
def serve_voice_files(filename):
    return send_from_directory('voice', filename)

@app.route('/transcribe/<filename>')
def transcribe_file(filename):
    file_path = os.path.join(VOICE_DIR, filename)
    if not os.path.isfile(file_path):
        return {"error": "File not found"}, 404
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        transcription = loop.run_until_complete(audio_processor.process_with_huggingface(file_path))
        loop.close()
        return transcription, 200, {'Content-Type': 'text/plain; charset=utf-8'}
    except Exception as e:
        logger.error(f"Error in transcription: {e}", exc_info=True)
        return {"error": str(e)}, 500


@socketio.on('connect')
def handle_connect():
    """Handle client connection"""
    logger.info('Client connected')
    if not audio_processor.is_running:
        audio_processor.start_processing()

@socketio.on('disconnect')
def handle_disconnect():
    """Handle client disconnection"""
    logger.info('Client disconnected')
    audio_processor.stop_processing()

@socketio.on('audio_data')
def handle_audio_data(data):
    """Handle incoming audio data"""
    try:
        audio_array = np.frombuffer(data['buffer'], dtype=np.float32)
        timestamp = data.get('timestamp', int(time.time() * 1000))
        mode = data.get('mode', 'both')

        if len(audio_array) > 0:
            logger.debug(f"Received audio chunk - Length: {len(audio_array)}, Mode: {mode}")
            audio_processor.process_audio_chunk(audio_array, timestamp, mode)
    except Exception as e:
        logger.error(f"Error handling audio data: {e}", exc_info=True)

In [None]:
if __name__ == '__main__':
    try:
        socketio.run(app, host='0.0.0.0', port=5000, debug=True, use_reloader=False, allow_unsafe_werkzeug=True)
    finally:
        audio_processor.stop_processing()
        print('Processing is done')



 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://10.24.18.6:5000
2024-12-10 15:48:26,704 - werkzeug - INFO - [33mPress CTRL+C to quit[0m
2024-12-10 15:49:46,797 - asyncio - DEBUG - Using selector: SelectSelector
2024-12-10 15:49:47,781 - __main__ - INFO - Resampling audio from 48000 Hz to 16000 Hz
2024-12-10 15:50:42,058 - __main__ - INFO - Processing Time: 55.02 seconds for duration: None sec
2024-12-10 15:50:42,061 - __main__ - INFO - Transcription result: مسائل full على مدريد والمدريد هو الأصدقاء والمتابعين المفضلين ل podcast المحلقين، اليوم معنا محمد يسري المدريدي العتيد، أنت عارف أنك أول سكاندرونيجي ل podcast؟
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\omar.mounir\AppData\Local\Programs\Python\Python39\lib\logging\__init__.py", line 1086, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\omar.mounir\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py", line 19, in encode
    re