In [2]:
import functools
import os

import google.generativeai as genai
import librosa
import numpy as np
import pyaudio
import torch
from dotenv import load_dotenv
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [3]:
# Merge variables from a local .env file into the process environment, then
# give the Gemini client whatever GOOGLE_API_KEY resolves to (None when unset).
load_dotenv()
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

In [4]:
# Load the Hugging Face model and processor for Uzbek speech-to-text
@functools.lru_cache(maxsize=None)
def load_uzbek_stt_model(model_name="oyqiz/uzbek_stt"):
    """Return the ``(processor, model)`` pair for a Wav2Vec2 CTC checkpoint.

    The result is memoized per ``model_name``, so repeated calls hand back the
    same objects instead of reloading the checkpoint on every transcription.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the processor and the model.

    Returns:
        A tuple ``(processor, model)``.
    """
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    return processor, model

In [5]:
# Function to perform speech-to-text using 'oyqiz/uzbek_stt'
def uzbek_speech_to_text(audio_data, sr=16000):
    """Transcribe a mono waveform to text.

    Args:
        audio_data: 1-D sequence/array of audio samples.
        sr: Sample rate of ``audio_data`` in Hz. Defaults to 16000, the rate
            the audio is resampled to before being given to the processor.

    Returns:
        The decoded transcription string, or ``""`` when ``audio_data`` is
        ``None`` or empty.
    """
    target_sr = 16000

    # Nothing to transcribe: skip loading the model altogether.
    if audio_data is None or len(audio_data) == 0:
        return ""

    processor, model = load_uzbek_stt_model()

    # Resample only when the incoming rate differs from the target rate.
    if sr != target_sr:
        audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=target_sr)

    # State the rate explicitly so the processor does not have to assume it.
    input_values = processor(
        audio_data,
        sampling_rate=target_sr,
        return_tensors="pt",
        padding="longest",
    ).input_values

    # Perform inference (Uzbek speech-to-text) without tracking gradients.
    with torch.no_grad():
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)

    # Convert the predicted IDs to transcribed text (single-item batch).
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription

In [6]:
# Use Gemini to summarize text
def generate_gemini_content(text, prompt, model_name="gemini-1.5-flash"):
    """Ask a Gemini model to summarize ``text`` and judge its sentiment.

    The request is built as: ``prompt``, a newline, the fixed Uzbek
    instructions (bullet-point summary plus positive/negative verdict), a
    blank line, then ``text``. The blank line keeps the text from being glued
    onto the last instruction sentence.

    Args:
        text: The text to analyze.
        prompt: Leading instruction placed before the fixed instructions.
        model_name: Gemini model identifier passed to ``GenerativeModel``.

    Returns:
        The ``text`` attribute of the model response.
    """
    model = genai.GenerativeModel(model_name=model_name)
    full_prompt = (
        f"{prompt}\n"
        "Matnni qisqartirib, asosiy fikrlarni bullet point ko'rinishida taqdim qiling. "
        "Matnni ijobiy yoki salbiy ekanligini ham aniqlang."
    )
    response = model.generate_content(f"{full_prompt}\n\n{text}")
    return response.text

In [7]:
# Transcribe an audio file, then summarize the transcription with Gemini
def process_uploaded_audio(file_path):
    """Load ``file_path`` at 16 kHz, transcribe it, and print text and summary.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        None. The transcription and the summary are printed.
    """
    # librosa.load returns (samples, rate); only the samples are needed here.
    loaded = librosa.load(file_path, sr=16000)
    waveform = loaded[0]

    recognized_text = uzbek_speech_to_text(waveform)
    print("Transcribed Text: ", recognized_text)

    gemini_summary = generate_gemini_content(
        recognized_text,
        "Quyidagi matnni tahlil qilib, qisqacha mazmunini chiqarib bering:",
    )
    print("Summary: ", gemini_summary)

In [8]:
# Function to capture microphone input
def capture_microphone_input(duration=5, sr=16000, frames_per_buffer=1024):
    """Record mono 16-bit audio from the default input device.

    Args:
        duration: Recording length in seconds.
        sr: Sample rate in Hz used to open the stream.
        frames_per_buffer: Number of frames read per chunk.

    Returns:
        A tuple ``(audio_data, sr)`` where ``audio_data`` is a float32 array
        of the int16 samples divided by the int16 maximum. The array is empty
        when the requested duration yields no complete chunk.
    """
    # Only whole chunks are read, so the recording may be slightly shorter
    # than `duration` when sr * duration is not a multiple of the chunk size.
    num_chunks = int(sr / frames_per_buffer * duration)
    frames = []

    p = pyaudio.PyAudio()
    try:
        # Setup recording parameters
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=sr,
                        input=True,
                        frames_per_buffer=frames_per_buffer)
        try:
            print("Recording...")
            for _ in range(num_chunks):
                data = stream.read(frames_per_buffer)
                frames.append(np.frombuffer(data, dtype=np.int16))
            print("Finished recording.")
        finally:
            # Release the stream even when a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # np.concatenate rejects an empty list, so handle "nothing recorded" here.
    if not frames:
        return np.zeros(0, dtype=np.float32), sr

    # Convert the list of arrays to a single numpy array
    audio_data = np.concatenate(frames).astype(np.float32) / np.iinfo(np.int16).max

    return audio_data, sr

In [None]:
# Main part of the code for interactive input.
# Asks which input method to use, then transcribes and summarizes the audio.
option = input("Choose input method (1 for file upload, 2 for microphone): ").strip()

if option == "1":
    file_path = "../Operator-va-menejerlar-uchun-o-quv-kursi-kirish-va-chiqish.mp3"
    process_uploaded_audio(file_path)

elif option == "2":
    raw_duration = input("Enter the recording duration in seconds: ")
    try:
        duration = int(raw_duration)
    except ValueError:
        duration = None

    # A non-numeric or non-positive duration cannot produce a recording.
    if duration is None or duration <= 0:
        print(f"Invalid duration {raw_duration!r}: enter a positive whole number of seconds.")
    else:
        # `sr` is the sample rate the recording was captured at.
        audio_data, sr = capture_microphone_input(duration)
        uzbek_text = uzbek_speech_to_text(audio_data)
        print("Transcribed Text:", uzbek_text)

        # Perform summarization with Gemini
        prompt = "Quyidagi matnni tahlil qilib, qisqacha mazmunini chiqarib bering:"
        summary = generate_gemini_content(uzbek_text, prompt)
        print("Summary:", summary)

else:
    # Previously an unrecognized choice did nothing at all; say so instead.
    print(f"Unknown option {option!r}: enter 1 or 2.")