In [None]:
# !pip install transformers librosa torch ipywidgets pyspellchecker noisereduce nltk spacy
# !python -m spacy download en_core_web_sm
# !pip install langdetect
# !pip install pydub

In [None]:
import numpy as np
import librosa
from transformers import BartForConditionalGeneration, BartTokenizer, SeamlessM4TModel, AutoProcessor
import noisereduce as nr
from google.colab import files
import time
from pydub import AudioSegment, effects

In [None]:
# Speech-to-text: SeamlessM4T (medium). The model and its processor are both
# loaded from `model_name_stt` so the two can never point at different
# checkpoints.
model_name_stt = "facebook/hf-seamless-m4t-medium"
model_stt = SeamlessM4TModel.from_pretrained(model_name_stt)
processor = AutoProcessor.from_pretrained(model_name_stt)

# Summarization: BART (large, CNN checkpoint) and its matching tokenizer.
model_name_summarization = "facebook/bart-large-cnn"
model_summarization = BartForConditionalGeneration.from_pretrained(model_name_summarization)
tokenizer_summarization = BartTokenizer.from_pretrained(model_name_summarization)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def enhance_audio(file_path):
    """Normalize the loudness of an audio file and save the result as WAV.

    The normalized copy is written next to the input as
    ``enhanced_<stem>.wav``. The prefix is applied to the file name only (not
    to the whole path), and the extension is ``.wav`` because that is the
    format actually exported.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        Path of the normalized WAV file, or ``file_path`` unchanged when
        anything goes wrong (the error is printed, never raised).
    """
    import os

    try:
        # Build the output path from the directory and the bare stem so that
        # "uploads/talk.mp3" becomes "uploads/enhanced_talk.wav" instead of the
        # invalid "enhanced_uploads/talk.mp3".
        directory, base_name = os.path.split(file_path)
        stem, _ = os.path.splitext(base_name)
        enhanced_file_path = os.path.join(directory, f"enhanced_{stem}.wav")

        audio = AudioSegment.from_file(file_path)
        normalized_audio = effects.normalize(audio)
        normalized_audio.export(enhanced_file_path, format="wav")
        return enhanced_file_path
    except Exception as e:
        # Best effort: fall back to the original file rather than aborting.
        print(f"Error enhancing audio: {e}")
        return file_path

def calculate_summary_quality(generated_summary, reference_summary):
    """Percentage of distinct reference words that also occur in the summary.

    Both texts are split on whitespace and reduced to sets of unique tokens;
    tokens are compared exactly as written (no case folding or punctuation
    stripping).

    Args:
        generated_summary: Summary text to score.
        reference_summary: Text to score against.

    Returns:
        A value between 0 and 100, or 0 when the reference has no words.
    """
    generated_vocab = set(generated_summary.split())
    reference_vocab = set(reference_summary.split())
    if not reference_vocab:
        return 0
    shared_vocab = generated_vocab & reference_vocab
    return (len(shared_vocab) / len(reference_vocab)) * 100

def reduce_noise(audio, sr):
    """Run noisereduce over a signal with this notebook's fixed settings.

    Args:
        audio: Audio samples, forwarded as ``y``.
        sr: Sampling rate of ``audio``.

    Returns:
        Whatever ``nr.reduce_noise`` returns for the given signal.
    """
    # NOTE(review): `n_std_thresh_stationary` looks like a stationary-mode
    # setting; confirm it has any effect when `stationary` is left at its
    # default.
    settings = {
        "n_std_thresh_stationary": 1.5,
        "prop_decrease": 0.8,
    }
    return nr.reduce_noise(y=audio, sr=sr, **settings)

def dynamic_chunk_duration(audio_length, sr=16000, max_chunks=10):
    """Choose a chunk length in seconds that yields at most ``max_chunks`` chunks.

    Args:
        audio_length: Number of samples in the signal.
        sr: Sampling rate in samples per second.
        max_chunks: Upper bound on the number of chunks to aim for.

    Returns:
        Chunk duration in whole seconds, never less than 30.
    """
    min_chunk_seconds = 30
    seconds_total = audio_length / sr
    seconds_per_chunk = np.ceil(seconds_total / max_chunks)
    # Only exceed the 30-second floor when the audio is long enough to need it.
    if seconds_per_chunk > min_chunk_seconds:
        return int(seconds_per_chunk)
    return min_chunk_seconds

def split_audio(audio_input, sr=16000, chunk_duration=30):
    """Cut a signal into consecutive slices of ``chunk_duration`` seconds.

    Args:
        audio_input: Sliceable sequence of samples.
        sr: Sampling rate in samples per second.
        chunk_duration: Length of each slice in seconds.

    Returns:
        List of slices in order; the last one holds whatever samples remain
        and may be shorter than the others. Empty input gives an empty list.
    """
    samples_per_chunk = sr * chunk_duration
    n_samples = len(audio_input)
    pieces = []
    for offset in range(0, n_samples, samples_per_chunk):
        pieces.append(audio_input[offset:offset + samples_per_chunk])
    return pieces

def transcribe_audio(chunks, target_lang="eng", sampling_rate=16000):
    """Transcribe audio chunks with the speech-to-text model and join the text.

    Args:
        chunks: Iterable of audio sample arrays, one per chunk.
        target_lang: Language code passed to the model as ``tgt_lang``.
        sampling_rate: Sampling rate of the chunks, forwarded to the
            processor. Defaults to 16000, the rate this notebook loads audio
            at.

    Returns:
        The non-empty chunk transcriptions joined by single spaces; an empty
        string when there are no chunks or nothing was decoded.
    """
    transcriptions = []
    for chunk in chunks:
        # Pass the sampling rate explicitly: without it the processor warns
        # that a mismatch with the rate it expects would go undetected.
        audio_inputs = processor(audios=chunk, sampling_rate=sampling_rate, return_tensors="pt")
        # generate_speech=False asks for text tokens only.
        output_tokens = model_stt.generate(**audio_inputs, tgt_lang=target_lang, generate_speech=False)
        decoded_output = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
        if decoded_output:
            transcriptions.append(decoded_output)
    return " ".join(transcriptions)

def adjust_summary_length(transcription_length):
    """Pick a pair of summary length limits from the transcription length.

    Args:
        transcription_length: Size of the transcription (unit not shown in
            this file; presumably characters or tokens).

    Returns:
        ``(200, 50)`` when the length is above 2000, otherwise ``(150, 30)``.
        Presumably ``(max_length, min_length)``, matching the pairs used by
        the summarization loop; confirm before wiring it in.
    """
    if transcription_length > 2000:
        return (200, 50)
    return (150, 30)

def transcribe_and_summarize_uploaded_file(file_content, filename, reference_summary=None):
    """Transcribe an uploaded audio file and summarize it at two lengths.

    Steps: write the bytes to ``filename``, normalize the audio, load it at
    16 kHz, denoise, split into chunks, transcribe, then generate one summary
    per entry in ``summary_lengths``.

    Args:
        file_content: Raw bytes of the uploaded audio file.
        filename: Path the bytes are written to before processing.
        reference_summary: Optional reference text; when truthy each summary
            is scored against it with ``calculate_summary_quality``.

    Returns:
        ``(full_transcription, summaries, summary_qualities, processing_time)``.
        ``summary_qualities`` holds one score per summary, or ``None`` entries
        when no reference was given. When nothing was transcribed, both lists
        are empty. On any exception the tuple is
        ``("An error occurred: ...", [], [], 0)``; nothing is raised.
    """
    start_time = time.time()
    try:
        with open(filename, 'wb') as f:
            f.write(file_content)

        enhanced_filename = enhance_audio(filename)

        audio_input, sr = librosa.load(enhanced_filename, sr=16000)
        audio_input = reduce_noise(audio_input, sr)
        chunk_duration = dynamic_chunk_duration(len(audio_input), sr)
        chunks = split_audio(audio_input, sr, chunk_duration)
        full_transcription = transcribe_audio(chunks)

        # Nothing to summarize: skip the summarizer rather than let it
        # generate text from an empty prompt.
        if not full_transcription.strip():
            return full_transcription, [], [], time.time() - start_time

        # Tokenize once; the encoded input is identical for every summary
        # length. No "summarize: " task prefix is added: that is a T5
        # convention, and BART would read it as part of the article.
        # Input beyond 1024 tokens is truncated.
        input_ids = tokenizer_summarization.encode(
            full_transcription, return_tensors="pt", max_length=1024, truncation=True)

        # (max_length, min_length) for the short and the long summary.
        summary_lengths = [(150, 25), (350, 50)]
        summaries = []
        summary_qualities = []
        for max_length, min_length in summary_lengths:
            summary_ids = model_summarization.generate(
                input_ids,
                num_beams=4, max_length=max_length, min_length=min_length, early_stopping=True)
            summary = tokenizer_summarization.decode(summary_ids[0], skip_special_tokens=True)
            summaries.append(summary)
            if reference_summary:
                summary_qualities.append(calculate_summary_quality(summary, reference_summary))
            else:
                summary_qualities.append(None)

        processing_time = time.time() - start_time
        return full_transcription, summaries, summary_qualities, processing_time
    except Exception as e:
        # Callers print the first element, so the error is reported in place
        # of the transcription.
        return f"An error occurred: {e}", [], [], 0

In [None]:
# Ask for audio files through the Colab upload widget, then run the
# transcribe-and-summarize pipeline on each one and print the results.
uploaded_files = files.upload()

for filename in uploaded_files:
    file_content = uploaded_files[filename]
    print(f"Processing file: {filename}")
    # No reference text is supplied here, so every quality entry is None and
    # the quality lines below are skipped.
    reference_summary = None
    pipeline_result = transcribe_and_summarize_uploaded_file(file_content, filename, reference_summary)
    transcription, summaries, summary_qualities, processing_time = pipeline_result
    print("Transcription:", transcription)
    for i, (summary, quality) in enumerate(zip(summaries, summary_qualities), start=1):
        print(f"Summary {i}: {summary}")
        if quality is None:
            continue
        print("This Summary Base On Reference similar to Outcome Or Not, However Please Check it Manually")
        print(f"Quality {i}: {quality}%")
    print(f"Processing Time: {processing_time} seconds")

Saving PODCAST_.mp3 to PODCAST_ (7).mp3
Processing file: PODCAST_ (7).mp3


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Transcription: Today, the Thai meteorological agency is presenting the story of the thunderstorm, the thunderstorm, the thunderstorm, the thunderstorm, the thunderstorm, the thunderstorm, the thunderstorm, the thunderstorm, the thunderstorm, the thunderstorm, the thunderstorm, the thunderstorm. The southwesterly winds, which began in May, will cause heavy rainfall in the southwestern part of the country, while the southwestern part will have heavy rainfall in the month of May.
Summary 1: The southwesterly winds, which began in May, will cause heavy rainfall in the southwestern part of the country. The southwestern part will have heavy rain in the month of May.
Summary 2: The southwesterly winds, which began in May, will cause heavy rainfall in the southwestern part of the country. The southwestern part will have heavy rain in the month of May. The Thai meteorological agency is presenting the story of the thunderstorm.
Processing Time: 37.405519247055054 seconds
