In [None]:
# !pip install transformers librosa torch ipywidgets pyspellchecker noisereduce nltk spacy
# !python -m spacy download en_core_web_sm
# !pip install langdetect
# !pip install pydub

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting noisereduce
  Downloading noisereduce-3.0.2-py3-none-any.whl (22 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━

In [None]:
import numpy as np
import librosa
from transformers import BartForConditionalGeneration, BartTokenizer, SeamlessM4TModel, AutoProcessor
import noisereduce as nr
from google.colab import files
import time
from pydub import AudioSegment, effects
from IPython.display import display, HTML

In [None]:
# Speech-to-text: the model and its processor are loaded from the same
# checkpoint name, so changing the checkpoint only requires editing one line.
model_name_stt = "facebook/hf-seamless-m4t-medium"
model_stt = SeamlessM4TModel.from_pretrained(model_name_stt)
processor = AutoProcessor.from_pretrained(model_name_stt)

# Summarization: BART model and its matching tokenizer, again from one name.
model_name_summarization = "facebook/bart-large-cnn"
model_summarization = BartForConditionalGeneration.from_pretrained(model_name_summarization)
tokenizer_summarization = BartTokenizer.from_pretrained(model_name_summarization)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def enhance_audio(file_path):
    """Normalize an audio file with pydub and save the result as a WAV file.

    The output is written next to the input as ``enhanced_<stem>.wav``. Only
    the base name is prefixed, so a path that contains a directory part keeps
    pointing into that directory, and the extension matches the WAV data that
    is actually exported.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The path of the normalized WAV file, or ``file_path`` unchanged when
        anything goes wrong (best effort: the error is printed, not raised).
    """
    import os  # local import: the notebook's import cell does not provide os

    try:
        audio = AudioSegment.from_file(file_path)
        normalized_audio = effects.normalize(audio)

        directory, base_name = os.path.split(file_path)
        stem = os.path.splitext(base_name)[0]
        enhanced_file_path = os.path.join(directory, f"enhanced_{stem}.wav")

        normalized_audio.export(enhanced_file_path, format="wav")
        return enhanced_file_path
    except Exception as e:
        # Deliberate fallback: the caller continues with the original file.
        print(f"Error enhancing audio: {e}")
        return file_path

def calculate_quality(generated_text, reference_text):
    """Score generated text by how many distinct reference words it contains.

    Both texts are split on whitespace and reduced to sets of distinct words;
    the comparison is exact, so case and attached punctuation matter.

    Args:
        generated_text: Text produced by the pipeline.
        reference_text: Text to compare against.

    Returns:
        The percentage (0-100) of distinct reference words that also appear in
        the generated text, or 0 when the reference contains no words.
    """
    produced_vocab = set(generated_text.split())
    expected_vocab = set(reference_text.split())

    expected_count = len(expected_vocab)
    if expected_count <= 0:
        return 0

    shared_count = len(produced_vocab & expected_vocab)
    return (shared_count / expected_count) * 100

def reduce_noise(audio, sr):
    """Run noisereduce over a waveform with this notebook's fixed settings.

    Args:
        audio: Waveform handed to ``nr.reduce_noise`` as ``y``.
        sr: Sample rate of ``audio``.

    Returns:
        Whatever ``nr.reduce_noise`` returns for the given waveform.
    """
    denoise_settings = {
        "n_std_thresh_stationary": 1.5,
        "prop_decrease": 0.8,
    }
    cleaned = nr.reduce_noise(y=audio, sr=sr, **denoise_settings)
    return cleaned

def dynamic_chunk_duration(audio_length, sr=16000, max_chunks=10):
    """Choose a chunk length, in whole seconds, for splitting a recording.

    The recording's duration is divided by ``max_chunks`` and rounded up, but
    the result is never allowed to drop below 30 seconds.

    Args:
        audio_length: Number of samples in the recording.
        sr: Sample rate used to convert samples to seconds.
        max_chunks: Number of chunks the duration is divided by.

    Returns:
        int: Chunk duration in seconds (at least 30).
    """
    seconds_total = audio_length / sr
    seconds_per_chunk = np.ceil(seconds_total / max_chunks)
    if seconds_per_chunk > 30:
        return int(seconds_per_chunk)
    return 30

def split_audio(audio_input, sr=16000, chunk_duration=30):
    """Cut a waveform into consecutive chunks of ``chunk_duration`` seconds.

    Args:
        audio_input: Sliceable sequence of samples.
        sr: Sample rate, used to convert the duration into a sample count.
        chunk_duration: Length of each chunk in seconds.

    Returns:
        list: Slices of ``audio_input`` in order; the last one may be shorter
        than the others, and the list is empty for empty input.
    """
    samples_per_chunk = sr * chunk_duration
    pieces = []
    for start in range(0, len(audio_input), samples_per_chunk):
        stop = start + samples_per_chunk
        pieces.append(audio_input[start:stop])
    return pieces

def transcribe_audio(chunks, target_lang="eng", sampling_rate=16000):
    """Transcribe audio chunks one by one and join the resulting text.

    Each chunk is converted to model inputs by the module-level ``processor``
    and decoded to text by ``model_stt`` (text output only, no speech).

    Args:
        chunks: Iterable of waveforms, one per chunk.
        target_lang: Language code passed to ``model_stt.generate`` as
            ``tgt_lang``.
        sampling_rate: Sample rate of the chunks in Hz. It is forwarded to the
            processor so the rate is stated explicitly instead of assumed; the
            default matches the 16 kHz rate the rest of this notebook uses.

    Returns:
        str: The non-empty chunk transcriptions joined by single spaces, or an
        empty string when nothing was transcribed.
    """
    transcriptions = []
    for chunk in chunks:
        # An empty chunk has nothing to transcribe; skip it rather than
        # handing the processor a zero-length waveform.
        if len(chunk) == 0:
            continue

        audio_inputs = processor(audios=chunk, sampling_rate=sampling_rate, return_tensors="pt")
        output_tokens = model_stt.generate(**audio_inputs, tgt_lang=target_lang, generate_speech=False)
        # The first element of the output holds the token sequences; take the
        # first sequence and decode it to text.
        decoded_output = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
        if decoded_output:
            transcriptions.append(decoded_output)
    return " ".join(transcriptions)

def adjust_summary_length(transcription_length):
    """Pick a pair of summary length limits from the transcription length.

    Args:
        transcription_length: Size of the transcription; compared against a
            threshold of 2000.

    Returns:
        tuple: ``(200, 50)`` above the threshold, otherwise ``(150, 30)``.
        Presumably ``(max_length, min_length)``, mirroring the pairs used in
        ``transcribe_and_summarize_uploaded_file``; nothing in the visible
        notebook calls this helper, so confirm before relying on it.
    """
    if transcription_length > 2000:
        return (200, 50)
    return (150, 30)

def transcribe_and_summarize_uploaded_file(file_content, filename, reference_transcript=None, reference_summary=None):
    """Run the full pipeline on one uploaded audio file.

    Steps: write the bytes to ``filename``, normalize the audio, load it at
    16 kHz, denoise it, split it into chunks, transcribe the chunks, then
    produce one summary per entry in ``summary_lengths``. When reference texts
    are supplied, word-overlap quality scores are computed against them.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Name the bytes are written to in the working directory.
        reference_transcript: Optional text to score the transcription against.
        reference_summary: Optional text to score each summary against.

    Returns:
        tuple: ``(full_transcription, summaries, summary_qualities,
        transcript_quality, processing_time)``. ``summary_qualities`` holds
        ``None`` entries and ``transcript_quality`` is ``None`` when the
        matching reference is missing. If any step raises, the tuple is
        ``("An error occurred: ...", [], [], None, elapsed_seconds)`` so the
        notebook keeps running and the message is shown as the transcription.
    """
    start_time = time.time()
    try:
        with open(filename, 'wb') as f:
            f.write(file_content)

        enhanced_filename = enhance_audio(filename)

        audio_input, sr = librosa.load(enhanced_filename, sr=16000)
        audio_input = reduce_noise(audio_input, sr)
        chunk_duration = dynamic_chunk_duration(len(audio_input), sr)
        chunks = split_audio(audio_input, sr, chunk_duration)
        full_transcription = transcribe_audio(chunks)

        # Tokenize once: the input is identical for every summary length.
        # BART is not prompted with a task prefix, so the transcription is
        # encoded as-is; a "summarize: " prefix would just become part of the
        # document being summarized. Inputs beyond 1024 tokens are truncated,
        # so very long transcripts are only partially summarized.
        input_ids = tokenizer_summarization.encode(
            full_transcription, return_tensors="pt", max_length=1024, truncation=True)

        # (max_length, min_length) pairs: one short and one long summary.
        summary_lengths = [(100, 25), (350, 50)]
        summaries = []
        summary_qualities = []
        for max_length, min_length in summary_lengths:
            summary_ids = model_summarization.generate(
                input_ids,
                num_beams=4, max_length=max_length, min_length=min_length, early_stopping=True)
            summary = tokenizer_summarization.decode(summary_ids[0], skip_special_tokens=True)
            summaries.append(summary)
            if reference_summary:
                summary_qualities.append(calculate_quality(summary, reference_summary))
            else:
                summary_qualities.append(None)

        transcript_quality = calculate_quality(full_transcription, reference_transcript) if reference_transcript else None

        processing_time = time.time() - start_time
        return full_transcription, summaries, summary_qualities, transcript_quality, processing_time
    except Exception as e:
        # Best effort by design: report the failure in place of a transcript,
        # along with the time actually spent before the failure.
        return f"An error occurred: {e}", [], [], None, time.time() - start_time

In [None]:
def display_output(transcription, summaries, summary_qualities, processing_time, reference_transcript=None, transcript_quality=None, reference_summary=None):
    """Render the pipeline results as a bordered HTML card in the notebook.

    All text (transcription, references, summaries) is HTML-escaped before it
    is placed in the markup, so characters such as ``<`` or ``&`` show up
    literally instead of breaking or injecting markup. Numeric values are
    shown with two decimals.

    Args:
        transcription: Transcribed text (or an error message).
        summaries: Summary strings, listed in order.
        summary_qualities: Quality score per summary; ``None`` hides the score.
        processing_time: Elapsed time in seconds.
        reference_transcript: Optional reference text; shown when truthy.
        transcript_quality: Optional score, shown only together with
            ``reference_transcript`` and only when not ``None``.
        reference_summary: Optional reference summary; shown when truthy.

    Returns:
        None. The card is shown via ``display(HTML(...))``.
    """
    from html import escape  # local import: not provided by the import cell

    def as_text(value):
        # Text nodes only need <, > and & neutralized; quotes stay readable.
        return escape(str(value), quote=False)

    def as_number(value):
        # Round real numbers for display; pass anything else through as text.
        if isinstance(value, (int, float)):
            return f"{value:.2f}"
        return as_text(value)

    parts = [
        '<div style="border: 2px solid #444; border-radius: 10px; padding: 10px; margin-bottom: 10px;">',
        "<h3>Transcription:</h3>",
        f"<p>{as_text(transcription)}</p>",
    ]

    if reference_transcript:
        parts.extend([
            "<hr>",
            "<h3>Reference Transcript:</h3>",
            f"<p>{as_text(reference_transcript)}</p>",
        ])
        if transcript_quality is not None:
            parts.append(f"<p><strong>Transcript Quality:</strong> {as_number(transcript_quality)}%</p>")

    if reference_summary:
        parts.extend([
            "<hr>",
            "<h3>Reference Summary:</h3>",
            f"<p>{as_text(reference_summary)}</p>",
        ])

    parts.extend(["<hr>", "<h3>Summaries:</h3>", "<ul>"])
    for i, (summary, quality) in enumerate(zip(summaries, summary_qualities), start=1):
        item = f"<li><strong>Summary {i}:</strong> {as_text(summary)}<br>"
        if quality is not None:
            item += f"<strong>Quality:</strong> {as_number(quality)}%"
        item += "</li>"
        parts.append(item)

    parts.extend([
        "</ul>",
        "<hr>",
        f"<p><strong>Processing Time:</strong> {as_number(processing_time)} seconds</p>",
        "</div>",
    ])
    display(HTML("\n".join(parts)))

## For demo (scored against a reference transcript and summary):

In [None]:
# Reference texts for the demo recording. They do not depend on the uploaded
# file, so they are defined once here instead of being rebuilt on every pass
# through the loop below.
reference_transcript = """
gold markets continue to look very
strong gold has rallied rather
significantly during the course of the
trading session on Thursday as it looks
like people want to have exposure
heading into the Easter holiday weekend
that being said any shortterm pullback
that you get is a buying opportunity
there will be some limited electronic
trading uh heading into the weekend so
do keep that in mind but it will be thin
volume ultimately this is a market that
I do think continues to go much higher
but um you know whether or not we have
the momentum to take out the shooting
star
from a little over a week ago then that
remains to be seen but we certainly look
like we're going to try to do it with
that being said I like the idea of
buying dips that's been the play all
along in gold and at this point in time
one would have to say you would
anticipate more of the same I think that
the
$2,150 level is likely to continue to
see quite a bit of support so pay
attention to that on any pullback if we
even get there I do think at this point
in time it's very likely that the gold
market could go looking to the $2500
level although we don't necessarily have
to get there right away gold does tend
to be impulsive though and it could be
much quicker than you
think we have worked off quite a bit of
froth from that shot higher and now it
looks like we're ready to continue going
higher from a longer term standpoint I
have no interest in shorting gold quite
frankly I think gold will probably be
one of the better trading Vehicles this
year as there are geopolitical concerns
interest rates being cut and of course
just a general concern of the politics
and the geopolitical situation in the
Middle East and for that matter in
Eastern Europe if you like the video
give me a thumbs up and make sure to
subscribe to the channel
"""

reference_summary = "The speaker, Chris Lewis, is bullish on gold and believes that the gold market will continue to go higher. He thinks that any short-term pullback is a buying opportunity. He also thinks that the $2,150 level is likely to continue to see quite a bit of support. His long-term target is $2,500. He believes that gold will be one of the better trading vehicles this year due to geopolitical concerns, interest rates being cut, and the general concern of the politics and the geopolitical situation in the Middle East and Eastern Europe."

# Colab upload dialog: returns a mapping of file name -> file bytes.
uploaded_files = files.upload()

for filename, file_content in uploaded_files.items():
    print(f"Processing file: {filename}")

    transcription, summaries, summary_qualities, transcript_quality, processing_time = transcribe_and_summarize_uploaded_file(file_content, filename, reference_transcript, reference_summary)

    display_output(transcription, summaries, summary_qualities, processing_time, reference_transcript, transcript_quality, reference_summary)


## For test (no reference texts, so no quality scores):

In [None]:
# Upload arbitrary audio and run the pipeline without reference texts, so no
# quality scores are computed or displayed.
uploaded_files = files.upload()

for filename, file_content in uploaded_files.items():
    print(f"Processing file: {filename}")
    reference_transcript = None
    reference_summary = None

    (
        transcription,
        summaries,
        summary_qualities,
        transcript_quality,
        processing_time,
    ) = transcribe_and_summarize_uploaded_file(
        file_content,
        filename,
        reference_transcript=reference_transcript,
        reference_summary=reference_summary,
    )

    display_output(
        transcription,
        summaries,
        summary_qualities,
        processing_time,
        reference_transcript=reference_transcript,
        transcript_quality=transcript_quality,
        reference_summary=reference_summary,
    )
