In [None]:
import os
import time
import yt_dlp
import assemblyai as aai
from openai import OpenAI
from dotenv import load_dotenv

###############################################################################
# 2. ASSEMBLYAI TRANSCRIPTION
###############################################################################

def extract_audio_from_youtube(youtube_url: str) -> str:
    """
    Resolve a YouTube page URL to a direct m4a audio stream URL.

    Only metadata is fetched through yt-dlp (``download=False``); no media is
    downloaded. yt-dlp is configured to read cookies from the local Firefox
    profile.

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        The direct URL of the first m4a format carrying an audio track, found
        by walking the format list from its end.

    Raises:
        RuntimeError: If the video exposes no m4a format with an audio track
            and a direct URL.
    """
    ydl_opts = {
        "cookiesfrombrowser": ("firefox",),
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(youtube_url, download=False)

    # Walk the list backwards so that later (higher-quality) entries are tried
    # first. `or []` also covers a "formats" key that is present but None.
    for fmt in reversed(info.get("formats") or []):
        has_audio = fmt.get("acodec") != "none"
        if has_audio and fmt.get("ext") == "m4a" and fmt.get("url"):
            return fmt["url"]

    # Fail loudly instead of implicitly returning None, which would only
    # surface later as an obscure error inside the transcription call.
    raise RuntimeError(f"No m4a audio format found for YouTube URL: {youtube_url}")

def transcribe_audio_assemblyai(audio_url_or_path: str, language_code: str = "en") -> aai.Transcript:
    """
    Transcribe audio from a URL or local file path using AssemblyAI.

    If the input contains "youtube.com" or "youtu.be", it is first resolved to
    a direct audio URL with `extract_audio_from_youtube`.

    Args:
        audio_url_or_path: Remote audio URL, local file path, or YouTube URL.
        language_code: Language code passed to `aai.TranscriptionConfig`.

    Returns:
        The AssemblyAI transcript object.

    Raises:
        RuntimeError: If no audio URL could be extracted from a YouTube link,
            or if AssemblyAI reports the transcription as failed.
    """
    # If the input appears to be a YouTube URL, extract the audio URL.
    if "youtube.com" in audio_url_or_path or "youtu.be" in audio_url_or_path:
        print("YouTube URL detected. Extracting audio URL...")
        audio_url_or_path = extract_audio_from_youtube(audio_url_or_path)
        if not audio_url_or_path:
            # Never hand an empty/None source to the transcriber.
            raise RuntimeError("Could not extract an audio URL from the YouTube link.")
        print(f"YouTube video audio URL: {audio_url_or_path}")

    # Set up AssemblyAI
    aai.settings.api_key = ASSEMBLYAI_API_KEY
    config = aai.TranscriptionConfig(language_code=language_code)
    transcriber = aai.Transcriber(config=config)

    # Transcriber.transcribe() blocks until the job reaches a terminal state
    # (completed or error), so no manual polling loop is needed here.
    transcript = transcriber.transcribe(audio_url_or_path)

    # Compare against the SDK enum rather than raw strings.
    if transcript.status == aai.TranscriptStatus.error:
        raise RuntimeError(f"Transcription failed: {transcript.error}")

    return transcript

###############################################################################
# 3. CHUNKING THE TRANSCRIPT
###############################################################################

def chunk_text_by_paragraphs(transcript: aai.Transcript, chunk_word_target: int = 600) -> list:
    """
    Group the transcript's paragraphs into chunks of roughly
    `chunk_word_target` words.

    Paragraphs are never split: they are accumulated in order, and the pending
    group is closed as soon as adding the next paragraph would push it past
    the target. A single paragraph longer than the target therefore becomes a
    chunk of its own. Paragraphs that are empty after stripping are ignored.

    Returns a list of chunk strings, each made of paragraphs joined by "\n".
    """
    finished = []
    pending = []
    pending_words = 0

    for para in transcript.get_paragraphs():
        body = para.text.strip()
        if body:
            n_words = len(body.split())

            # Close the pending group first if this paragraph would overflow it.
            if pending and pending_words + n_words > chunk_word_target:
                finished.append("\n".join(pending))
                pending, pending_words = [], 0

            pending.append(body)
            pending_words += n_words

    # Flush whatever is left over as the final chunk.
    if pending:
        finished.append("\n".join(pending))

    return finished

###############################################################################
# 4. OPENAI REWRITING (PAGE-BY-PAGE)
###############################################################################
def rewrite_chunk_with_openai(chunk_text: str,
                              model: str = OPENAI_MODEL,
                              prev_summary: str = "") -> str:
    """
    Sends a chunk of text to OpenAI for rewriting in a 'professorial' register.

    Optionally includes `prev_summary` – a short summary of all previously
    processed chunks – as context for better continuity across chunks.

    Returns the revised chunk as a string.
    """

    # Build system prompt with instructions
    system_prompt = (
        "You are an expert in rewriting transcripts with a professorial register. "
        "You will receive fragments of a university lesson transcript generated "
        "from an audio recording. Your role is to correct grammar, punctuation, "
        "and spelling, fix words that may be misrecognized, remove filler words, "
        "and elevate the text to an academic standard. Output only the revised "
        "transcript text in plain text, without titles, markdown, or other formatting. "
        "Maintain context as if it were in medias res."
    )

    # Build user prompt with the chunk, plus the short summary of prior chunks
    # The summary is for context only; it helps the model keep track of earlier topics.
    if prev_summary:
        user_prompt = (
            f"Here is a short summary of what has come before:\n{prev_summary}\n\n"
            f"Now, rewrite the following chunk:\n\n{chunk_text}\n\n"
            "Output only the revised text. Do not add extra commentary or formatting."
        )
    else:
        user_prompt = (
            f"Now, rewrite the following chunk:\n\n{chunk_text}\n\n"
            "Output only the revised text. Do not add extra commentary or formatting."
        )

    # Call OpenAI ChatCompletion using the client
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,  # Keep temperature low for consistent rewriting
        max_tokens=1500,   # Enough tokens to handle rewriting a ~600-word chunk
    )

    revised_text = response.choices[0].message.content
    return revised_text.strip()

def summarize_text_with_openai(text: str,
                               model: str = "gpt-4o-mini") -> str:
    """
    Summarize the given text in a single sentence, to maintain context for
    rewriting later chunks.

    Args:
        text: The text to summarize.
        model: Chat model name used for the summary.

    Returns:
        The one-sentence summary, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If the model response contains no text content.
    """

    system_prompt = (
        "You are a concise and precise summarizer. Summarize the following text "
        "in one sentence, focusing on the key ideas. Keep it short. Do not refer "
        "to the text itself; just provide a single sentence that captures the key ideas."
    )

    user_prompt = f"Text to summarize:\n{text}"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
        max_tokens=200,
    )

    summary = response.choices[0].message.content
    if summary is None:
        # content may be None; raise the error type callers already handle
        # rather than failing with AttributeError on .strip().
        raise RuntimeError("OpenAI returned no text content for the summary.")
    return summary.strip()

def get_word_count(text: str) -> int:
    """
    Count the whitespace-separated words in `text`.
    """
    tokens = text.split()
    return len(tokens)

In [76]:
###############################################################################
# 1. CONFIGURATION
###############################################################################
# This cell defines the module-level names used by the other cells:
# job_name, ASSEMBLYAI_API_KEY, client, OPENAI_MODEL and the chunking/summary
# settings below.

# Ask for the job name; it becomes part of the output file name
# (transcript_<job_name>.json).
job_name = input("Job name: ")

# Load environment variables from config.env (path relative to the working directory)
load_dotenv('./config.env')

# Retrieve API keys from environment variables
ASSEMBLYAI_API_KEY = os.getenv("ASSEMBLYAI_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Validate API keys: stop here if either one is missing or empty
if not ASSEMBLYAI_API_KEY:
    raise ValueError("ASSEMBLYAI_API_KEY not found in config.env")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in config.env")

# Initialize the OpenAI client used by the rewrite and summarize helpers
client = OpenAI(api_key=OPENAI_API_KEY)


# Alternative (disabled): point the same client class at a different base URL.
# client = OpenAI(api_key="<DeepSeek API Key>", base_url="https://api.deepseek.com")

# Define OpenAI model to use for both rewriting and summarizing
OPENAI_MODEL = "gpt-4o-mini"  # Ensure this model is accessible with your API key

# Define chunking parameters
CHUNK_WORD_TARGET = 500  # Target words per chunk (passed to chunk_text_by_paragraphs)
MAX_SUMMARY_WORDS = 300  # Maximum words in running summary before it is itself summarized
ENABLE_SUMMARY_SUMMARIZATION = True  # Toggle for compressing the running summary


In [None]:
# 1) Transcribe audio
# You can point to a local file, remote URL or a YouTube video. E.g.:
# audio_source = "https://assembly.ai/path_to_your_audio_file.mp3"
# or
# audio_source = "./local_file.mp3"
# or
# audio_source = "https://www.youtube.com/watch?v=YOUR_VIDEO"
audio_source = input('Path to audio file: ')  # Replace with your audio file path or URL

print("Transcribing audio... please wait.")
try:
    # NOTE: despite its name, full_transcript_text holds the transcript
    # *object* returned by transcribe_audio_assemblyai, not a plain string.
    full_transcript_text = transcribe_audio_assemblyai(audio_source, language_code='en')
except RuntimeError as e:
    print(str(e))
    # Re-raise: continuing would report success and leave
    # full_transcript_text undefined (or stale) for the following cells.
    raise
else:
    print("Transcription complete.")

[debug] Encodings: locale UTF-8, fs utf-8, pref UTF-8, out UTF-8 (No ANSI), error UTF-8 (No ANSI), screen UTF-8 (No ANSI)
[debug] yt-dlp version stable@2025.02.19 from yt-dlp/yt-dlp [4985a4041] (pip) API
[debug] params: {'cookiesfrombrowser': ('firefox',), 'verbose': 'True', 'compat_opts': set(), 'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-us,en;q=0.5', 'Sec-Fetch-Mode': 'navigate'}}
[debug] Python 3.13.0 (CPython x86_64 64bit) - Linux-6.11.0-19-generic-x86_64-with-glibc2.39 (OpenSSL 3.0.13 30 Jan 2024, glibc 2.39)
[debug] exe versions: none
[debug] Optional libraries: brotli-1.0.9, certifi-2024.08.30, requests-2.32.3, sqlite3-3.45.3, urllib3-2.2.3, websockets-14.1
[debug] Proxy map: {}


Transcribing audio... please wait.
YouTube URL detected. Extracting audio URL...
Extracting cookies from firefox


[debug] Extracting cookies from: "/home/nicola/snap/firefox/common/.mozilla/firefox/jnaig3c6.default/cookies.sqlite"


Extracted 1795 cookies from firefox


[debug] Request Handlers: urllib, requests, websockets
[debug] Loaded 1841 extractors
[debug] [youtube] Found YouTube account cookies


[youtube] Extracting URL: https://www.youtube.com/watch?v=9vM4p9NN0Ts&t=28s&pp=ygUHTUlUIGxsbQ%3D%3D
[youtube] 9vM4p9NN0Ts: Downloading webpage
[youtube] 9vM4p9NN0Ts: Downloading tv client config
[youtube] 9vM4p9NN0Ts: Downloading player 82345d49
[youtube] 9vM4p9NN0Ts: Downloading tv player API JSON


[debug] Loading youtube-nsig.82345d49 from cache
[debug] [youtube] Decrypted nsig mMRyRm7PI5TMRguQ => EVTbR_tjv5BhVw
[debug] Loading youtube-nsig.82345d49 from cache
[debug] [youtube] Decrypted nsig JJBMiYw7m5KKx6oj => GmEPKBrTsj2iig
[debug] Sort order given by extractor: quality, res, fps, hdr:12, source, vcodec, channels, acodec, lang, proto
[debug] Formats sorted by: hasvid, ie_pref, quality, res, fps, hdr:12(7), source, vcodec, channels, acodec, lang, proto, size, br, asr, vext, aext, hasaud, id
[debug] Default format spec: best/bestvideo+bestaudio


YouTube video audio URL: https://rr5---sn-5hne6nzd.googlevideo.com/videoplayback?expire=1742399055&ei=75HaZ8mAC9K-i9oPr7voqQ4&ip=213.39.100.109&id=o-AN_dgsnlT3k3QcyMDUvUAkdIedenddyiGxyOosREqalk&itag=140&source=youtube&requiressl=yes&xpc=EgVo2aDSNQ%3D%3D&met=1742377455%2C&mh=_J&mm=31%2C29&mn=sn-5hne6nzd%2Csn-5hneknes&ms=au%2Crdu&mv=m&mvi=5&pl=24&rms=au%2Cau&initcwndbps=3118750&siu=1&bui=AccgBcNeEKG0eSnPxcp1QNy1F7Sth8wBfOrRa83Ra8REqFbXKP-UZPd4hE4gTMz3Qz7mpf1j&vprv=1&svpuc=1&mime=audio%2Fmp4&ns=DOkXPhUqNhxiUSyhLRvf6XUQ&rqh=1&gir=yes&clen=101484952&dur=6270.687&lmt=1724828401738727&mt=1742377253&fvip=1&keepalive=yes&lmw=1&c=TVHTML5&sefc=1&txp=5432434&n=GmEPKBrTsj2iig&sparams=expire%2Cei%2Cip%2Cid%2Citag%2Csource%2Crequiressl%2Cxpc%2Csiu%2Cbui%2Cvprv%2Csvpuc%2Cmime%2Cns%2Crqh%2Cgir%2Cclen%2Cdur%2Clmt&sig=AJfQdSswRQIgN3JiVgXsegfG8cvmKEphVgrAaF-Zm4m7REYBxo2Po0kCIQCVAvkFf_SEWj_jPlqmZ_U3kipvkuTRzvQoJHXCVo2YtQ%3D%3D&lsparams=met%2Cmh%2Cmm%2Cmn%2Cms%2Cmv%2Cmvi%2Cpl%2Crms%2Cinitcwndbps&lsig=AFVRHe

In [None]:
# 2) Chunk the transcript using paragraphs
# full_transcript_text is the transcript object produced by the transcription
# cell; chunk_text_by_paragraphs reads its paragraphs and groups them into
# strings of roughly CHUNK_WORD_TARGET words.
print("Splitting transcript into chunks based on paragraphs...")
chunks = chunk_text_by_paragraphs(full_transcript_text, chunk_word_target=CHUNK_WORD_TARGET)
# chunks is a list of strings, consumed by the rewriting loop.
print(f"Created {len(chunks)} chunk(s) of ~{CHUNK_WORD_TARGET} words each.")

Splitting transcript into chunks based on paragraphs...
Created 9 chunk(s) of ~500 words each.


In [None]:
# 3) For each chunk, rewrite with OpenAI
import openai  # needed only to name the SDK's exception base class below

final_rewritten_text = []
running_summary = ""  # Will accumulate short summaries of prior chunks

# RuntimeError covers failures raised by this notebook's own helpers;
# openai.OpenAIError is the base class of the errors raised by the OpenAI SDK
# (connection problems, rate limits, bad requests, ...), which would otherwise
# abort the whole loop.
RECOVERABLE_ERRORS = (RuntimeError, openai.OpenAIError)

for i, chunk_text in enumerate(chunks, start=1):
    print(f"Rewriting chunk {i}/{len(chunks)}...")

    # Rewrite the chunk
    try:
        revised_text = rewrite_chunk_with_openai(
            chunk_text=chunk_text,
            model=OPENAI_MODEL,
            prev_summary=running_summary
        )
    except RECOVERABLE_ERRORS as e:
        print(f"Error rewriting chunk {i}: {e}")
        continue  # Best effort: skip this chunk and move on to the next one

    # Append the revised text to our final output
    final_rewritten_text.append(revised_text)

    # Summarize this revised chunk to update context
    try:
        chunk_summary = summarize_text_with_openai(revised_text, model=OPENAI_MODEL)
    except RECOVERABLE_ERRORS as e:
        print(f"Error summarizing chunk {i}: {e}")
        chunk_summary = ""

    # Append the new summary to the running summary. strip() avoids a leading
    # space on the first chunk and a trailing one when the summary is empty.
    running_summary = f"{running_summary} {chunk_summary}".strip()

    # Compress the running summary once it grows past MAX_SUMMARY_WORDS.
    if ENABLE_SUMMARY_SUMMARIZATION and get_word_count(running_summary) > MAX_SUMMARY_WORDS:
        try:
            running_summary = summarize_text_with_openai(running_summary, model=OPENAI_MODEL)
        except RECOVERABLE_ERRORS as e:
            # Keep the uncompressed summary; it will be retried after the next chunk.
            print(f"Error summarizing running summary: {e}")


Rewriting chunk 1/9...
Rewriting chunk 2/9...
Rewriting chunk 3/9...
Rewriting chunk 4/9...
Rewriting chunk 5/9...
Rewriting chunk 6/9...
Rewriting chunk 7/9...
Rewriting chunk 8/9...
Rewriting chunk 9/9...


In [None]:
# Display the list of rewritten chunks (one string per successfully rewritten chunk).
final_rewritten_text

['If one provides a large language model with the phrase "Michael Jordan plays the sport of" and prompts it to predict the subsequent words, a correct prediction of "basketball" would imply that within its hundreds of billions of parameters, the model possesses ingrained knowledge about a specific individual and his associated sport. Generally, anyone who has experimented with these models recognizes that they have memorized an extensive array of facts. Consequently, a pertinent question arises: how does this process function, and where are these facts stored?\n\nIn December of last year, several researchers from Google DeepMind published findings addressing this inquiry, utilizing the specific example of matching athletes to their respective sports. While a comprehensive mechanistic understanding of how facts are stored remains elusive, they presented intriguing partial results, including the overarching conclusion that these facts appear to reside within a particular segment of the n

In [None]:
import json

# 4) Output the final revised text and running summary as a JSON file
final_text = "\n".join(final_rewritten_text)

data = {
    "final_text": final_text,
    "running_summary": running_summary
}

output_filename = f"transcript_{job_name}.json"
try:
    with open(output_filename, "w", encoding="utf-8") as f:
        # The file is UTF-8, so keep accented and other non-ASCII characters
        # readable instead of escaping them as \uXXXX sequences.
        json.dump(data, f, indent=4, ensure_ascii=False)
except (OSError, UnicodeError) as e:
    # OSError: the file could not be created or written;
    # UnicodeError: the text could not be encoded as UTF-8.
    print(f"Error saving JSON file: {e}")
else:
    print(f"Final transcript and running summary saved to {output_filename}")

print("\nDone.")



=== FINAL REWRITTEN TRANSCRIPT ===

Carosia GPT O1. Ho un corso relativo alla gestione dei dati di ricerca (Research Data Management) e il mio compito consiste nell'implementare, utilizzando MySQL Workbench, un modello concettuale che ho creato a partire dai miei dati, con il tuo supporto precedente. Per cominciare, ti invio il report dello studio da cui ho tratto i dati, che ho elaborato personalmente. Successivamente, ti descriverò nel dettaglio come ho strutturato questo modello concettuale, anche grazie al tuo aiuto.

Il modello concettuale che ho sviluppato si basa su due oggetti dati, ossia due dataset di sequenziamento dell'RNA a singola cellula: uno relativo a cellule sane e l'altro a cellule tumorali. Utilizzando il dataset delle cellule sane, ho generato delle predizioni applicate al dataset delle cellule tumorali.

La struttura del modello concettuale è la seguente. La prima entità è un'entità principale denominata "Cellula", che contiene cinque attributi: Cell ID, Cell Cyc