In [None]:
!pip install google-cloud-storage
!pip install google-cloud-speech



In [None]:
import os
from google.cloud import storage
from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
from concurrent.futures import ThreadPoolExecutor
import time
import json
import tempfile
import zipfile
from google.colab import files

MAX_AUDIO_LENGTH_SECS = 8 * 60 * 60  # 8 hours max for Speech-to-Text API (NBA podcasts will never be this long)
BATCH_SIZE = 5  # Not tuned exhaustively, but 5 seems to be the most files that can be processed at the same time before errors start occurring from the Speech-to-Text API

In [None]:
# Uses Speech-to-Text V2 for better transcription.
# Standard V2 transcription using the following configuration:
# Model: long
# Language: en-US

def process_single_file(audio_file, podcast_name="RotoWire Fantasy Basketball",
                        gcs_output_folder="gs://audio-files2024/RotoWire JSON transcripts"):
    """Transcribe one audio file with a Speech-to-Text V2 batch recognition request.

    The audio is read from ``gs://audio-files2024/<podcast_name>/<audio_file>``
    and the API is asked to write its output under ``gcs_output_folder``. The
    call blocks until the long-running operation has finished.

    Args:
        audio_file: Object name of the audio file relative to the podcast folder.
        podcast_name: Folder inside the ``audio-files2024`` bucket that holds the
            audio. The default is the folder that used to be hard-coded here.
        gcs_output_folder: ``gs://`` URI handed to the API as the output location.
            The default is the folder that used to be hard-coded here.

    Returns:
        Whatever ``operation.result()`` returns for the batch request.
    """
    print(f"\nüé¨ STARTING transcription for: {audio_file}")
    start_time = time.time()

    # A client is built on every call, so concurrent workers never share one.
    client = SpeechClient(
        client_options=ClientOptions(
            api_endpoint="us-speech.googleapis.com",
            quota_project_id="potent-hue-440714-e7"
        ),
    )

    # Built from podcast_name so the file that is transcribed is the one that
    # run_batch_recognize listed.
    audio_gcs_uri = f"gs://audio-files2024/{podcast_name}/{audio_file}"

    config = cloud_speech.RecognitionConfig(
        auto_decoding_config={},
        features=cloud_speech.RecognitionFeatures(
            enable_word_confidence=True,
            enable_word_time_offsets=True,
        ),
        model="long",
        language_codes=["en-US"],
    )

    output_config = cloud_speech.RecognitionOutputConfig(
        gcs_output_config=cloud_speech.GcsOutputConfig(uri=gcs_output_folder),
    )

    # Named file_metadata (not files) so the imported google.colab `files`
    # module is not shadowed.
    file_metadata = [cloud_speech.BatchRecognizeFileMetadata(uri=audio_gcs_uri)]

    request = cloud_speech.BatchRecognizeRequest(
        recognizer="projects/potent-hue-440714-e7/locations/us/recognizers/_",
        config=config,
        files=file_metadata,
        recognition_output_config=output_config,
    )

    operation = client.batch_recognize(request=request)
    # Wait for up to three times the maximum supported audio length.
    response = operation.result(timeout=3 * MAX_AUDIO_LENGTH_SECS)

    duration = round(time.time() - start_time, 2)
    print(f"\n‚úÖ COMPLETED transcription for: {audio_file}")
    print(f"\n‚è±Ô∏è Time taken: {duration} seconds")
    return response

def run_batch_recognize(podcast_name,
                        gcs_output_folder="gs://audio-files2024/RotoWire JSON transcripts"):
    """Transcribe every ``.mp3`` stored under ``<podcast_name>/`` in the bucket.

    Lists the objects of the ``audio-files2024`` bucket whose name starts with
    ``<podcast_name>/``, then runs ``process_single_file`` for each ``.mp3`` on a
    thread pool of ``BATCH_SIZE`` workers. A failure for one file is printed
    together with the file name and does not stop the remaining files.

    Args:
        podcast_name: Folder inside the bucket that holds the audio files.
        gcs_output_folder: ``gs://`` URI forwarded to ``process_single_file`` as
            the output location.
    """
    podcast_path = podcast_name + "/"
    storage_client = storage.Client()
    bucket = storage_client.bucket("audio-files2024")
    # Every listed name starts with the prefix, so slicing it off is exact even
    # when the same text appears again later in the object name.
    audio_files = [blob.name[len(podcast_path):]
                   for blob in bucket.list_blobs(prefix=podcast_path)
                   if blob.name.endswith('.mp3')]

    print(f"\nüéØ Processing these {len(audio_files)} files:")
    for audio_file in audio_files:
        print(f"   - {audio_file}")

    print("\nüöÄ Starting batch processing...")
    # Process files in parallel for quicker processing time
    with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        submitted = [
            (audio_file,
             executor.submit(process_single_file, audio_file, podcast_name, gcs_output_folder))
            for audio_file in audio_files
        ]

        # Wait for all files to complete; keep the file name next to its future
        # so a failure can be attributed to the file that caused it.
        for audio_file, future in submitted:
            try:
                future.result()
            except Exception as e:
                print(f"‚ùå Error processing file {audio_file}: {e}")

    print("\n‚ú® All files processed!")

# Entry point for this cell: transcribe every .mp3 stored under the
# "RotoWire Fantasy Basketball/" prefix of the bucket.
if __name__ == "__main__":
    run_batch_recognize("RotoWire Fantasy Basketball")


üéØ Processing these 97 files:
   - 2023 NBA Re-Draft In-Season Tournament Final Four Set  Live Listener QA.mp3
   - Add These 9 Players Off Waivers - NBA Fantasy 2023-24.mp3
   - Adds in Memphis Utahs Ascent  Siakam to Indiana .mp3
   - All-Star Weekend Takeaways Nets Deep Dive Schedule Notes  Fantasy Basketball QA.mp3
   - Ant Goes Off Injuries in the East  Tips to Win Your Fantasy Playoffs.mp3
   - Big Injury Updates Next Years First Round AddsDrops  Live QA.mp3
   - Bucks Concerns OKCs Finals Chances Zion 20  Players Were Avoiding in 2024-25.mp3
   - Bucks Fall to Washington Wembanyama vs Jokic  Most Disappointing Fantasy Players in 2023-24.mp3
   - Daily News  Notes Late-Season Streams Dynasty Tips  More.mp3
   - Dames Slump Fox Returns Frustrating Wizards  Chet vs Wembanyama with Matt Stroup.mp3
   - Draymond Suspension Fallout AD vs Wembanyama Tari Eason Terry Rozier and Much More.mp3
   - Early Draft Hits  Misses Sell-High Candidates with Brandon Kravitz.mp3
   - Early Fantas

KeyboardInterrupt: 

In [None]:
#Process each JSON file to a .txt file
def process_transcripts():
    """Convert every JSON transcript in the bucket into a plain-text file.

    Reads each ``.json`` object under ``RotoWire JSON transcripts/`` in the
    ``audio-files2024`` bucket, joins the first alternative of every result with
    single spaces, and uploads the text to ``RotoWire_text_transcripts/`` under
    the same base name with a ``.txt`` extension.

    A failure for one object is printed and the remaining objects are still
    processed.
    """
    print("üîÑ Starting transcript processing...")

    # Initialize GCS client
    storage_client = storage.Client()
    bucket = storage_client.bucket("audio-files2024")

    # List all JSON files in transcripts folder
    json_suffix = '.json'
    json_files = [blob
                  for blob in bucket.list_blobs(prefix="RotoWire JSON transcripts/")
                  if blob.name.endswith(json_suffix)]

    print(f"üìÅ Found {len(json_files)} JSON files to process")

    # Process each JSON file
    for blob in json_files:
        try:
            # Download and parse JSON content
            data = json.loads(blob.download_as_text())

            # Swap only the trailing ".json" for ".txt"; str.replace would also
            # rewrite any ".json" that appears earlier in the name.
            base_name = blob.name.rsplit('/', 1)[-1]
            output_filename = base_name[:-len(json_suffix)] + '.txt'
            gcs_output_path = f"RotoWire_text_transcripts/{output_filename}"

            print(f"\nüé¨ Processing: {blob.name}")

            full_transcript = _join_transcripts(data)

            # Upload the text directly: no temporary file is left behind when
            # the upload fails, and the text does not pass through the
            # platform's default file encoding.
            output_blob = bucket.blob(gcs_output_path)
            output_blob.upload_from_string(full_transcript, content_type="text/plain")

            print(f"‚úÖ Created: gs://audio-files2024/{gcs_output_path}")

        except Exception as e:
            print(f"‚ùå Error processing {blob.name}: {str(e)}")

    print("\n‚ú® All transcripts processed!")

def _join_transcripts(data):
    """Return the first alternative of every result in ``data``, joined by spaces.

    Results without a non-empty ``alternatives`` list are skipped, and a payload
    without a ``results`` key yields an empty string.
    """
    pieces = []
    for result in data.get('results', []):
        alternatives = result.get('alternatives')
        if alternatives:
            pieces.append(alternatives[0]['transcript'])
    return ' '.join(pieces)

# Entry point for this cell: convert the JSON transcripts in the bucket to .txt files.
if __name__ == "__main__":
    process_transcripts()

üîÑ Starting transcript processing...
üìÅ Found 97 JSON files to process

üé¨ Processing: RotoWire JSON transcripts/2023 NBA Re-Draft In-Season Tournament Final Four Set  Live Listener QA_transcript_680c2fb2-0000-2c8b-92c1-d4f547f12aa4.json
‚úÖ Created: gs://audio-files2024/RotoWire_text_transcripts/2023 NBA Re-Draft In-Season Tournament Final Four Set  Live Listener QA_transcript_680c2fb2-0000-2c8b-92c1-d4f547f12aa4.txt

üé¨ Processing: RotoWire JSON transcripts/Add These 9 Players Off Waivers - NBA Fantasy 2023-24_transcript_6af84f27-0000-2e33-b8d3-24058882bff8.json
‚úÖ Created: gs://audio-files2024/RotoWire_text_transcripts/Add These 9 Players Off Waivers - NBA Fantasy 2023-24_transcript_6af84f27-0000-2e33-b8d3-24058882bff8.txt

üé¨ Processing: RotoWire JSON transcripts/Adds in Memphis Utahs Ascent  Siakam to Indiana _transcript_67449c51-0000-2b1d-8b0f-747446fd7c1c.json
‚úÖ Created: gs://audio-files2024/RotoWire_text_transcripts/Adds in Memphis Utahs Ascent  Siakam to Indiana _

In [None]:
import os
import zipfile
from google.colab import files

# Put all transcribed files into a zip folder
def download_folder(folder_path, zip_name=None):
    """Zip a local folder and hand the archive to ``files.download``.

    Every file below ``folder_path`` is stored in the archive under its path
    relative to ``folder_path``.

    Args:
        folder_path: Local directory to archive. A trailing path separator is
            accepted.
        zip_name: Path of the archive to create. Defaults to the folder's own
            name plus ``.zip``, created in the current working directory.

    Raises:
        FileNotFoundError: If ``folder_path`` is not an existing directory, so
            that an empty archive is never created and downloaded by mistake.
    """
    # abspath also drops a trailing separator, which would otherwise make
    # basename() return "" and name the archive just ".zip".
    source_dir = os.path.abspath(folder_path)
    if not os.path.isdir(source_dir):
        raise FileNotFoundError(f"Folder not found: {folder_path}")

    if zip_name is None:
        zip_name = os.path.basename(source_dir) + '.zip'
    zip_abspath = os.path.abspath(zip_name)

    print(f"üì¶ Zipping folder: {folder_path}")

    # Create zip file
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, filenames in os.walk(source_dir):
            for filename in filenames:
                file_path = os.path.join(root, filename)
                # The archive itself may live inside the folder being walked;
                # never add it to itself.
                if os.path.abspath(file_path) == zip_abspath:
                    continue
                arcname = os.path.relpath(file_path, source_dir)
                print(f"Adding: {arcname}")
                zipf.write(file_path, arcname)

    print(f"\n‚ú® Downloading {zip_name}")
    files.download(zip_name)

# Zip the local transcript folder and download the archive.
# NOTE(review): no visible cell creates /content/text_transcripts; presumably the
# .txt transcripts were copied there from GCS beforehand — confirm.
download_folder('/content/text_transcripts')

üì¶ Zipping folder: /content/text_transcripts
Adding: Draymond Chokes Gobert Fantasy Basketball Tuesday Recap  Wednesday Lookahead_transcript_6afe58ef-0000-2bfc-bcc0-2405887bfb7c.txt
Adding: Week 6 Fantasy Basketball Strategy  Best Worst Schedules  Streaming Guide_transcript_6d6df28f-0000-25ff-adb4-14223bc0a5ca.txt
Adding: Week 4 Fantasy Basketball Preview Streaming Weekly Choices Injury Updates  Strategies_transcript_6d6b4f53-0000-25ff-adb4-14223bc0a5ca.txt
Adding: Fantasy Basketball ADDS for Week 13  Streaming Schedule_transcript_6cd943bf-0000-2d1a-90a0-14223bb63cfa.txt
Adding: Luka Drops 50  Christmas Fantasy Basketball Recap  Podziemski and Other Key Moves_transcript_6d2f7cb6-0000-2279-a0f3-14223bcc1196.txt
Adding: Buy or Sell Buddy Hield Tre Mann  Fantasy Basketball Thursday  Rui  Advija Big Games_transcript_6cb256c4-0000-268a-bb06-14c14eec0548.txt
Adding: Week 2 Fantasy Basketball Recap - Adds Drops Buys Sells_transcript_6d22679b-0000-2d1a-90a0-14223bb63cfa.txt
Adding: Waiver Wi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>