In [None]:
import os
import re
import time

import requests
from bs4 import BeautifulSoup


In [None]:
# Constants
# URL of the podcast RSS feed whose <item> entries are downloaded below.
RSS_FEED_URL = "https://lexfridman.com/feed/podcast/"
# Read the API key from the environment so a real secret never has to be
# written into the notebook. The original placeholder string remains the
# fallback, so the notebook behaves as before when the variable is unset.
ASSEMBLYAI_API_KEY = os.environ.get("ASSEMBLYAI_API_KEY", "ASSEMBLYAI_API_KEY")
# AssemblyAI endpoints: raw audio upload, and transcript creation/lookup.
ASSEMBLYAI_UPLOAD_URL = "https://api.assemblyai.com/v2/upload"
ASSEMBLYAI_TRANSCRIPT_URL = "https://api.assemblyai.com/v2/transcript"
# Size in bytes of each chunk read from disk when uploading (5 MiB).
CHUNK_SIZE = 5 * 1024 * 1024

In [None]:
# Lazily stream a file's bytes in fixed-size pieces
def read_file(filename, chunk_size=CHUNK_SIZE):
    """Yield the contents of ``filename`` as successive ``bytes`` chunks.

    The file is opened in binary mode and read ``chunk_size`` bytes at a
    time; iteration stops at the first empty read (end of file). The file
    is closed when the generator finishes or is discarded.
    """
    with open(filename, 'rb') as source:
        # iter(callable, sentinel) keeps calling read() until it returns b''.
        for piece in iter(lambda: source.read(chunk_size), b''):
            yield piece

In [None]:
# Podcasts RSS Feed Processing
# Fetch the feed with a timeout so a stalled connection cannot hang the
# kernel, and fail loudly on an HTTP error instead of parsing an error page
# (which would silently yield an empty item list).
page = requests.get(RSS_FEED_URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
# One element per <item> in the feed; later cells read each item's
# <title> and <enclosure url="..."> from these.
podcasts_items = soup.find_all('item')

In [None]:
# Download and Save Podcasts
# Save the audio enclosure of the first feed item as podcast.mp3.
mp3_url = podcasts_items[0].find('enclosure')['url']
# Stream the response so the whole audio file is never held in memory at once.
with requests.get(mp3_url, stream=True, timeout=60) as mp3_file:
    # Check the status before opening the output file so a failed download
    # does not leave an error page saved under the .mp3 name.
    mp3_file.raise_for_status()
    with open("podcast.mp3", "wb") as f:
        for chunk in mp3_file.iter_content(chunk_size=CHUNK_SIZE):
            f.write(chunk)


In [None]:
# Create 'trial' directory
# exist_ok=True keeps this cell re-runnable; a bare os.mkdir raises
# FileExistsError the second time the cell is executed.
os.makedirs('./trial', exist_ok=True)

# Download and Save Limited Podcasts with Modified Title
# Only the first 10 feed items are fetched. `count` ends up holding the
# number of episodes processed (0 when the feed is empty).
count = 0
for count, podcasts in enumerate(podcasts_items[:10], start=1):
    title = podcasts.find('title').text
    # Replace '/', ':' and the en dash with '_' to build the file name.
    title = re.sub(r'[\/\:\–]', '_', title)
    mp3_url = podcasts.find('enclosure')['url']
    # Stream each episode to disk rather than buffering it fully in memory.
    with requests.get(mp3_url, stream=True, timeout=60) as mp3_file:
        # Fail before creating the file so an HTTP error body is never
        # saved as audio (it would later be uploaded for transcription).
        mp3_file.raise_for_status()
        with open(f'./trial/{title}.mp3', "wb") as f:
            for chunk in mp3_file.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)

In [None]:
# Transcription Process
# Every entry in ./trial is uploaded and submitted for transcription; the
# returned transcript ids are collected in output_ids for the next cell.
file_names = os.listdir('./trial')
output_ids = []
auth_headers = {'authorization': ASSEMBLYAI_API_KEY}

# Upload and Transcribe Podcasts
for file in file_names:
    file_path = f'trial/{file}'
    print("Uploading file:", file)
    # read_file() yields the file in chunks, so the upload is streamed.
    response = requests.post(ASSEMBLYAI_UPLOAD_URL, headers=auth_headers, data=read_file(file_path))
    print("Upload Status:", response.status_code)
    # Stop on an HTTP error here; otherwise a failed upload (e.g. a bad API
    # key) surfaces below as a misleading KeyError on 'upload_url'.
    response.raise_for_status()

    # Set audio start and end time as needed
    # NOTE(review): values look like milliseconds (5 min to 10 min) — confirm
    # against the AssemblyAI API reference.
    audio_json = {
        "audio_url": response.json()['upload_url'],
        "audio_start_from": 300000,
        "audio_end_at": 600000,
    }

    print("Transcribing...")
    response = requests.post(ASSEMBLYAI_TRANSCRIPT_URL, json=audio_json, headers=auth_headers)
    print("Transcription Status:", response.status_code)
    # Same reasoning as above: check the status before reading 'id'.
    response.raise_for_status()
    output_ids.append(response.json()['id'])

In [None]:
# Save Transcripts
# For each submitted transcript id, wait until the job reaches a terminal
# status and write its text to ./transcripts/<id>.txt.
path = './transcripts'
os.makedirs(path, exist_ok=True)

# Seconds to wait between status checks while a transcript is in progress.
poll_interval_seconds = 5

for tid in output_ids:
    transcript_url = f'{ASSEMBLYAI_TRANSCRIPT_URL}/{tid}'

    # Transcription is asynchronous: until the job finishes, 'text' is null
    # and writing it would raise TypeError. Poll until the status is terminal.
    # NOTE(review): the loop has no upper bound on total wait time.
    while True:
        response = requests.get(transcript_url, headers={'authorization': ASSEMBLYAI_API_KEY}, timeout=60)
        response.raise_for_status()
        transcript = response.json()
        if transcript.get('status') in ('completed', 'error'):
            break
        time.sleep(poll_interval_seconds)

    if transcript.get('status') == 'error':
        # Report the failure and continue with the remaining transcripts.
        print("Transcription failed for:", tid, "-", transcript.get('error'))
        continue

    # Guard against a null text field on an otherwise completed transcript.
    raw_text = transcript.get('text') or ''
    # Explicit UTF-8 so non-ASCII characters do not depend on the platform's
    # default encoding.
    with open(os.path.join(path, f'{tid}.txt'), 'w', encoding='utf-8') as f:
        f.write(raw_text)
    print("Transcript saved for:", tid)