In [1]:
from pydub import AudioSegment
import os

def split_audio_to_chunks(input_file, output_dir, chunk_length_sec=300, overlap_sec=5):
    """Split an MP3 file into consecutive chunks and export each one as MP3.

    Chunk ``i`` starts at ``i * chunk_length_sec`` seconds. Its slice runs
    ``overlap_sec`` seconds past its nominal end, so with a positive overlap
    the tail of one chunk and the head of the next can contain the same audio.

    Args:
        input_file: Path of the MP3 file to split.
        output_dir: Directory that receives the chunk files; created if missing.
        chunk_length_sec: Nominal chunk length in seconds. Must be positive.
        overlap_sec: Extra seconds appended to the end of each chunk's slice.
            Must not be negative.

    Returns:
        A list with one dict per exported chunk, in order, with the keys
        ``"filename"`` (``chunk_000.mp3``, ``chunk_001.mp3``, ...),
        ``"start_time"`` and ``"end_time"``. Both times are in seconds
        measured from the start of the input audio; ``end_time`` is derived
        from the exported chunk's actual length.

    Raises:
        ValueError: If ``chunk_length_sec`` is not positive or ``overlap_sec``
            is negative.
    """
    # Validate before any I/O: a non-positive chunk length would keep the
    # loop below from ever reaching the end of the audio.
    if chunk_length_sec <= 0:
        raise ValueError(f"chunk_length_sec must be positive, got {chunk_length_sec!r}")
    if overlap_sec < 0:
        raise ValueError(f"overlap_sec must not be negative, got {overlap_sec!r}")

    audio = AudioSegment.from_mp3(input_file)
    duration_ms = len(audio)
    chunk_length_ms = chunk_length_sec * 1000
    overlap_ms = overlap_sec * 1000

    os.makedirs(output_dir, exist_ok=True)
    chunks = []

    start = 0
    i = 0
    while start < duration_ms:
        end = min(start + chunk_length_ms, duration_ms)
        # The slice is extended by the overlap beyond the nominal end.
        chunk = audio[start:end + overlap_ms]
        chunk_name = f"chunk_{i:03d}.mp3"
        chunk.export(os.path.join(output_dir, chunk_name), format="mp3")
        chunks.append({
            "filename": chunk_name,
            "start_time": start / 1000,
            # Use the real chunk length rather than the requested slice end.
            "end_time": (start + len(chunk)) / 1000,
        })
        start += chunk_length_ms
        i += 1

    return chunks

# Split the input MP3 into 300-second chunks, each extended by 5 seconds
input_file = "tesla_q4.mp3"
chunks_metadata = split_audio_to_chunks(
    input_file,
    output_dir="chunks",
    chunk_length_sec=300,
    overlap_sec=5,
)


In [2]:
from google.cloud import storage

def upload_to_gcs(bucket_name, local_file_path, destination_blob_name, client=None):
    """Upload a local file to a Cloud Storage bucket and return its ``gs://`` URI.

    Args:
        bucket_name: Name of the destination bucket.
        local_file_path: Path of the file to upload.
        destination_blob_name: Object name to create inside the bucket.
        client: Optional storage client to reuse. When omitted, a new
            ``storage.Client()`` is created for this call, so pass one in
            when uploading many files.

    Returns:
        The string ``gs://<bucket_name>/<destination_blob_name>``.
    """
    if client is None:
        client = storage.Client()
    blob = client.bucket(bucket_name).blob(destination_blob_name)
    blob.upload_from_filename(local_file_path)
    return f"gs://{bucket_name}/{destination_blob_name}"

# Upload every exported chunk and remember where it was stored
bucket_name = "simpliearn-audio"

for chunk_meta in chunks_metadata:
    chunk_filename = chunk_meta["filename"]
    # The blob keeps the chunk's file name; its URI is added to the metadata
    chunk_meta["gcs_uri"] = upload_to_gcs(
        bucket_name,
        os.path.join("chunks", chunk_filename),
        chunk_filename,
    )

In [3]:
from google.cloud import speech

# Terms handed to the transcription call below, where they are used as the
# phrase list of the recognizer's speech context.
phrases = ["Tesla", "Dojo", "Cybertruck", "FSD", "Gigafactory", "Elon"]

def transcribe_audio_with_word_timestamps(gcs_uri, phrases, client=None):
    """Transcribe an MP3 stored in Cloud Storage, returning text and word timings.

    Args:
        gcs_uri: ``gs://`` URI of the audio to transcribe.
        phrases: Phrases passed to the recognizer as a boosted speech context.
        client: Optional ``speech.SpeechClient`` to reuse. When omitted, a new
            client is created for this call.

    Returns:
        A ``(transcript, word_timestamps)`` tuple. ``transcript`` is the top
        alternative of every result joined with single spaces and stripped.
        ``word_timestamps`` is a list of dicts with the keys ``"word"``,
        ``"start_time"`` and ``"end_time"``; the times are seconds as reported
        for the submitted audio, i.e. relative to that file, not to any
        larger recording it was cut from.
    """
    if client is None:
        client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        # NOTE(review): hard-coded; assumes the uploaded audio is 44.1 kHz — confirm.
        sample_rate_hertz=44100,
        language_code="en-US",
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True,
        use_enhanced=True,
        model="video",
        speech_contexts=[
            speech.SpeechContext(
                phrases=phrases,
                boost=15.0
            )
        ]
    )

    operation = client.long_running_recognize(config=config, audio=audio)
    # Block until the operation finishes, for at most 30 minutes.
    response = operation.result(timeout=1800)

    transcript_parts = []
    word_timestamps = []

    for result in response.results:
        # Guard against a result that carries no hypothesis at all.
        if not result.alternatives:
            continue
        alternative = result.alternatives[0]
        transcript_parts.append(alternative.transcript)
        for word_info in alternative.words:
            word_timestamps.append({
                "word": word_info.word,
                "start_time": word_info.start_time.total_seconds(),
                "end_time": word_info.end_time.total_seconds(),
            })

    return " ".join(transcript_parts).strip(), word_timestamps

In [4]:
# Transcribe the chunks in order and merge the results into one transcript
# and one word list whose times refer to the full recording.
# NOTE(review): the chunks were exported with extra audio past their nominal
# end, so speech near a chunk boundary may be transcribed twice — confirm
# whether downstream steps need de-duplication.
full_transcript = ""
all_word_timestamps = []

for chunk_meta in chunks_metadata:
    print(f"Transcribing: {chunk_meta['filename']}...")
    chunk_transcript, chunk_word_times = transcribe_audio_with_word_timestamps(
        chunk_meta["gcs_uri"], phrases
    )

    # Word times come back relative to the chunk; shift them by its offset
    chunk_offset = chunk_meta["start_time"]
    for timing in chunk_word_times:
        timing["start_time"] = timing["start_time"] + chunk_offset
        timing["end_time"] = timing["end_time"] + chunk_offset

    full_transcript += f"{chunk_transcript} "
    all_word_timestamps.extend(chunk_word_times)

Transcribing: chunk_000.mp3...
Transcribing: chunk_001.mp3...
Transcribing: chunk_002.mp3...
Transcribing: chunk_003.mp3...
Transcribing: chunk_004.mp3...
Transcribing: chunk_005.mp3...
Transcribing: chunk_006.mp3...
Transcribing: chunk_007.mp3...
Transcribing: chunk_008.mp3...
Transcribing: chunk_009.mp3...
Transcribing: chunk_010.mp3...
Transcribing: chunk_011.mp3...
Transcribing: chunk_012.mp3...
Transcribing: chunk_013.mp3...
Transcribing: chunk_014.mp3...


In [8]:
import nltk
from nltk.tokenize import sent_tokenize

# Make sure the "punkt_tab" sentence tokenizer data is installed.
# Only go to the network when the resource is missing locally, so the cell
# does not emit a download error when the data is already present.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")

# Function to split transcript into sentences
def split_into_sentences(text):
    """Return the result of running NLTK's ``sent_tokenize`` on ``text``."""
    sentence_list = sent_tokenize(text)
    return sentence_list

# Sentence-split the merged transcript after trimming surrounding whitespace
transcript_text = full_transcript.strip()
sentences = split_into_sentences(transcript_text)

# Print each sentence with a 1-based number
for sentence_number, sentence_text in enumerate(sentences, start=1):
    print(f"Sentence {sentence_number}: {sentence_text}")


Sentence 1: Good afternoon everyone and welcome to Tesla has fourth quarter 2024, QA webcast my name is Travis Axelrod Heaven, restoration relations here at Tesla, and I am joined today by Elon Musk and them up, Tunisia, and a number of other executives.
Sentence 2: Are Q4 results, were announced at about 3 p.m. central Time in the update deck, we published at the same length as this.
Sentence 3: Webcast during this call, we will discuss our business Outlook and make forward-looking statements.
Sentence 4: These comments are based on our predictions and expectations as of today.
Sentence 5: Day actual events or results could differ materially due to a number of risks and uncertainties, including those mentioned in our most recent filings with the SEC.
Sentence 6: During the question and answer portion of today's call, please limit yourself to one question and one follow-up.
Sentence 7: Please use the raise hand button to join the question queue.
Sentence 8: Before we jump into, Q&A Elo

[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


In [13]:
def assign_timestamps_to_sentences(sentences, word_timestamps):
    """Attach a start and end time to each sentence by counting words.

    Sentences are matched to ``word_timestamps`` purely by position: a
    sentence with ``k`` whitespace-separated tokens consumes the next ``k``
    entries of ``word_timestamps``. Its start time is taken from the first of
    those entries and its end time from the last.

    Times are kept monotonic: a start earlier than the previous sentence's end
    is raised to that end, and an end earlier than the start is raised to the
    start.

    Args:
        sentences: Sentence strings in transcript order. Sentences with no
            tokens are skipped and produce no output entry.
        word_timestamps: Dicts with ``"start_time"`` and ``"end_time"`` values
            in seconds, in transcript order.

    Returns:
        A list of dicts with the keys ``"sentence"``, ``"start_time"`` and
        ``"end_time"`` (times rounded to 2 decimals). If the word list ends
        partway through a sentence, the last available word supplies the end
        time. A sentence that starts beyond the word list gets ``None`` for
        both times.
    """
    sentence_timestamps = []
    total_words = len(word_timestamps)
    word_index = 0
    last_end_time = 0

    for sentence in sentences:
        word_count = len(sentence.split())
        if word_count == 0:
            continue

        if word_index >= total_words:
            # No words left at all: keep the sentence but leave it untimed.
            sentence_timestamps.append({
                "sentence": sentence,
                "start_time": None,
                "end_time": None
            })
        else:
            # Clamp to the final word so a sentence that is only partly
            # covered still gets timestamps instead of being discarded.
            last_word_index = min(word_index + word_count, total_words) - 1

            # Never let time move backward relative to the previous sentence.
            start_time = max(word_timestamps[word_index]["start_time"], last_end_time)
            end_time = max(word_timestamps[last_word_index]["end_time"], start_time)
            last_end_time = end_time

            sentence_timestamps.append({
                "sentence": sentence,
                "start_time": round(start_time, 2),
                "end_time": round(end_time, 2)
            })

        # Advance by the sentence's token count even when it was untimed.
        word_index += word_count

    return sentence_timestamps


In [14]:
matched_sentences = assign_timestamps_to_sentences(sentences, all_word_timestamps)

# Show the first ten sentences together with their time spans
for preview in matched_sentences[:10]:
    sentence_text = preview["sentence"]
    begin, finish = preview["start_time"], preview["end_time"]
    print(f"Sentence: {sentence_text}")
    print(f"Start Time: {begin}s, End Time: {finish}s\n")

Sentence: Good afternoon everyone and welcome to Tesla has fourth quarter 2024, QA webcast my name is Travis Axelrod Heaven, restoration relations here at Tesla, and I am joined today by Elon Musk and them up, Tunisia, and a number of other executives.
Start Time: 378.6s, End Time: 391.9s

Sentence: Are Q4 results, were announced at about 3 p.m. central Time in the update deck, we published at the same length as this.
Start Time: 391.9s, End Time: 399.2s

Sentence: Webcast during this call, we will discuss our business Outlook and make forward-looking statements.
Start Time: 399.2s, End Time: 404.3s

Sentence: These comments are based on our predictions and expectations as of today.
Start Time: 404.3s, End Time: 408.3s

Sentence: Day actual events or results could differ materially due to a number of risks and uncertainties, including those mentioned in our most recent filings with the SEC.
Start Time: 408.4s, End Time: 417.3s

Sentence: During the question and answer portion of today'

### Saving Data
I had to restart the kernel and didn't want to lose the data, so the intermediate results are written to disk here.

In [33]:
import os
import pickle
import json

# Keep all intermediate results in one folder, created on demand
save_dir = "saved_data"
os.makedirs(save_dir, exist_ok=True)

# Transcript as UTF-8 text, without surrounding whitespace
transcript_path = os.path.join(save_dir, "full_transcript.txt")
with open(transcript_path, "w", encoding="utf-8") as transcript_file:
    transcript_file.write(full_transcript.strip())

# Word-level timestamps, pickled
word_timestamps_path = os.path.join(save_dir, "word_timestamps.pkl")
with open(word_timestamps_path, "wb") as word_timestamps_file:
    pickle.dump(all_word_timestamps, word_timestamps_file)

# Sentences with their matched timestamps, pickled
matched_sentences_path = os.path.join(save_dir, "matched_sentences.pkl")
with open(matched_sentences_path, "wb") as matched_sentences_file:
    pickle.dump(matched_sentences, matched_sentences_file)

# Chunk metadata as JSON
chunks_metadata_path = os.path.join(save_dir, "chunks_metadata.json")
with open(chunks_metadata_path, "w") as chunks_metadata_file:
    json.dump(chunks_metadata, chunks_metadata_file)

print(f"All data saved in folder: {save_dir}/")

All data saved in folder: saved_data/


### Getting Data

In [5]:
import os
import pickle
import json

# Folder the intermediate results are read back from
save_dir = "saved_data"

# Transcript text
transcript_path = os.path.join(save_dir, "full_transcript.txt")
with open(transcript_path, "r", encoding="utf-8") as transcript_file:
    full_transcript = transcript_file.read()

# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
word_timestamps_path = os.path.join(save_dir, "word_timestamps.pkl")
with open(word_timestamps_path, "rb") as word_timestamps_file:
    all_word_timestamps = pickle.load(word_timestamps_file)

matched_sentences_path = os.path.join(save_dir, "matched_sentences.pkl")
with open(matched_sentences_path, "rb") as matched_sentences_file:
    matched_sentences = pickle.load(matched_sentences_file)

# Chunk metadata from JSON
chunks_metadata_path = os.path.join(save_dir, "chunks_metadata.json")
with open(chunks_metadata_path, "r") as chunks_metadata_file:
    chunks_metadata = json.load(chunks_metadata_file)

print(f"All data reloaded from folder: {save_dir}/")


All data reloaded from folder: saved_data/


In [6]:
from transformers import pipeline
import pandas as pd

# Build the sentiment-analysis pipeline used to score each sentence
classifier = pipeline(task="sentiment-analysis", model="soleimanian/roberta-large")

Device set to use mps:0


In [None]:
# Score every timestamped sentence and collect one record per sentence.
# sentiment_score is +score for "positive", -score for "negative" and 0 for
# any other label.
sentiment_results = []

for entry in matched_sentences:
    sentence = entry["sentence"]

    try:
        result = classifier(sentence)[0]
        certainty_score = result["score"]
        label = result["label"]
    except Exception as exc:
        # Best-effort: keep the row with an "error" label, but report the
        # failure instead of hiding it.
        print(f"Sentiment classification failed for {sentence[:60]!r}: {exc!r}")
        label = "error"
        certainty_score = 0

    if label == "positive":
        sentiment_score = certainty_score
    elif label == "negative":
        sentiment_score = -certainty_score
    else:
        sentiment_score = 0

    # Combine the sentence, its timestamps and its scores into one record
    sentiment_results.append({
        "sentence": sentence,
        "start_time": entry["start_time"],
        "end_time": entry["end_time"],
        "label": label,
        "certainty_score": round(certainty_score, 4),
        "sentiment_score": round(sentiment_score, 4)
    })

# Build the DataFrame once from the collected records
sentiment_df = pd.DataFrame(sentiment_results)

(650, 6)

In [13]:
# Print the first rows, then the (rows, columns) shape of the results
print("head:", sentiment_df.head(), sep="\n")
print("\nshape:", sentiment_df.shape, sep="\n")

head:
                                            sentence  start_time  end_time  \
0  Good afternoon everyone and welcome to Tesla h...       378.6     391.9   
1  Are Q4 results, were announced at about 3 p.m....       391.9     399.2   
2  Webcast during this call, we will discuss our ...       399.2     404.3   
3  These comments are based on our predictions an...       404.3     408.3   
4  Day actual events or results could differ mate...       408.4     417.3   

     label  certainty_score  sentiment_score  
0  neutral           0.9996              0.0  
1  neutral           0.9996              0.0  
2  neutral           0.9996              0.0  
3  neutral           0.9996              0.0  
4  neutral           0.9995              0.0  

shape:
(650, 6)


### Saving to CSV

In [14]:
# Write the results to CSV without the DataFrame index column
sentiment_csv_path = "saved_data/sentiment_analysis.csv"
sentiment_df.to_csv(sentiment_csv_path, index=False)

The data is now ready to send to the sentiment analysis team.