In [2]:
import re
import json
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from keybert import KeyBERT
from collections import defaultdict
from difflib import SequenceMatcher

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\apasi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apasi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load the transcript text file and JSON file
with open('transcript.txt', 'r', encoding='latin-1') as file:
    text = file.read()

with open('transcript.json', 'r') as file:
    transcript_json = json.load(file)

In [6]:
#cleaning  the data
def clean_text(text):
    # Remove newline characters and multiple spaces
    text = text.replace('\\n', ' ').strip()
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space

    # Remove filler words and phrases
    filler_words = r'\b(um|uh|like|you know|uhh|basically|right|oh)\b'
    text = re.sub(filler_words, '', text, flags=re.IGNORECASE)

    # Remove punctuation (except sentence-ending punctuation)
    text = re.sub(r'[^a-zA-Z0-9.!? ]+', '', text)

    # Split the text into sentences for further processing
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    # Capitalize each sentence properly
    sentences = [sentence.capitalize() for sentence in sentences]

    # Join sentences back into a single cleaned text
    return ' '.join(sentences)


In [7]:
# Apply the updated cleaning function
cleaned_text = clean_text(text)
cleaned_text

'Theres two types of people whove watched my videos and message me theres the people and its not nowhere in between either they have fully pushed to the end and created  a full app that they love or they got stuck on  the first few steps and gave up once you get the aha moment where youre   this works and you realize that  youre in charge you dont need to ask anyone  influencer you dont need an influencer or a teacher you just need to ask ai you need to ask claude be  . Hey it didnt work for this. And it might not give you the  answer on the first try but odds are by the second or third try  you will will your way to a working app. I its guaranteed i just think it separates the high agency people and the low agency. People  pretty music well  riley welcome to the show this is a startup ideas podcast where we talk about startup ideas but were not necessarily talking about startup ideas today were talking about how to how to use some so ai tools to build software . . Yeah . Its gotten a 

In [6]:
# Step 2: Preprocess by removing stopwords and tokenizing
stop_words = set(stopwords.words('english'))
def remove_stopwords(sentence):
    words = word_tokenize(sentence)
    filtered_words = [w for w in words if w.lower() not in stop_words]
    return " ".join(filtered_words)


In [7]:
# Preprocess the cleaned text
preprocessed_text = remove_stopwords(cleaned_text)
sentences = sent_tokenize(preprocessed_text)

In [8]:
# Step 3: TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

In [9]:
# Step 4: KMeans Clustering (you can adjust the number of clusters)
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X)


In [10]:
# Step 5: Group sentences by clusters
clustered_sentences = {i: [] for i in range(num_clusters)}
for sentence, cluster in zip(sentences, kmeans.labels_):
    clustered_sentences[cluster].append(sentence)


In [11]:
# Step 6: Extract keywords for each sentence using KeyBERT
kw_model = KeyBERT()
sentence_keywords = {}
for sentence in sentences:
    keywords = kw_model.extract_keywords(sentence, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=3)
    sentence_keywords[sentence] = [kw[0].lower() for kw in keywords]


In [12]:
# Step 7: Function to calculate string similarity
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [13]:
# Step 8: Match sentences with timestamps from the JSON transcript
def match_sentences_to_time(sentences, transcript_json, threshold=0.5):
    matched_times = []
    for sentence in sentences:
        for segment in transcript_json:
            if similar(sentence.strip().lower(), segment['text'].strip().lower()) > threshold:
                matched_times.append({
                    'start_time': segment['offset'] / 1000,  # Convert to seconds
                    'end_time': (segment['offset'] + segment['duration']) / 1000,  # Convert to seconds
                    'text': sentence
                })
                break
    return matched_times

# Call matching function
matched_times = match_sentences_to_time(sentences, transcript_json)


In [14]:
# Step 9: Categorize sentences based on keywords
keywords_dict = {
    'AI-Assisted App Development': ['ai', 'app development', 'software'],
    'User Interface and Design': ['user interface', 'design', 'ux'],
    'Entrepreneurial Mindset': ['startup', 'entrepreneurial', 'business'],
    'Tools and Technologies': ['technology', 'tools', 'software']
}

def classify_sentence(sentence):
    for category, keywords in keywords_dict.items():
        for keyword in keywords:
            if keyword in sentence.lower():
                return category
    return None

In [15]:
# Step 10: Compile final output in the desired format
output = []
for sentence in sentences:
    category = classify_sentence(sentence)
    if category:
        matched_time = match_sentences_to_time([sentence], transcript_json)
        if matched_time:
            output.append({
                'start_time': matched_time[0]['start_time'],
                'end_time': matched_time[0]['end_time'],
                'text': sentence
            })

# Print the final output in the specified format
formatted_output = []
for item in output:
    formatted_output.append({
        "start_time": item['start_time'],
        "end_time": item['end_time'],
        "text": item['text']
    })

# Display the formatted output
print(json.dumps(formatted_output, indent=2))

[
  {
    "start_time": 295.96,
    "end_time": 298.24,
    "text": "going vzer going able add ai features thats going done back end ."
  },
  {
    "start_time": 61.6,
    "end_time": 64.199,
    "text": "ive got 30 plus startup ideas could make millions ."
  },
  {
    "start_time": 666.639,
    "end_time": 669.72,
    "text": "Okay could actually make useful wait hold ."
  },
  {
    "start_time": 309.72,
    "end_time": 311.8,
    "text": "section show people submitted startup ideas show via text prompt ."
  },
  {
    "start_time": 832.36,
    "end_time": 836.079,
    "text": "Well itll knows types animations trained data ."
  },
  {
    "start_time": 922.839,
    "end_time": 927.399,
    "text": "says great idea needs impr wait ."
  },
  {
    "start_time": 280.24,
    "end_time": 283.12,
    "text": "decision decision made go next startup idea ."
  },
  {
    "start_time": 1113.48,
    "end_time": 1118.159,
    "text": "Okay cool lets lets try wait reversed ."
  },
  {
    "star