In [8]:
import re
import json
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from keybert import KeyBERT
from collections import defaultdict
from difflib import SequenceMatcher
import numpy as np

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load the transcript text file and JSON file
with open('transcript.txt', 'r', encoding='latin-1') as file:
    text = file.read()

with open('transcript.json', 'r') as file:
    transcript_json = json.load(file)

def clean_text(text):
    text = text.replace('\\n', ' ').strip()
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    filler_words = r'\b(um|uh|like|you know|uhh|basically|right|oh)\b'
    text = re.sub(filler_words, '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z0-9.!? ]+', '', text)
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return ' '.join([sentence.capitalize() for sentence in sentences])

# Apply the updated cleaning function
cleaned_text = clean_text(text)

# Step 2: Preprocess by removing stopwords and tokenizing
stop_words = set(stopwords.words('english'))
def remove_stopwords(sentence):
    words = word_tokenize(sentence)
    filtered_words = [w for w in words if w.lower() not in stop_words]
    return " ".join(filtered_words)

# Preprocess the cleaned text
preprocessed_text = remove_stopwords(cleaned_text)
sentences = sent_tokenize(preprocessed_text)

# Step 3: TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

# Step 4: DBSCAN Clustering
dbscan = DBSCAN(eps=0.5, min_samples=2, metric='cosine')
dbscan.fit(X)

# Step 5: Group sentences by clusters
clustered_sentences = defaultdict(list)
for sentence, cluster in zip(sentences, dbscan.labels_):
    clustered_sentences[cluster].append(sentence)

# Step 6: Extract keywords for each sentence using KeyBERT
kw_model = KeyBERT()
sentence_keywords = {}
for sentence in sentences:
    keywords = kw_model.extract_keywords(sentence, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=3)
    sentence_keywords[sentence] = [kw[0].lower() for kw in keywords]

# Step 7: Function to calculate string similarity
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

# Step 8: Match sentences with timestamps from the JSON transcript
def match_sentences_to_time(sentences, transcript_json, threshold=0.5):
    matched_times = []
    for sentence in sentences:
        for segment in transcript_json:
            if similar(sentence.strip().lower(), segment['text'].strip().lower()) > threshold:
                matched_times.append({
                    'start_time': segment['offset'] / 1000,  # Convert to seconds
                    'end_time': (segment['offset'] + segment['duration']) / 1000,  # Convert to seconds
                    'text': sentence
                })
                break
    return matched_times

# Call matching function
matched_times = match_sentences_to_time(sentences, transcript_json)

# Step 9: Categorize sentences based on keywords
keywords_dict = {
    'AI-Assisted App Development': ['ai', 'app development', 'software'],
    'User Interface and Design': ['user interface', 'design', 'ux'],
    'Entrepreneurial Mindset': ['startup', 'entrepreneurial', 'business'],
    'Tools and Technologies': ['technology', 'tools', 'software']
}

def classify_sentence(sentence):
    for category, keywords in keywords_dict.items():
        for keyword in keywords:
            if keyword in sentence.lower():
                return category
    return None

# Step 10: Compile final output in the desired format
# Step 10: Compile final output in the desired format, including category
output = []
for sentence in sentences:
    category = classify_sentence(sentence)
    if category:
        matched_time = match_sentences_to_time([sentence], transcript_json)
        if matched_time:
            output.append({
                'category': category,  # Add the category here
                'start_time': matched_time[0]['start_time'],
                'end_time': matched_time[0]['end_time'],
                'text': sentence
            })

# Print the final output in the specified format, including categories
formatted_output = []
for item in output:
    formatted_output.append({
        "category": item['category'],  # Include category in the output
        "start_time": item['start_time'],
        "end_time": item['end_time'],
        "text": item['text']
    })

# Display the formatted output
print(json.dumps(formatted_output, indent=2))



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\apasi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apasi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[
  {
    "category": "AI-Assisted App Development",
    "start_time": 295.96,
    "end_time": 298.24,
    "text": "going vzer going able add ai features thats going done back end ."
  },
  {
    "category": "Entrepreneurial Mindset",
    "start_time": 61.6,
    "end_time": 64.199,
    "text": "ive got 30 plus startup ideas could make millions ."
  },
  {
    "category": "AI-Assisted App Development",
    "start_time": 666.639,
    "end_time": 669.72,
    "text": "Okay could actually make useful wait hold ."
  },
  {
    "category": "Entrepreneurial Mindset",
    "start_time": 309.72,
    "end_time": 311.8,
    "text": "section show people submitted startup ideas show via text prompt ."
  },
  {
    "category": "AI-Assisted App Development",
    "start_time": 832.36,
    "end_time": 836.079,
    "text": "Well itll knows types animations trained data ."
  },
  {
    "category": "AI-Assisted App Development",
    "start_time": 922.839,
    "end_time": 927.399,
    "text": "says great ide