In [2]:
import re
from datetime import timedelta

def parse_srt(srt_content):
    """Parse SubRip (SRT) text into a list of cue dictionaries.

    Each cue is expected to have the shape::

        <index>
        HH:MM:SS,mmm --> HH:MM:SS,mmm
        <one or more lines of text>

    Args:
        srt_content: Full text of an SRT document.

    Returns:
        A list of dicts in input order, each with the keys ``"index"`` (int),
        ``"start"`` and ``"end"`` (timestamp strings exactly as written in the
        input) and ``"text"`` (the cue text with line breaks replaced by
        single spaces and surrounding whitespace stripped). Input that
        contains no well-formed cue yields an empty list.
    """
    # Normalise line endings so a stray "\r" can never leak into the cue text.
    srt_content = srt_content.replace("\r\n", "\n").replace("\r", "\n")

    timestamp = r"\d{2}:\d{2}:\d{2},\d{3}"
    # A cue's text ends either right before something shaped like the next
    # cue header (index, timestamp, arrow) or at the end of the input.
    # Requiring the whole header shape keeps numbers inside the subtitle text
    # (e.g. "3 10 times") from being mistaken for the next cue, and allowing
    # zero trailing whitespace before the end keeps the final cue when the
    # input has no trailing newline.
    next_header = r"\d+\s+\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->"
    cue_head = r"(\d+)\s+(" + timestamp + r") --> (" + timestamp + r")\s+"
    cue_tail = r"(.*?)(?:\s+(?=" + next_header + r")|\s*\Z)"
    pattern = re.compile(cue_head + cue_tail, re.DOTALL)

    entries = []
    for match in pattern.finditer(srt_content):
        entries.append({
            "index": int(match.group(1)),
            "start": match.group(2),
            "end": match.group(3),
            # Multi-line cues are flattened onto a single line.
            "text": match.group(4).replace("\n", " ").strip(),
        })
    return entries

def time_to_timedelta(t):
    """Convert an SRT timestamp string (``HH:MM:SS,mmm``) into a ``timedelta``.

    Args:
        t: Timestamp with colon-separated hours, minutes and seconds, and the
            milliseconds separated from the seconds by a comma.

    Returns:
        The ``timedelta`` representing that offset.

    Raises:
        ValueError: If the string does not split into the expected fields or
            a field is not an integer.
    """
    hours_part, minutes_part, tail = t.split(":")
    seconds_part, millis_part = tail.split(",")
    hours, minutes, seconds, millis = (
        int(field)
        for field in (hours_part, minutes_part, seconds_part, millis_part)
    )
    return timedelta(
        hours=hours,
        minutes=minutes,
        seconds=seconds,
        milliseconds=millis,
    )

def timedelta_to_time(td):
    """Format a ``timedelta`` as an SRT timestamp (``HH:MM:SS,mmm``).

    The result uses the same layout that ``time_to_timedelta`` parses:
    zero-padded hours, minutes and seconds, a comma, and exactly three
    millisecond digits. Sub-millisecond precision is truncated. Durations of a
    day or more are folded into the hour field (e.g. ``24:00:00,000``).

    Args:
        td: The ``timedelta`` to format.

    Returns:
        The formatted timestamp string. A negative duration is rendered with a
        leading ``-`` followed by the formatted absolute value.
    """
    sign = "-" if td < timedelta(0) else ""
    # Integer arithmetic avoids the float rounding of total_seconds().
    total_ms = abs(td) // timedelta(milliseconds=1)
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"

def _merge_buffer(buffer):
    """Collapse a run of consecutive cues into one cue spanning all of them."""
    return {
        "start": buffer[0]["start"],
        "end": buffer[-1]["end"],
        "text": " ".join(b["text"] for b in buffer),
    }


def merge_sentences(entries, time_gap_threshold=1.0):
    """Merge consecutive cues that appear to belong to the same sentence.

    A cue is appended to the preceding run when the gap between the previous
    cue's end and its own start is shorter than ``time_gap_threshold`` seconds
    AND the previous cue's text does not end in ``.``, ``!`` or ``?``.
    Otherwise the current run is emitted as one cue and a new run starts.

    Args:
        entries: Sequence of cue dicts with ``"start"``, ``"end"`` (SRT
            timestamp strings) and ``"text"`` keys. The input dicts are not
            modified.
        time_gap_threshold: Maximum gap, in seconds, across which two cues
            may still be merged.

    Returns:
        A new list of cue dicts with ``"start"``, ``"end"``, ``"text"`` and a
        fresh 1-based ``"index"``. An empty input yields an empty list.
    """
    entries = list(entries)
    if not entries:
        # Nothing to merge (e.g. the parser found no cues).
        return []

    merged = []
    buffer = [entries[0]]

    for curr in entries[1:]:
        prev = buffer[-1]
        prev_end = time_to_timedelta(prev["end"])
        curr_start = time_to_timedelta(curr["start"])
        gap = (curr_start - prev_end).total_seconds()

        # Merge condition: short gap and no sentence-ending punctuation at
        # the end of the previous cue.
        if gap < time_gap_threshold and not re.search(r"[.!?]$", prev["text"]):
            buffer.append(curr)
        else:
            merged.append(_merge_buffer(buffer))
            buffer = [curr]

    # The last run is still pending once the loop ends.
    merged.append(_merge_buffer(buffer))

    # Re-number the merged cues from 1.
    for i, entry in enumerate(merged, start=1):
        entry["index"] = i
    return merged

def format_srt(entries):
    """Render cue dictionaries as SRT text.

    Every cue contributes its index line, its ``start --> end`` timing line,
    its text and one empty line; all lines are joined with ``"\\n"``.

    Args:
        entries: Iterable of dicts with ``"index"``, ``"start"``, ``"end"``
            and ``"text"`` keys.

    Returns:
        The SRT document as a single string (empty for no entries).
    """
    def cue_lines(cue):
        # The four output lines that make up a single cue.
        yield f"{cue['index']}"
        yield f"{cue['start']} --> {cue['end']}"
        yield cue["text"]
        yield ""

    return "\n".join(line for cue in entries for line in cue_lines(cue))

# Example usage: read an SRT file, merge cues that belong to the same
# sentence, and print the re-numbered result.
# The input is read as UTF-8 from "example.srt", a path relative to the
# current working directory.
with open("example.srt", "r", encoding="utf-8") as f:
    raw_srt = f.read()

# Pipeline: raw text -> list of cue dicts -> merged cue dicts -> SRT text.
# merge_sentences runs with its default time_gap_threshold of 1.0 seconds.
parsed = parse_srt(raw_srt)
merged = merge_sentences(parsed)
new_srt = format_srt(merged)

# Show the merged subtitles inline, prefixed with "new_srt: ".
print(f'new_srt: {new_srt}')
# Uncomment to write the merged subtitles to "merged_output.srt" instead of
# only printing them:
# with open("merged_output.srt", "w", encoding="utf-8") as f:
#     f.write(new_srt)

new_srt: 1
00:00:00,000 --> 00:00:07,600
It's Tuesday, April 8th, here's what's happening right now on CNN this morning.

2
00:00:07,600 --> 00:00:13,200
I have great respect for China, but they can't do this.

3
00:00:13,200 --> 00:00:19,280
President Trump threatens to escalate his trade war with China, issuing them an ultimatum and a deadline.

4
00:00:19,280 --> 00:00:30,880
The China does not appear to be backing down, plus our team will get to work tomorrow to deport these heinous, violent foreign terrorists.

5
00:00:30,880 --> 00:00:39,800
The White House can resume rapid deportations, but there's a catch, new rules from the Supreme Court, also.

6
00:00:39,800 --> 00:00:44,240
This is the highest I've seen the water in my lifetime.

7
00:00:44,240 --> 00:00:50,000
The flooding danger is not over as deadly storms leave some communities under water.

8
00:00:50,000 --> 00:00:58,480
And in three short years, we got the University of Florida's basketball program back where it belo

In [6]:
import datetime
# Current local time at the moment this cell executes, formatted as
# YYYY-MM-DDTHH:MM:SS (naive: no timezone offset is included).
datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

'2025-08-06T00:44:59'