In [None]:
import re
import pandas as pd
from tqdm import tqdm
import html as ihtml

# Read the exported watch-history page into memory as UTF-8 text
with open('watch-history.html', 'r', encoding='utf-8') as fh:
    html = fh.read()

# Pull out every "outer-cell" block with a lazy regex instead of building a DOM.
# A block runs from its opening tag to the first two closing </div> tags that
# are separated only by optional whitespace.
outer_cell_pattern = re.compile(r'<div class="outer-cell[\s\S]*?<\/div>\s*<\/div>')
entries = outer_cell_pattern.findall(html)

In [None]:
# Sanity check: number of "outer-cell" blocks captured in `entries`
len(entries)

In [None]:
# Patterns are compiled once up front rather than looked up on every iteration.
# First div whose class list is exactly the 6-column body cell; its content
# holds the watch event itself (title link, channel link, timestamp).
_CONTENT_CELL_RE = re.compile(
    r'<div class="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1">(.*?)<\/div>',
    re.S,
)
# Anchor tags, captured as (href, link text) pairs.
_LINK_RE = re.compile(r'<a href="(.*?)">(.*?)<\/a>')
# Timestamps shaped like "5 Jan 2024, 14:03:22 GMT".
_TIMESTAMP_RE = re.compile(r'(\d{1,2} \w+ \d{4}, \d{2}:\d{2}:\d{2} [A-Z]+)')


def _parse_watch_entry(entry):
    """Turn one "outer-cell" HTML block into a watch record.

    Parameters
    ----------
    entry : str
        Raw HTML of a single outer-cell block.

    Returns
    -------
    dict or None
        A dict with the keys ``video_name``, ``video_link``, ``channel_name``,
        ``channel_link`` and ``watch_time`` (all strings), or ``None`` when the
        block is not a usable watch event: it lacks the word "Watched", has no
        matching content cell, starts with a bare URL, has fewer than two
        links, has no recognisable timestamp, or its title is just its URL.
    """
    if "Watched" not in entry:
        return None  # Not related to a watched video

    cell_match = _CONTENT_CELL_RE.search(entry)
    if not cell_match:
        return None
    watched_section = cell_match.group(1)

    if watched_section.strip().startswith("https://"):
        return None  # Bad auto-logged link rather than a real watch event

    links = _LINK_RE.findall(watched_section)
    time_match = _TIMESTAMP_RE.search(watched_section)

    # A full record needs the video link, the channel link and a timestamp.
    # An entry with a single link has no channel (treated here as a video that
    # is no longer available) and is dropped along with anything else unusable.
    if len(links) < 2 or not time_match:
        return None

    (raw_video_link, raw_video_name), (raw_channel_link, raw_channel_name) = links[:2]

    video_name = ihtml.unescape(raw_video_name.strip())
    channel_name = ihtml.unescape(raw_channel_name.strip())
    # href attribute values are entity-encoded in HTML just like text nodes, so
    # decode them too. Otherwise stored URLs keep sequences such as "&amp;" and
    # the title-vs-URL comparison below pits decoded text against encoded text.
    video_link = ihtml.unescape(raw_video_link).strip()
    channel_link = ihtml.unescape(raw_channel_link).strip()

    # Skip records whose "title" is merely the URL itself
    if video_name.strip() == video_link:
        return None

    return {
        'video_name': video_name,
        'video_link': video_link,
        'channel_name': channel_name,
        'channel_link': channel_link,
        'watch_time': time_match.group(1).strip()
    }


# Rebuilt from scratch on every run so re-executing the cell never duplicates rows
data = []

for entry in tqdm(entries, desc="Processing entries"):
    record = _parse_watch_entry(entry)
    if record is not None:
        data.append(record)

In [None]:
# Number of watch records kept in `data` after the skips in the parsing loop
len(data)

In [None]:
# Assemble the collected records into a table (one row per record)
df = pd.DataFrame(data=data)

# Persist the table as CSV, leaving out the positional index column
csv_path = 'watch-history.csv'
df.to_csv(csv_path, index=False)

# Report how many rows were written
print(f"Extracted {len(df)} entries and saved to watch-history.csv")

In [None]:
# Show the resulting DataFrame as the cell output for a visual check
df