In [76]:
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timezone, timedelta
import pytz
from dateutil import tz

# Load only the leading part of the watch-history HTML export.
# NOTE: the file is opened in text mode, so read(10_000_000) caps the read at
# 10,000,000 *characters* (not bytes). Anything past the cap is not loaded, and
# the cut can land in the middle of an element, so the last entry read may be
# incomplete.
with open("Aaryan_takeout/watch-history.html", "r", encoding="utf-8") as f:
    html_content = f.read(10_000_000)  # Read only the first 10M characters for speed (adjust as needed)


In [77]:
def convert_to_utc(datetime_str):
    """Parse a watch-history timestamp string into a timezone-aware UTC datetime.

    The expected shape is ``"Feb 14, 2025, 8:38:54 PM EST"``: a date/time in
    the format ``%b %d, %Y, %I:%M:%S %p``, optionally followed by a single
    timezone token.

    Args:
        datetime_str: Timestamp text. Every run of whitespace is collapsed to
            one ordinary space before parsing, so non-ASCII spaces (for
            example a narrow no-break space before AM/PM) do not break the
            format match.

    Returns:
        A timezone-aware ``datetime`` expressed in UTC. When no timezone token
        is present, or the token cannot be resolved by ``tz.gettz``, the
        wall-clock time is assumed to already be UTC (a warning is printed in
        the unresolved case).

    Raises:
        ValueError: If the date/time portion does not match the expected format.
    """
    timestamp_format = "%b %d, %Y, %I:%M:%S %p"

    # str.split() with no argument splits on any Unicode whitespace.
    normalized = " ".join(datetime_str.split())

    try:
        # First try the whole string: a timestamp without a timezone token
        # ends in AM/PM, and splitting it at the last space would wrongly
        # treat "AM"/"PM" as the timezone.
        naive_dt = datetime.strptime(normalized, timestamp_format)
        tz_part = None
    except ValueError:
        # Otherwise the last space-separated token is taken as the timezone.
        date_part, _, tz_part = normalized.rpartition(" ")
        naive_dt = datetime.strptime(date_part, timestamp_format)

    local_tz = timezone.utc  # fallback when no usable timezone is available
    if tz_part:
        try:
            # Convert string timezone to a valid tz object
            resolved_tz = tz.gettz(tz_part)
            if resolved_tz is None:
                raise ValueError(f"Unknown timezone: {tz_part}")
            local_tz = resolved_tz
        except Exception as e:
            # Best effort: keep the row and treat the wall-clock time as UTC.
            print(f"Warning: {e}. Assuming UTC.")

    # Attach the source timezone to the naive value, then convert to UTC.
    return naive_dt.replace(tzinfo=local_tz).astimezone(timezone.utc)

In [78]:
# Build the parse tree with the built-in "html.parser" backend, then collect
# up to 100,000 <div class="outer-cell"> elements. Their text content is
# extracted in a later step.
soup = BeautifulSoup(html_content, features="html.parser")
outer_cells = soup.find_all(name="div", attrs={"class": "outer-cell"}, limit=100000)
print("finished")

finished


In [79]:
# Turn each collected cell into a [title, UTC datetime] row.
content_cell_class = "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1"

data = []
skipped = 0
for cell in outer_cells:
    # Cells mentioning "From Google Ads" are excluded from the result.
    if "From Google Ads" in cell.get_text():
        continue
    try:
        # One list item per text fragment inside the content cell; the second
        # fragment is used as the title and the last one as the timestamp.
        cell_text = cell.find("div", class_=content_cell_class).get_text("\n", strip=True).split("\n")
        title = cell_text[1]
        time_dt = convert_to_utc(cell_text[-1])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: no matching content cell (find() returned None).
        # IndexError: fewer than two text fragments.
        # ValueError: the timestamp text did not match the expected format.
        # Anything else (including KeyboardInterrupt) is allowed to propagate.
        skipped += 1
        continue
    data.append([title, time_dt])

if skipped:
    print(f"Skipped {skipped} of {len(outer_cells)} cells that could not be parsed.")

# Convert extracted data to a DataFrame
df = pd.DataFrame(data, columns=["title", "datetime"])

In [80]:
# Print the earliest, then the latest, parsed timestamp as a quick check of
# the period the loaded data covers.
for bound in (df["datetime"].min(), df["datetime"].max()):
    print(bound)

2024-10-30 18:30:10+00:00
2025-02-15 01:38:54+00:00


In [81]:
# Window boundaries, both expressed in UTC.
dec_1 = datetime(year=2024, month=12, day=1, tzinfo=timezone.utc)
feb_1 = datetime(year=2025, month=2, day=1, tzinfo=timezone.utc)

# Keep only rows whose timestamp lies between dec_1 and feb_1; between() is
# inclusive on both ends by default, matching `>= dec_1` and `<= feb_1`.
in_window = df["datetime"].between(dec_1, feb_1)
df = df[in_window]

In [82]:
# Save the current `df` as a pickle file named "youtube.pkl"; the path is
# relative, so it resolves against the current working directory.
# NOTE: pickle files should only be loaded back from sources you trust.
df.to_pickle("youtube.pkl")