In [1]:
import os
import re
import time
from urllib.parse import quote, urlencode

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Start a Chrome session and land on the archive's login page.
# NOTE(review): nothing in this notebook section submits credentials, so the
# login is presumably completed by hand in the opened window - confirm.
login_url = "https://archive.vogue.com/login"
driver = webdriver.Chrome()
driver.get(login_url)

In [3]:
# Search configuration: the term to look up and the date range to sweep.
term = "bloom"
start_date = pd.to_datetime("1920-01-01")
end_date = pd.to_datetime("2023-01-01")

In [4]:
def make_term_search(driver, term, start_date, end_date):
    """Navigate ``driver`` to the archive search page for ``term`` in a date window.

    Args:
        driver: Object exposing ``get(url)`` (a selenium WebDriver in this notebook).
        term: Search term; may contain spaces or reserved URL characters.
        start_date: Window start, placed in the ``startDate`` query parameter.
        end_date: Window end, placed in the ``endDate`` query parameter.

    Returns:
        None. The only effect is the navigation performed by ``driver.get``.
    """
    # Percent-encode every value so terms containing spaces, '&', '#' or '='
    # cannot break or truncate the query string. quote (not quote_plus) is used
    # so that spaces become %20 rather than '+'.
    query = urlencode(
        {"QueryTerm": term, "startDate": start_date, "endDate": end_date},
        quote_via=quote,
    )
    driver.get(f"https://archive.vogue.com/search?{query}")

def get_count(driver):
    """Read the result count shown on the current search page.

    Looks up elements with class name ``count`` and parses the first
    parenthesised number in the first element's text, e.g. ``"(130)"``.

    Args:
        driver: Object exposing ``find_elements(by, value)`` (a selenium
            WebDriver in this notebook).

    Returns:
        The count as an ``int``, or ``None`` when there is no ``count``
        element or its text contains no parenthesised number.
    """
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None

    # Accept thousands separators such as "(1,234)" and tolerate text with no
    # parentheses at all, instead of raising IndexError/ValueError mid-scrape.
    match = re.search(r"\(\s*(\d[\d,]*)\s*\)", count_elements[0].text)
    if match is None:
        return None
    return int(match.group(1).replace(",", ""))


# Sweep [start_date, end_date) in fixed-size windows. For each window: run the
# archive search for `term`, read the reported hit count, save the raw results
# page to disk, and record one summary row.
increment = pd.DateOffset(years=1)
results = []
current_date = start_date

os.makedirs("vogue_resource_dump", exist_ok=True)

while current_date < end_date:
    # Clamp the final window so it never extends past end_date when the range
    # is not a whole multiple of `increment`.
    next_date = min(current_date + increment, end_date)
    window_start = current_date.strftime("%Y-%m-%d")
    window_end = next_date.strftime("%Y-%m-%d")
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, window_start, window_end)

    # NOTE(review): the count is read immediately after navigation with no
    # explicit wait; confirm the count element is always present by then.
    count = get_count(driver)
    print(f"{count} results found")

    # page_source is Unicode text; write it as UTF-8 explicitly so the dump
    # does not fail (or get mangled) under a non-UTF-8 platform default encoding.
    dump_path = f"vogue_resource_dump/{term}_{window_start}_{window_end}.html"
    with open(dump_path, "w", encoding="utf-8") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


# One row per window; "Result Count" is None where no count was found.
blossom_df = pd.DataFrame(results)
print("Search Results Summary:")
print(blossom_df)

Searching from 1920-01-01 00:00:00 to 1921-01-01 00:00:00
130 results found
Searching from 1921-01-01 00:00:00 to 1922-01-01 00:00:00
89 results found
Searching from 1922-01-01 00:00:00 to 1923-01-01 00:00:00
65 results found
Searching from 1923-01-01 00:00:00 to 1924-01-01 00:00:00
63 results found
Searching from 1924-01-01 00:00:00 to 1925-01-01 00:00:00
66 results found
Searching from 1925-01-01 00:00:00 to 1926-01-01 00:00:00
100 results found
Searching from 1926-01-01 00:00:00 to 1927-01-01 00:00:00
64 results found
Searching from 1927-01-01 00:00:00 to 1928-01-01 00:00:00
94 results found
Searching from 1928-01-01 00:00:00 to 1929-01-01 00:00:00
73 results found
Searching from 1929-01-01 00:00:00 to 1930-01-01 00:00:00
69 results found
Searching from 1930-01-01 00:00:00 to 1931-01-01 00:00:00
52 results found
Searching from 1931-01-01 00:00:00 to 1932-01-01 00:00:00
63 results found
Searching from 1932-01-01 00:00:00 to 1933-01-01 00:00:00
56 results found
Searching from 1933-01-

In [5]:
# Persist the per-window summary table without the positional index column.
# NOTE(review): the frame and file are named "blossom" while the configured
# term is "bloom" - confirm the name is intentional.
blossom_df.to_csv("blossom_df.csv", index=False)