In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
import numpy as np
import os

In [17]:
driver = webdriver.Chrome()
driver.get("https://archive.vogue.com/login")

In [13]:
start_date = pd.to_datetime("1900-01-01")
end_date = pd.to_datetime("2024-01-01")
term = "suit"

def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=10)
results = []
current_date = start_date

os.makedirs("vogue_resource_dump", exist_ok=True)

while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)

driver.quit()

results_df = pd.DataFrame(results)
print("Search Results Summary:")
print(results_df)

Searching from 1900-01-01 00:00:00 to 1910-01-01 00:00:00
4156 results found
Searching from 1910-01-01 00:00:00 to 1920-01-01 00:00:00
7321 results found
Searching from 1920-01-01 00:00:00 to 1930-01-01 00:00:00
5121 results found
Searching from 1930-01-01 00:00:00 to 1940-01-01 00:00:00
6708 results found
Searching from 1940-01-01 00:00:00 to 1950-01-01 00:00:00
8322 results found
Searching from 1950-01-01 00:00:00 to 1960-01-01 00:00:00
9642 results found
Searching from 1960-01-01 00:00:00 to 1970-01-01 00:00:00
6928 results found
Searching from 1970-01-01 00:00:00 to 1980-01-01 00:00:00
4011 results found
Searching from 1980-01-01 00:00:00 to 1990-01-01 00:00:00
4369 results found
Searching from 1990-01-01 00:00:00 to 2000-01-01 00:00:00
3825 results found
Searching from 2000-01-01 00:00:00 to 2010-01-01 00:00:00
2718 results found
Searching from 2010-01-01 00:00:00 to 2020-01-01 00:00:00
1432 results found
Searching from 2020-01-01 00:00:00 to 2030-01-01 00:00:00
133 results found


In [14]:
print("\nSearch Results Summary:")
print(results_df.to_string(index=False))


Search Results Summary:
Term Start Date   End Date  Result Count
suit 1900-01-01 1910-01-01          4156
suit 1910-01-01 1920-01-01          7321
suit 1920-01-01 1930-01-01          5121
suit 1930-01-01 1940-01-01          6708
suit 1940-01-01 1950-01-01          8322
suit 1950-01-01 1960-01-01          9642
suit 1960-01-01 1970-01-01          6928
suit 1970-01-01 1980-01-01          4011
suit 1980-01-01 1990-01-01          4369
suit 1990-01-01 2000-01-01          3825
suit 2000-01-01 2010-01-01          2718
suit 2010-01-01 2020-01-01          1432
suit 2020-01-01 2030-01-01           133


In [15]:
results_df.to_csv('suit_count_decade.csv', index=False)

In [19]:
start_date = pd.to_datetime("1900-01-01")
end_date = pd.to_datetime("2024-01-01")
term = "shoulder pads"

def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=10)
results = []
current_date = start_date

os.makedirs("vogue_resource_dump", exist_ok=True)

while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)

driver.quit()

shoulderpads_df = pd.DataFrame(results)
print("Search Results Summary:")
print(shoulderpads_df)

Searching from 1900-01-01 00:00:00 to 1910-01-01 00:00:00
32 results found
Searching from 1910-01-01 00:00:00 to 1920-01-01 00:00:00
22 results found
Searching from 1920-01-01 00:00:00 to 1930-01-01 00:00:00
27 results found
Searching from 1930-01-01 00:00:00 to 1940-01-01 00:00:00
10 results found
Searching from 1940-01-01 00:00:00 to 1950-01-01 00:00:00
86 results found
Searching from 1950-01-01 00:00:00 to 1960-01-01 00:00:00
16 results found
Searching from 1960-01-01 00:00:00 to 1970-01-01 00:00:00
48 results found
Searching from 1970-01-01 00:00:00 to 1980-01-01 00:00:00
21 results found
Searching from 1980-01-01 00:00:00 to 1990-01-01 00:00:00
69 results found
Searching from 1990-01-01 00:00:00 to 2000-01-01 00:00:00
97 results found
Searching from 2000-01-01 00:00:00 to 2010-01-01 00:00:00
113 results found
Searching from 2010-01-01 00:00:00 to 2020-01-01 00:00:00
22 results found
Searching from 2020-01-01 00:00:00 to 2030-01-01 00:00:00
3 results found
Search Results Summary:
 

In [20]:
print("\nSearch Results Summary:")
print(shoulderpads_df.to_string(index=False))
shoulderpads_df.to_csv('shoulderpads_df.csv', index=False)


Search Results Summary:
         Term Start Date   End Date  Result Count
shoulder pads 1900-01-01 1910-01-01            32
shoulder pads 1910-01-01 1920-01-01            22
shoulder pads 1920-01-01 1930-01-01            27
shoulder pads 1930-01-01 1940-01-01            10
shoulder pads 1940-01-01 1950-01-01            86
shoulder pads 1950-01-01 1960-01-01            16
shoulder pads 1960-01-01 1970-01-01            48
shoulder pads 1970-01-01 1980-01-01            21
shoulder pads 1980-01-01 1990-01-01            69
shoulder pads 1990-01-01 2000-01-01            97
shoulder pads 2000-01-01 2010-01-01           113
shoulder pads 2010-01-01 2020-01-01            22
shoulder pads 2020-01-01 2030-01-01             3
