In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
import numpy as np
import os

In [31]:
driver = webdriver.Chrome()
driver.get("https://archive.vogue.com/login")

In [10]:
start_date = pd.to_datetime("1935-01-01")
end_date = pd.to_datetime("1945-01-01")
term = "jeans"

def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date

os.makedirs("vogue_resource_dump", exist_ok=True)

while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


results_df = pd.DataFrame(results)
print("Search Results Summary:")
print(results_df)

Searching from 1935-01-01 00:00:00 to 1936-01-01 00:00:00
5 results found
Searching from 1936-01-01 00:00:00 to 1937-01-01 00:00:00
5 results found
Searching from 1937-01-01 00:00:00 to 1938-01-01 00:00:00
4 results found
Searching from 1938-01-01 00:00:00 to 1939-01-01 00:00:00
6 results found
Searching from 1939-01-01 00:00:00 to 1940-01-01 00:00:00
5 results found
Searching from 1940-01-01 00:00:00 to 1941-01-01 00:00:00
4 results found
Searching from 1941-01-01 00:00:00 to 1942-01-01 00:00:00
2 results found
Searching from 1942-01-01 00:00:00 to 1943-01-01 00:00:00
1 results found
Searching from 1943-01-01 00:00:00 to 1944-01-01 00:00:00
5 results found
Searching from 1944-01-01 00:00:00 to 1945-01-01 00:00:00
3 results found
Search Results Summary:
    Term Start Date   End Date  Result Count
0  jeans 1935-01-01 1936-01-01             5
1  jeans 1936-01-01 1937-01-01             5
2  jeans 1937-01-01 1938-01-01             4
3  jeans 1938-01-01 1939-01-01             6
4  jeans 19

In [11]:
start_date = pd.to_datetime("1935-01-01")
end_date = pd.to_datetime("1955-01-01")
term = "comfort"

def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date

os.makedirs("vogue_resource_dump", exist_ok=True)

while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


results_df = pd.DataFrame(results)
print("Search Results Summary:")
print(results_df)

Searching from 1935-01-01 00:00:00 to 1936-01-01 00:00:00
271 results found
Searching from 1936-01-01 00:00:00 to 1937-01-01 00:00:00
264 results found
Searching from 1937-01-01 00:00:00 to 1938-01-01 00:00:00
247 results found
Searching from 1938-01-01 00:00:00 to 1939-01-01 00:00:00
248 results found
Searching from 1939-01-01 00:00:00 to 1940-01-01 00:00:00
269 results found
Searching from 1940-01-01 00:00:00 to 1941-01-01 00:00:00
225 results found
Searching from 1941-01-01 00:00:00 to 1942-01-01 00:00:00
191 results found
Searching from 1942-01-01 00:00:00 to 1943-01-01 00:00:00
175 results found
Searching from 1943-01-01 00:00:00 to 1944-01-01 00:00:00
98 results found
Searching from 1944-01-01 00:00:00 to 1945-01-01 00:00:00
113 results found
Searching from 1945-01-01 00:00:00 to 1946-01-01 00:00:00
116 results found
Searching from 1946-01-01 00:00:00 to 1947-01-01 00:00:00
136 results found
Searching from 1947-01-01 00:00:00 to 1948-01-01 00:00:00
140 results found
Searching fro

In [12]:
print("\nSearch Results Summary:")
print(results_df.to_string(index=False))
results_df.to_csv('comfort_df.csv', index=False)


Search Results Summary:
   Term Start Date   End Date  Result Count
comfort 1935-01-01 1936-01-01           271
comfort 1936-01-01 1937-01-01           264
comfort 1937-01-01 1938-01-01           247
comfort 1938-01-01 1939-01-01           248
comfort 1939-01-01 1940-01-01           269
comfort 1940-01-01 1941-01-01           225
comfort 1941-01-01 1942-01-01           191
comfort 1942-01-01 1943-01-01           175
comfort 1943-01-01 1944-01-01            98
comfort 1944-01-01 1945-01-01           113
comfort 1945-01-01 1946-01-01           116
comfort 1946-01-01 1947-01-01           136
comfort 1947-01-01 1948-01-01           140
comfort 1948-01-01 1949-01-01           145
comfort 1949-01-01 1950-01-01           113
comfort 1950-01-01 1951-01-01           151
comfort 1951-01-01 1952-01-01           130
comfort 1952-01-01 1953-01-01           117
comfort 1953-01-01 1954-01-01           141
comfort 1954-01-01 1955-01-01           131


In [16]:
start_date = pd.to_datetime("1935-01-01")
end_date = pd.to_datetime("1955-01-01")
term = "oversized"

def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date

os.makedirs("vogue_resource_dump", exist_ok=True)

while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


results_df = pd.DataFrame(results)
print("Search Results Summary:")
print(results_df)

Searching from 1935-01-01 00:00:00 to 1936-01-01 00:00:00
3 results found
Searching from 1936-01-01 00:00:00 to 1937-01-01 00:00:00
2 results found
Searching from 1937-01-01 00:00:00 to 1938-01-01 00:00:00
1 results found
Searching from 1938-01-01 00:00:00 to 1939-01-01 00:00:00
3 results found
Searching from 1939-01-01 00:00:00 to 1940-01-01 00:00:00
5 results found
Searching from 1940-01-01 00:00:00 to 1941-01-01 00:00:00
9 results found
Searching from 1941-01-01 00:00:00 to 1942-01-01 00:00:00
9 results found
Searching from 1942-01-01 00:00:00 to 1943-01-01 00:00:00
14 results found
Searching from 1943-01-01 00:00:00 to 1944-01-01 00:00:00
1 results found
Searching from 1944-01-01 00:00:00 to 1945-01-01 00:00:00
3 results found
Searching from 1945-01-01 00:00:00 to 1946-01-01 00:00:00
3 results found
Searching from 1946-01-01 00:00:00 to 1947-01-01 00:00:00
4 results found
Searching from 1947-01-01 00:00:00 to 1948-01-01 00:00:00
4 results found
Searching from 1948-01-01 00:00:00 to

In [17]:
print("\nSearch Results Summary:")
print(results_df.to_string(index=False))
results_df.to_csv('oversized_1935-55.csv', index=False)


Search Results Summary:
     Term Start Date   End Date  Result Count
oversized 1935-01-01 1936-01-01             3
oversized 1936-01-01 1937-01-01             2
oversized 1937-01-01 1938-01-01             1
oversized 1938-01-01 1939-01-01             3
oversized 1939-01-01 1940-01-01             5
oversized 1940-01-01 1941-01-01             9
oversized 1941-01-01 1942-01-01             9
oversized 1942-01-01 1943-01-01            14
oversized 1943-01-01 1944-01-01             1
oversized 1944-01-01 1945-01-01             3
oversized 1945-01-01 1946-01-01             3
oversized 1946-01-01 1947-01-01             4
oversized 1947-01-01 1948-01-01             4
oversized 1948-01-01 1949-01-01             0
oversized 1949-01-01 1950-01-01             7
oversized 1950-01-01 1951-01-01             8
oversized 1951-01-01 1952-01-01             2
oversized 1952-01-01 1953-01-01             4
oversized 1953-01-01 1954-01-01             5
oversized 1954-01-01 1955-01-01             9


In [33]:
start_date = pd.to_datetime("1935-01-01")
end_date = pd.to_datetime("1955-01-01")
terms = ["trouser", "skirt", "dress", "workwear", "pant"]

def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []

os.makedirs("vogue_resource_dump", exist_ok=True)

for term in terms:
    current_date = start_date

while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


results_df = pd.DataFrame(results)
print("Search Results Summary:")
print(results_df)
print(results_df.to_string(index=False))
results_df.to_csv('comparison_1935-55.csv', index=False)

Searching from 1935-01-01 00:00:00 to 1936-01-01 00:00:00
3 results found
Searching from 1936-01-01 00:00:00 to 1937-01-01 00:00:00
8 results found
Searching from 1937-01-01 00:00:00 to 1938-01-01 00:00:00
4 results found
Searching from 1938-01-01 00:00:00 to 1939-01-01 00:00:00
3 results found
Searching from 1939-01-01 00:00:00 to 1940-01-01 00:00:00
1 results found
Searching from 1940-01-01 00:00:00 to 1941-01-01 00:00:00
2 results found
Searching from 1941-01-01 00:00:00 to 1942-01-01 00:00:00
2 results found
Searching from 1942-01-01 00:00:00 to 1943-01-01 00:00:00
2 results found
Searching from 1943-01-01 00:00:00 to 1944-01-01 00:00:00
2 results found
Searching from 1944-01-01 00:00:00 to 1945-01-01 00:00:00
1 results found
Searching from 1945-01-01 00:00:00 to 1946-01-01 00:00:00
0 results found
Searching from 1946-01-01 00:00:00 to 1947-01-01 00:00:00
2 results found
Searching from 1947-01-01 00:00:00 to 1948-01-01 00:00:00
3 results found
Searching from 1948-01-01 00:00:00 to 

In [34]:
print(results_df)
print(results_df.to_string(index=False))
results_df.to_csv('comparison_1935-55.csv', index=False)

    Term Start Date   End Date  Result Count
0   pant 1935-01-01 1936-01-01             3
1   pant 1936-01-01 1937-01-01             8
2   pant 1937-01-01 1938-01-01             4
3   pant 1938-01-01 1939-01-01             3
4   pant 1939-01-01 1940-01-01             1
5   pant 1940-01-01 1941-01-01             2
6   pant 1941-01-01 1942-01-01             2
7   pant 1942-01-01 1943-01-01             2
8   pant 1943-01-01 1944-01-01             2
9   pant 1944-01-01 1945-01-01             1
10  pant 1945-01-01 1946-01-01             0
11  pant 1946-01-01 1947-01-01             2
12  pant 1947-01-01 1948-01-01             3
13  pant 1948-01-01 1949-01-01             2
14  pant 1949-01-01 1950-01-01             0
15  pant 1950-01-01 1951-01-01             1
16  pant 1951-01-01 1952-01-01             0
17  pant 1952-01-01 1953-01-01             1
18  pant 1953-01-01 1954-01-01             0
19  pant 1954-01-01 1955-01-01             1
Term Start Date   End Date  Result Count
pant 1935-01-0