In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
import numpy as np
import os

In [90]:
driver = webdriver.Chrome()
driver.get("https://archive.vogue.com/login")

In [13]:
start_date = pd.to_datetime("2000-01-01")
end_date = pd.to_datetime("2015-01-01")
term = "mini dress"

In [14]:
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


minidress_df = pd.DataFrame(results)
print("Search Results Summary:")
print(minidress_df)

Searching from 2000-01-01 00:00:00 to 2001-01-01 00:00:00
18 results found
Searching from 2001-01-01 00:00:00 to 2002-01-01 00:00:00
61 results found
Searching from 2002-01-01 00:00:00 to 2003-01-01 00:00:00
46 results found
Searching from 2003-01-01 00:00:00 to 2004-01-01 00:00:00
35 results found
Searching from 2004-01-01 00:00:00 to 2005-01-01 00:00:00
48 results found
Searching from 2005-01-01 00:00:00 to 2006-01-01 00:00:00
31 results found
Searching from 2006-01-01 00:00:00 to 2007-01-01 00:00:00
65 results found
Searching from 2007-01-01 00:00:00 to 2008-01-01 00:00:00
47 results found
Searching from 2008-01-01 00:00:00 to 2009-01-01 00:00:00
43 results found
Searching from 2009-01-01 00:00:00 to 2010-01-01 00:00:00
23 results found
Searching from 2010-01-01 00:00:00 to 2011-01-01 00:00:00
41 results found
Searching from 2011-01-01 00:00:00 to 2012-01-01 00:00:00
18 results found
Searching from 2012-01-01 00:00:00 to 2013-01-01 00:00:00
27 results found
Searching from 2013-01-01

In [15]:
minidress_df.to_csv('minidress_df.csv', index=False)

In [24]:
start_date = pd.to_datetime("1940-01-01")
end_date = pd.to_datetime("1955-01-01")
term = "mini dress"

In [25]:
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


minidress_40s = pd.DataFrame(results)
print("Search Results Summary:")
print(minidress_40s)

Searching from 1940-01-01 00:00:00 to 1941-01-01 00:00:00
1 results found
Searching from 1941-01-01 00:00:00 to 1942-01-01 00:00:00
2 results found
Searching from 1942-01-01 00:00:00 to 1943-01-01 00:00:00
6 results found
Searching from 1943-01-01 00:00:00 to 1944-01-01 00:00:00
0 results found
Searching from 1944-01-01 00:00:00 to 1945-01-01 00:00:00
5 results found
Searching from 1945-01-01 00:00:00 to 1946-01-01 00:00:00
6 results found
Searching from 1946-01-01 00:00:00 to 1947-01-01 00:00:00
6 results found
Searching from 1947-01-01 00:00:00 to 1948-01-01 00:00:00
8 results found
Searching from 1948-01-01 00:00:00 to 1949-01-01 00:00:00
5 results found
Searching from 1949-01-01 00:00:00 to 1950-01-01 00:00:00
2 results found
Searching from 1950-01-01 00:00:00 to 1951-01-01 00:00:00
1 results found
Searching from 1951-01-01 00:00:00 to 1952-01-01 00:00:00
0 results found
Searching from 1952-01-01 00:00:00 to 1953-01-01 00:00:00
1 results found
Searching from 1953-01-01 00:00:00 to 

In [26]:
start_date = pd.to_datetime("2000-01-01")
end_date = pd.to_datetime("2015-01-01")
term = "maxi dress"

In [27]:
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


maxidress_df = pd.DataFrame(results)
print("Search Results Summary:")
print(maxidress_df)

Searching from 2000-01-01 00:00:00 to 2001-01-01 00:00:00
1 results found
Searching from 2001-01-01 00:00:00 to 2002-01-01 00:00:00
3 results found
Searching from 2002-01-01 00:00:00 to 2003-01-01 00:00:00
1 results found
Searching from 2003-01-01 00:00:00 to 2004-01-01 00:00:00
0 results found
Searching from 2004-01-01 00:00:00 to 2005-01-01 00:00:00
1 results found
Searching from 2005-01-01 00:00:00 to 2006-01-01 00:00:00
2 results found
Searching from 2006-01-01 00:00:00 to 2007-01-01 00:00:00
1 results found
Searching from 2007-01-01 00:00:00 to 2008-01-01 00:00:00
8 results found
Searching from 2008-01-01 00:00:00 to 2009-01-01 00:00:00
8 results found
Searching from 2009-01-01 00:00:00 to 2010-01-01 00:00:00
3 results found
Searching from 2010-01-01 00:00:00 to 2011-01-01 00:00:00
None results found
Searching from 2011-01-01 00:00:00 to 2012-01-01 00:00:00
6 results found
Searching from 2012-01-01 00:00:00 to 2013-01-01 00:00:00
3 results found
Searching from 2013-01-01 00:00:00 

In [54]:
print("\nSearch Results Summary:")
minidress_40s.to_csv('minidress_40s.csv', index=False)


Search Results Summary:


In [46]:
start_date = pd.to_datetime("1900-01-01")
end_date = pd.to_datetime("2023-01-01")
term = "fitted"

In [47]:
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


fitted = pd.DataFrame(results)
print("Search Results Summary:")
print(fitted)

Searching from 1900-01-01 00:00:00 to 1901-01-01 00:00:00
215 results found
Searching from 1901-01-01 00:00:00 to 1902-01-01 00:00:00
320 results found
Searching from 1902-01-01 00:00:00 to 1903-01-01 00:00:00
338 results found
Searching from 1903-01-01 00:00:00 to 1904-01-01 00:00:00
363 results found
Searching from 1904-01-01 00:00:00 to 1905-01-01 00:00:00
392 results found
Searching from 1905-01-01 00:00:00 to 1906-01-01 00:00:00
247 results found
Searching from 1906-01-01 00:00:00 to 1907-01-01 00:00:00
293 results found
Searching from 1907-01-01 00:00:00 to 1908-01-01 00:00:00
290 results found
Searching from 1908-01-01 00:00:00 to 1909-01-01 00:00:00
266 results found
Searching from 1909-01-01 00:00:00 to 1910-01-01 00:00:00
264 results found
Searching from 1910-01-01 00:00:00 to 1911-01-01 00:00:00
245 results found
Searching from 1911-01-01 00:00:00 to 1912-01-01 00:00:00
271 results found
Searching from 1912-01-01 00:00:00 to 1913-01-01 00:00:00
267 results found
Searching fr

In [53]:
print(oversized)
print("\nSearch Results Summary:")
oversized.to_csv('fitted_df.csv', index=False)

       Term Start Date   End Date  Result Count
0    fitted 1900-01-01 1901-01-01         215.0
1    fitted 1901-01-01 1902-01-01         320.0
2    fitted 1902-01-01 1903-01-01         338.0
3    fitted 1903-01-01 1904-01-01         363.0
4    fitted 1904-01-01 1905-01-01         392.0
..      ...        ...        ...           ...
118  fitted 2018-01-01 2019-01-01          16.0
119  fitted 2019-01-01 2020-01-01           9.0
120  fitted 2020-01-01 2021-01-01           2.0
121  fitted 2021-01-01 2022-01-01           7.0
122  fitted 2022-01-01 2023-01-01           0.0

[123 rows x 4 columns]

Search Results Summary:


In [65]:
driver = webdriver.Chrome()
driver.get("https://archive.vogue.com/login")

In [57]:
start_date = pd.to_datetime("1900-01-01")
end_date = pd.to_datetime("2023-01-01")
term = "long"
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


long = pd.DataFrame(results)
print("Search Results Summary:")
print(long)

Searching from 1900-01-01 00:00:00 to 1901-01-01 00:00:00
871 results found
Searching from 1901-01-01 00:00:00 to 1902-01-01 00:00:00
1017 results found
Searching from 1902-01-01 00:00:00 to 1903-01-01 00:00:00
1156 results found
Searching from 1903-01-01 00:00:00 to 1904-01-01 00:00:00
1155 results found
Searching from 1904-01-01 00:00:00 to 1905-01-01 00:00:00
1163 results found
Searching from 1905-01-01 00:00:00 to 1906-01-01 00:00:00
884 results found
Searching from 1906-01-01 00:00:00 to 1907-01-01 00:00:00
1121 results found
Searching from 1907-01-01 00:00:00 to 1908-01-01 00:00:00
1152 results found
Searching from 1908-01-01 00:00:00 to 1909-01-01 00:00:00
1199 results found
Searching from 1909-01-01 00:00:00 to 1910-01-01 00:00:00
1291 results found
Searching from 1910-01-01 00:00:00 to 1911-01-01 00:00:00
951 results found
Searching from 1911-01-01 00:00:00 to 1912-01-01 00:00:00
1080 results found
Searching from 1912-01-01 00:00:00 to 1913-01-01 00:00:00
1132 results found
Se

In [58]:
print(long)
print("\nSearch Results Summary:")
long.to_csv('long_df.csv', index=False)

     Term Start Date   End Date  Result Count
0    long 1900-01-01 1901-01-01         871.0
1    long 1901-01-01 1902-01-01        1017.0
2    long 1902-01-01 1903-01-01        1156.0
3    long 1903-01-01 1904-01-01        1155.0
4    long 1904-01-01 1905-01-01        1163.0
..    ...        ...        ...           ...
118  long 2018-01-01 2019-01-01           NaN
119  long 2019-01-01 2020-01-01           NaN
120  long 2020-01-01 2021-01-01           NaN
121  long 2021-01-01 2022-01-01           NaN
122  long 2022-01-01 2023-01-01           NaN

[123 rows x 4 columns]

Search Results Summary:


In [62]:
print(oversized)
print(long)

       Term Start Date   End Date  Result Count
0    fitted 1900-01-01 1901-01-01         215.0
1    fitted 1901-01-01 1902-01-01         320.0
2    fitted 1902-01-01 1903-01-01         338.0
3    fitted 1903-01-01 1904-01-01         363.0
4    fitted 1904-01-01 1905-01-01         392.0
..      ...        ...        ...           ...
118  fitted 2018-01-01 2019-01-01          16.0
119  fitted 2019-01-01 2020-01-01           9.0
120  fitted 2020-01-01 2021-01-01           2.0
121  fitted 2021-01-01 2022-01-01           7.0
122  fitted 2022-01-01 2023-01-01           0.0

[123 rows x 4 columns]
     Term Start Date   End Date  Result Count
0    long 1900-01-01 1901-01-01         871.0
1    long 1901-01-01 1902-01-01        1017.0
2    long 1902-01-01 1903-01-01        1156.0
3    long 1903-01-01 1904-01-01        1155.0
4    long 1904-01-01 1905-01-01        1163.0
..    ...        ...        ...           ...
118  long 2018-01-01 2019-01-01           NaN
119  long 2019-01-01 2020-01-01 

In [63]:
silhouettes_t = pd.concat([oversized, long], ignore_index=True)
print(silhouettes_t)

       Term Start Date   End Date  Result Count
0    fitted 1900-01-01 1901-01-01         215.0
1    fitted 1901-01-01 1902-01-01         320.0
2    fitted 1902-01-01 1903-01-01         338.0
3    fitted 1903-01-01 1904-01-01         363.0
4    fitted 1904-01-01 1905-01-01         392.0
..      ...        ...        ...           ...
241    long 2018-01-01 2019-01-01           NaN
242    long 2019-01-01 2020-01-01           NaN
243    long 2020-01-01 2021-01-01           NaN
244    long 2021-01-01 2022-01-01           NaN
245    long 2022-01-01 2023-01-01           NaN

[246 rows x 4 columns]


In [64]:
silhouettes_t.to_csv('silhouettes_t.csv', index=False)

In [66]:
start_date = pd.to_datetime("1900-01-01")
end_date = pd.to_datetime("2023-01-01")
term = "short"
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


short = pd.DataFrame(results)
print("Search Results Summary:")
print(short)

Searching from 1900-01-01 00:00:00 to 1901-01-01 00:00:00
625 results found
Searching from 1901-01-01 00:00:00 to 1902-01-01 00:00:00
623 results found
Searching from 1902-01-01 00:00:00 to 1903-01-01 00:00:00
690 results found
Searching from 1903-01-01 00:00:00 to 1904-01-01 00:00:00
887 results found
Searching from 1904-01-01 00:00:00 to 1905-01-01 00:00:00
800 results found
Searching from 1905-01-01 00:00:00 to 1906-01-01 00:00:00
602 results found
Searching from 1906-01-01 00:00:00 to 1907-01-01 00:00:00
837 results found
Searching from 1907-01-01 00:00:00 to 1908-01-01 00:00:00
749 results found
Searching from 1908-01-01 00:00:00 to 1909-01-01 00:00:00
697 results found
Searching from 1909-01-01 00:00:00 to 1910-01-01 00:00:00
774 results found
Searching from 1910-01-01 00:00:00 to 1911-01-01 00:00:00
535 results found
Searching from 1911-01-01 00:00:00 to 1912-01-01 00:00:00
603 results found
Searching from 1912-01-01 00:00:00 to 1913-01-01 00:00:00
585 results found
Searching fr

In [67]:
short.to_csv('short_df.csv', index=False)
silhouettes_t = pd.concat([oversized, long, short], ignore_index=True)
print(silhouettes_t)

       Term Start Date   End Date  Result Count
0    fitted 1900-01-01 1901-01-01         215.0
1    fitted 1901-01-01 1902-01-01         320.0
2    fitted 1902-01-01 1903-01-01         338.0
3    fitted 1903-01-01 1904-01-01         363.0
4    fitted 1904-01-01 1905-01-01         392.0
..      ...        ...        ...           ...
364   short 2018-01-01 2019-01-01           NaN
365   short 2019-01-01 2020-01-01           NaN
366   short 2020-01-01 2021-01-01           NaN
367   short 2021-01-01 2022-01-01           NaN
368   short 2022-01-01 2023-01-01           NaN

[369 rows x 4 columns]


In [83]:
driver = webdriver.Chrome()
driver.get("https://archive.vogue.com/login")

In [80]:
start_date = pd.to_datetime("1998-01-01")
end_date = pd.to_datetime("2023-01-01")
term = "short"
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


short_2 = pd.DataFrame(results)
print("Search Results Summary:")
print(short_2)

Searching from 1998-01-01 00:00:00 to 1999-01-01 00:00:00


KeyboardInterrupt: 

In [75]:
short_df = pd.concat([short, short_2], ignore_index=True)
print(short)
short_df.to_csv('short_df.csv', index=False)
silhouettes_t = pd.concat([oversized, long, short, short_2], ignore_index=True)
print(silhouettes_t)

      Term Start Date   End Date  Result Count
0    short 1900-01-01 1901-01-01         625.0
1    short 1901-01-01 1902-01-01         623.0
2    short 1902-01-01 1903-01-01         690.0
3    short 1903-01-01 1904-01-01         887.0
4    short 1904-01-01 1905-01-01         800.0
..     ...        ...        ...           ...
143  short 2018-01-01 2019-01-01          89.0
144  short 2019-01-01 2020-01-01          82.0
145  short 2020-01-01 2021-01-01          47.0
146  short 2021-01-01 2022-01-01          44.0
147  short 2022-01-01 2023-01-01           0.0

[148 rows x 4 columns]
       Term Start Date   End Date  Result Count
0    fitted 1900-01-01 1901-01-01         215.0
1    fitted 1901-01-01 1902-01-01         320.0
2    fitted 1902-01-01 1903-01-01         338.0
3    fitted 1903-01-01 1904-01-01         363.0
4    fitted 1904-01-01 1905-01-01         392.0
..      ...        ...        ...           ...
414   short 2018-01-01 2019-01-01          89.0
415   short 2019-01-01 2020-

In [81]:
start_date = pd.to_datetime("1900-01-01")
end_date = pd.to_datetime("2023-01-01")
term = "fabric"
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


fabric = pd.DataFrame(results)
print("Search Results Summary:")
print(fabric)

Searching from 1900-01-01 00:00:00 to 1901-01-01 00:00:00
119 results found
Searching from 1901-01-01 00:00:00 to 1902-01-01 00:00:00
162 results found
Searching from 1902-01-01 00:00:00 to 1903-01-01 00:00:00
169 results found
Searching from 1903-01-01 00:00:00 to 1904-01-01 00:00:00
155 results found
Searching from 1904-01-01 00:00:00 to 1905-01-01 00:00:00
171 results found
Searching from 1905-01-01 00:00:00 to 1906-01-01 00:00:00
176 results found
Searching from 1906-01-01 00:00:00 to 1907-01-01 00:00:00
210 results found
Searching from 1907-01-01 00:00:00 to 1908-01-01 00:00:00
205 results found
Searching from 1908-01-01 00:00:00 to 1909-01-01 00:00:00
203 results found
Searching from 1909-01-01 00:00:00 to 1910-01-01 00:00:00
222 results found
Searching from 1910-01-01 00:00:00 to 1911-01-01 00:00:00
197 results found
Searching from 1911-01-01 00:00:00 to 1912-01-01 00:00:00
266 results found
Searching from 1912-01-01 00:00:00 to 1913-01-01 00:00:00
156 results found
Searching fr

In [82]:
print(fabric)
fabric.to_csv('fabric_df.csv', index=False)

       Term Start Date   End Date  Result Count
0    fabric 1900-01-01 1901-01-01         119.0
1    fabric 1901-01-01 1902-01-01         162.0
2    fabric 1902-01-01 1903-01-01         169.0
3    fabric 1903-01-01 1904-01-01         155.0
4    fabric 1904-01-01 1905-01-01         171.0
..      ...        ...        ...           ...
118  fabric 2018-01-01 2019-01-01           NaN
119  fabric 2019-01-01 2020-01-01           NaN
120  fabric 2020-01-01 2021-01-01           NaN
121  fabric 2021-01-01 2022-01-01           NaN
122  fabric 2022-01-01 2023-01-01           NaN

[123 rows x 4 columns]


In [84]:
start_date = pd.to_datetime("1975-01-01")
end_date = pd.to_datetime("2023-01-01")
term = "fabric"
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


fabric_2 = pd.DataFrame(results)
print("Search Results Summary:")
print(fabric_2)

Searching from 1975-01-01 00:00:00 to 1976-01-01 00:00:00
728 results found
Searching from 1976-01-01 00:00:00 to 1977-01-01 00:00:00
593 results found
Searching from 1977-01-01 00:00:00 to 1978-01-01 00:00:00
326 results found
Searching from 1978-01-01 00:00:00 to 1979-01-01 00:00:00
417 results found
Searching from 1979-01-01 00:00:00 to 1980-01-01 00:00:00
260 results found
Searching from 1980-01-01 00:00:00 to 1981-01-01 00:00:00
271 results found
Searching from 1981-01-01 00:00:00 to 1982-01-01 00:00:00
357 results found
Searching from 1982-01-01 00:00:00 to 1983-01-01 00:00:00
484 results found
Searching from 1983-01-01 00:00:00 to 1984-01-01 00:00:00
377 results found
Searching from 1984-01-01 00:00:00 to 1985-01-01 00:00:00
420 results found
Searching from 1985-01-01 00:00:00 to 1986-01-01 00:00:00
434 results found
Searching from 1986-01-01 00:00:00 to 1987-01-01 00:00:00
327 results found
Searching from 1987-01-01 00:00:00 to 1988-01-01 00:00:00
316 results found
Searching fr

In [86]:
fabric_df = pd.concat([fabric, fabric_2], ignore_index=True)
print(fabric_df)
fabric_df.to_csv('fabric_df.csv', index=False)
silhouettes_lis = pd.concat([oversized, long, short, short_2, fabric_df], ignore_index=True)
print(silhouettes_lis)

       Term Start Date   End Date  Result Count
0    fabric 1900-01-01 1901-01-01         119.0
1    fabric 1901-01-01 1902-01-01         162.0
2    fabric 1902-01-01 1903-01-01         169.0
3    fabric 1903-01-01 1904-01-01         155.0
4    fabric 1904-01-01 1905-01-01         171.0
..      ...        ...        ...           ...
166  fabric 2018-01-01 2019-01-01          22.0
167  fabric 2019-01-01 2020-01-01          21.0
168  fabric 2020-01-01 2021-01-01          27.0
169  fabric 2021-01-01 2022-01-01          19.0
170  fabric 2022-01-01 2023-01-01           0.0

[171 rows x 4 columns]
       Term Start Date   End Date  Result Count
0    fitted 1900-01-01 1901-01-01         215.0
1    fitted 1901-01-01 1902-01-01         320.0
2    fitted 1902-01-01 1903-01-01         338.0
3    fitted 1903-01-01 1904-01-01         363.0
4    fitted 1904-01-01 1905-01-01         392.0
..      ...        ...        ...           ...
585  fabric 2018-01-01 2019-01-01          22.0
586  fabric 2019

In [87]:
print(silhouettes_lis)
silhouettes_lis.to_csv('silhouettes_lis.csv', index=False)

       Term Start Date   End Date  Result Count
0    fitted 1900-01-01 1901-01-01         215.0
1    fitted 1901-01-01 1902-01-01         320.0
2    fitted 1902-01-01 1903-01-01         338.0
3    fitted 1903-01-01 1904-01-01         363.0
4    fitted 1904-01-01 1905-01-01         392.0
..      ...        ...        ...           ...
585  fabric 2018-01-01 2019-01-01          22.0
586  fabric 2019-01-01 2020-01-01          21.0
587  fabric 2020-01-01 2021-01-01          27.0
588  fabric 2021-01-01 2022-01-01          19.0
589  fabric 2022-01-01 2023-01-01           0.0

[590 rows x 4 columns]


In [88]:
fabric_df.to_csv('fabric_df.csv', index=False)

In [89]:
print(long)

     Term Start Date   End Date  Result Count
0    long 1900-01-01 1901-01-01         871.0
1    long 1901-01-01 1902-01-01        1017.0
2    long 1902-01-01 1903-01-01        1156.0
3    long 1903-01-01 1904-01-01        1155.0
4    long 1904-01-01 1905-01-01        1163.0
..    ...        ...        ...           ...
118  long 2018-01-01 2019-01-01           NaN
119  long 2019-01-01 2020-01-01           NaN
120  long 2020-01-01 2021-01-01           NaN
121  long 2021-01-01 2022-01-01           NaN
122  long 2022-01-01 2023-01-01           NaN

[123 rows x 4 columns]


In [91]:
start_date = pd.to_datetime("1989-01-01")
end_date = pd.to_datetime("2023-01-01")
term = "long"
def make_term_search(driver, term, start_date, end_date):
    url = f"https://archive.vogue.com/search?QueryTerm={term}&startDate={start_date}&endDate={end_date}"
    driver.get(url)

def get_count(driver):
    count_elements = driver.find_elements(By.CLASS_NAME, "count")
    if not count_elements:
        return None
    count_text = count_elements[0].text
    count = int(count_text.split("(")[1].split(")")[0])
    return count


increment = pd.DateOffset(years=1)
results = []
current_date = start_date


os.makedirs("vogue_resource_dump", exist_ok=True)



while current_date < end_date:
    next_date = current_date + increment
    print(f"Searching from {current_date} to {next_date}")

    make_term_search(driver, term, current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))

    count = get_count(driver)
    print(f"{count} results found")

    with open(f"vogue_resource_dump/{term}_{current_date.strftime('%Y-%m-%d')}_{next_date.strftime('%Y-%m-%d')}.html", "w") as f:
        f.write(driver.page_source)

    results.append({
        "Term": term,
        "Start Date": current_date,
        "End Date": next_date,
        "Result Count": count
    })

    current_date = next_date

    # Sleep to avoid detection, ensuring non-negative sleep length
    sleep_duration = max(0, np.random.normal(1, 0.5))
    time.sleep(sleep_duration)


long_1989 = pd.DataFrame(results)
print("Search Results Summary:")
print(long_1989)

Searching from 1989-01-01 00:00:00 to 1990-01-01 00:00:00
740 results found
Searching from 1990-01-01 00:00:00 to 1991-01-01 00:00:00
557 results found
Searching from 1991-01-01 00:00:00 to 1992-01-01 00:00:00
662 results found
Searching from 1992-01-01 00:00:00 to 1993-01-01 00:00:00
841 results found
Searching from 1993-01-01 00:00:00 to 1994-01-01 00:00:00
788 results found
Searching from 1994-01-01 00:00:00 to 1995-01-01 00:00:00
634 results found
Searching from 1995-01-01 00:00:00 to 1996-01-01 00:00:00
651 results found
Searching from 1996-01-01 00:00:00 to 1997-01-01 00:00:00
626 results found
Searching from 1997-01-01 00:00:00 to 1998-01-01 00:00:00
594 results found
Searching from 1998-01-01 00:00:00 to 1999-01-01 00:00:00
509 results found
Searching from 1999-01-01 00:00:00 to 2000-01-01 00:00:00
518 results found
Searching from 2000-01-01 00:00:00 to 2001-01-01 00:00:00
531 results found
Searching from 2001-01-01 00:00:00 to 2002-01-01 00:00:00
478 results found
Searching fr

In [95]:
print(long_1989)
long_update = pd.concat([long, long_1989], ignore_index=True)
print(long_update)
long_update.to_csv('long_updated.csv', index=False)


    Term Start Date   End Date  Result Count
0   long 1989-01-01 1990-01-01         740.0
1   long 1990-01-01 1991-01-01         557.0
2   long 1991-01-01 1992-01-01         662.0
3   long 1992-01-01 1993-01-01         841.0
4   long 1993-01-01 1994-01-01         788.0
5   long 1994-01-01 1995-01-01         634.0
6   long 1995-01-01 1996-01-01         651.0
7   long 1996-01-01 1997-01-01         626.0
8   long 1997-01-01 1998-01-01         594.0
9   long 1998-01-01 1999-01-01         509.0
10  long 1999-01-01 2000-01-01         518.0
11  long 2000-01-01 2001-01-01         531.0
12  long 2001-01-01 2002-01-01         478.0
13  long 2002-01-01 2003-01-01           NaN
14  long 2003-01-01 2004-01-01         618.0
15  long 2004-01-01 2005-01-01         563.0
16  long 2005-01-01 2006-01-01         476.0
17  long 2006-01-01 2007-01-01         581.0
18  long 2007-01-01 2008-01-01         572.0
19  long 2008-01-01 2009-01-01         539.0
20  long 2009-01-01 2010-01-01         433.0
21  long 2

In [97]:
silhouettes_lis = pd.concat([silhouettes_lis, long_1989], ignore_index=True)
print(silhouettes_lis)

       Term Start Date   End Date  Result Count
0    fitted 1900-01-01 1901-01-01         215.0
1    fitted 1901-01-01 1902-01-01         320.0
2    fitted 1902-01-01 1903-01-01         338.0
3    fitted 1903-01-01 1904-01-01         363.0
4    fitted 1904-01-01 1905-01-01         392.0
..      ...        ...        ...           ...
653    long 2018-01-01 2019-01-01         228.0
654    long 2019-01-01 2020-01-01         266.0
655    long 2020-01-01 2021-01-01         174.0
656    long 2021-01-01 2022-01-01         145.0
657    long 2022-01-01 2023-01-01           0.0

[658 rows x 4 columns]


In [98]:
print(silhouettes_lis)
silhouettes_lis.to_csv('silhouettes_lis.csv', index=False)

       Term Start Date   End Date  Result Count
0    fitted 1900-01-01 1901-01-01         215.0
1    fitted 1901-01-01 1902-01-01         320.0
2    fitted 1902-01-01 1903-01-01         338.0
3    fitted 1903-01-01 1904-01-01         363.0
4    fitted 1904-01-01 1905-01-01         392.0
..      ...        ...        ...           ...
653    long 2018-01-01 2019-01-01         228.0
654    long 2019-01-01 2020-01-01         266.0
655    long 2020-01-01 2021-01-01         174.0
656    long 2021-01-01 2022-01-01         145.0
657    long 2022-01-01 2023-01-01           0.0

[658 rows x 4 columns]
