In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

In [2]:
def fetch_page(url, timeout=10):
    """Download a page and return its body as text.

    Parameters
    ----------
    url : str
        Address to request with HTTP GET.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` may block indefinitely on an
        unresponsive server.

    Returns
    -------
    str or None
        The response text when the status code is 200; ``None`` for any
        other status code or when the request itself fails (connection
        error, timeout, malformed URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Callers already treat None as "fetch failed", so report network
        # errors the same way instead of aborting a long scraping loop.
        return None

    if response.status_code == 200:
        return response.text
    return None

def parse_shoe_data(html_content):
    """Extract attribute/value pairs from the spec panels of a review page.

    Every element matching ``.panel-inverted`` is scanned for ``.row``
    elements. A row contributes one record when it has at least three direct
    child ``<div>`` elements: the stripped text of the second becomes
    ``Attribute`` and the stripped text of the third becomes ``Value``.

    Parameters
    ----------
    html_content : str
        HTML markup of a single page.

    Returns
    -------
    pandas.DataFrame
        Columns ``Attribute`` and ``Value``, one row per record. The columns
        are present even when no record is found, so callers can index
        ``df['Attribute']`` without a ``KeyError``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    data = []
    for panel in soup.select('.panel-inverted'):
        for row in panel.select('.row'):
            # Only direct children count; nested divs belong to the cells.
            cols = row.find_all('div', recursive=False)
            if len(cols) >= 3:
                # cols[0] is not read; name and value sit in cols[1], cols[2].
                data.append({
                    'Attribute': cols[1].text.strip(),
                    'Value': cols[2].text.strip(),
                })

    # Fix the schema explicitly: DataFrame([]) would otherwise have no columns.
    return pd.DataFrame(data, columns=['Attribute', 'Value'])

def extract_links(html_content):
    """Collect unique road/trail review links from a listing page.

    Parameters
    ----------
    html_content : str or None
        HTML markup of a listing page. ``None`` or an empty string (for
        example the result of a failed download) yields an empty list
        instead of raising inside the parser.

    Returns
    -------
    list of str
        ``href`` values of ``<a>`` tags found inside ``<div class="panel">``
        elements that contain one of the road/trail review URL prefixes,
        de-duplicated and kept in order of first appearance.
    """
    if not html_content:
        return []

    review_prefixes = (
        'https://www.runningshoesguru.com/reviews/road',
        'https://www.runningshoesguru.com/reviews/trail',
    )

    soup = BeautifulSoup(html_content, 'html.parser')

    ordered_unique_links = []
    seen_links = set()  # O(1) membership test; the list preserves order

    for panel in soup.find_all('div', class_="panel"):
        for link in panel.find_all('a', href=True):
            url = link['href']
            if url in seen_links:
                continue
            # Substring match (not startswith), same as the original check.
            if any(prefix in url for prefix in review_prefixes):
                seen_links.add(url)
                ordered_unique_links.append(url)

    return ordered_unique_links

In [3]:
available_page = 22
# The Nike Pegasus Turbo Next Nature is the last updated shoe (it is located
# on page 22), so listing pages 1..22 are crawled.
all_shoes_lists = list()

for i in tqdm(range(1, available_page + 1)):
    url = f'https://www.runningshoesguru.com/reviews/page/{i}/'
    html_content = fetch_page(url)
    if not html_content:
        # fetch_page returns None when the download fails; skip the page
        # rather than handing None to the HTML parser.
        print(f"Failed to fetch the webpage: {url}")
        continue
    shoe_links = extract_links(html_content)
    all_shoes_lists.extend(shoe_links)

# print(len(all_shoes_lists))

100%|██████████| 22/22 [00:11<00:00,  1.95it/s]


In [4]:
# Gather one single-row frame per shoe and concatenate once at the end;
# calling pd.concat inside the loop re-copies the growing table every time.
shoe_frames = []

for url in tqdm(all_shoes_lists):
    html_content = fetch_page(url)
    if not html_content:
        print("Failed to fetch the webpage.")
        continue

    df = parse_shoe_data(html_content)
    if df.empty:
        # No spec rows were parsed; indexing 'Attribute' below could raise
        # KeyError on a column-less frame, so skip this page.
        continue

    # Drop rows whose attribute text contains a newline followed by tabs.
    df = df[~df['Attribute'].str.contains(r'\n\t+')]
    # Drop the last remaining row.
    # NOTE(review): presumably a trailing non-spec row; confirm against the page markup.
    df = df.iloc[:-1]
    # Pivot to a single row: attribute names become the columns.
    df = df.set_index('Attribute').T
    shoe_frames.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
shoes_table = pd.concat(shoe_frames, axis=0) if shoe_frames else pd.DataFrame()

100%|██████████| 208/208 [00:57<00:00,  3.64it/s]


In [5]:
# Display the assembled table (one row per parsed shoe page, one column per attribute name).
shoes_table

Attribute,Brand,Model,Previous model,Type,Weight,MSRP,Heel,Toe,Heel drop,Carbon plate,...,Toebox fit,Type of cushioning,Amount of cushioning,Stability,Flexibility,Family,Rockplate,Traction,Water resitance,Road-to-trail
Value,Puma,Magnify Nitro 2,Magnify Nitro,Cushioning,10.7 oz (303 g),$140.00,39 mm.,29 mm.,9 mm.,No plate,...,Tight,Responsive/balanced,Highly cushioned,Some stability,Rigid,,,,,
Value,Asics,METASPEED Edge Paris,Metaspeed Edge,Racing,6.5 oz (184 g),250.00,39.5 mm.,34.5 mm.,5 mm.,Full length carbon plate,...,Tight,Responsive,Medium cushioning,Some stability,Rigid,,,,,
Value,Puma,Velocity Nitro 3,Velocity Nitro 2,Cushioning,9.17 oz (260 g),$135.00,36 mm.,26 mm.,10 mm.,No plate,...,Tight,Balanced,Medium cushioning,Some stability,Medium,,,,,
Value,Saucony,Guide 17,Guide 16,Road,9.4 oz (266 g),$140.00,35 mm.,29 mm.,6 mm.,No plate,...,Normal,Balanced/plush,Highly cushioned,Very stable,Medium,Guide,,,,
Value,Saucony,Omni 22,,Road,10.1 oz (286 g),$140.00,35 mm.,27 mm.,8 mm.,No plate,...,Tight,Balanced/plush,Medium cushioning,Some stability,Medium,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Value,Asics,GT-1000 11,GT 1000 10,Road,9.5 oz (269 g),$100.00,21 mm.,13 mm.,8 mm.,No plate,...,Normal,Responsive/balanced,Medium cushioning,Some stability,Flexible,,,,,
Value,Asics,Novablast 3,Novablast 2,Cushioning,8.9 oz (252 g),$140.00,mm.,mm.,9 mm.,No plate,...,Normal,Balanced,Highly cushioned,Very stable,Flexible,,,,,
Value,Saucony,Endorphin Speed 3,Endorphin Speed,Cushioning,8.10 oz (230 g),$170.00,36 mm.,28 mm.,8 mm.,Plate in other materials,...,Normal,Balanced,Highly cushioned,Some stability,Medium,Endorphin,,,,
Value,Hoka One One,Gaviota 4,Gaviota 3,Road,11.10 oz (315 g),$170.00,40 mm.,35 mm.,5 mm.,No plate,...,Normal,Balanced/plush,Highly cushioned,Very stable,Medium,,,,,


In [6]:
# Write the assembled table to a CSV file in the current working directory
# (the row index is written as the first column, pandas' default).
output_csv_path = 'shoes_parameter_table.csv'
shoes_table.to_csv(output_csv_path)