In [80]:
from selenium import webdriver 
from selenium.webdriver import Chrome

from bs4 import BeautifulSoup

# Configure a headless, incognito Chrome session that ignores certificate errors.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')

# The driver has to exist before it is handed to get_path_list(); creating it
# afterwards only worked when a driver was left over from an earlier run.
# The session is deliberately left open because other cells use the global
# `driver`; call driver.quit() once all scraping is finished.
driver = Chrome(options=options)
path_list = get_path_list(driver)
workout_data = []
for path in path_list:
    driver.get("https://www.muscleandstrength.com" + path)
    page_source = driver.page_source

    soup = BeautifulSoup(page_source, 'html.parser')

    # Locate both blocks first: soup.find() returns None when a block is
    # absent, and a single odd page should not abort the whole scrape.
    stats_div = soup.find("div", {"class":"node-stats-block"})
    description_div = soup.find("div", {"class":"field field-name-body field-type-text-with-summary field-label-hidden"})
    if stats_div is None or description_div is None:
        print("Skipping " + path + ": workout stats or description block not found")
        continue

    #get workout summary
    items = stats_div
    workout_summary = get_workout_summary(items.find_all("li"))

    #get workout description and add to summary
    description = description_div.get_text()
    updated_description_str = update_workout_description(description)
    workout_summary["Workout Description"] = updated_description_str

    #add summary to workout data
    workout_data.append(workout_summary)



In [61]:
# Collect the workout paths through get_path_list() using the global `driver`
# and print the full list for inspection.
path_list = get_path_list(driver)
print(path_list)

['/workouts/12-week-fat-destroyer', '/workouts/10-week-mass-building-program.html', '/workouts/6-day-powerbuilding-split-meal-plan', '/workouts/5-day-dumbbell-only-workout-split', '/workouts/8-week-hypertrophy-workout', '/workouts/phul-workout', '/workouts/upper-lower-4-day-gym-bodybuilding-workout', '/workouts/4-day-maximum-mass-workout', '/workouts/m-f-workout-routine', '/workouts/dumbbell-only-upper-lower-workout-routine', '/workouts/6-day-dumbbell-only-workout', '/workouts/8-week-spring-shred-program', '/workouts/6-week-workout-program-to-build-lean-muscle', '/workouts/5-day-muscle-and-strength-building-workout-split', '/workouts/4-day-power-muscle-burn-workout-split.html', '/workouts/the-best-upper-body-workout-routine', '/workouts/advanced-fat-loss-workout', '/workouts/michael-b-jordan-workout-program', '/workouts/summer-burn-12-week-fat-melting-workout', '/workouts/dumbbell-only-home-or-gym-fullbody-workout.html', '/workouts/8-week-fat-incinerator-workout', '/workouts/12-week-to

In [54]:
def get_category_list(mainpage_category_list):
    """Return the hrefs of the links inside the category block.

    Every ``<a>`` carrying an ``href`` is visited in order; an href is
    dropped when it equals the href immediately before it (the comparison
    starts from an empty string). Repeats that are not adjacent are kept.
    """
    all_hrefs = [
        anchor["href"]
        for anchor in mainpage_category_list.find_all("a", href=True)
    ]
    # Pair each href with its predecessor so adjacent repeats can be filtered.
    predecessors = [""] + all_hrefs
    return [
        current
        for before, current in zip(predecessors, all_hrefs)
        if current != before
    ]

In [55]:
def get_category_workout_urls(content, path_list):
    """Append the workout link of each container in ``content`` to ``path_list``.

    Args:
        content: iterable of elements exposing ``find("a", href=True)``.
        path_list: list that receives the hrefs; it is extended in place.

    Returns:
        The same ``path_list`` object that was passed in.

    An href equal to the one recorded immediately before it (within this
    call) is not appended again. Containers without any link are skipped.
    """
    prev = ""
    for div in content:
        anchor = div.find("a", href=True)
        if anchor is None:
            # find() yields None when the container holds no <a href>;
            # indexing it would raise TypeError, so skip the container.
            continue
        href_text = anchor["href"]
        if href_text != prev:
            path_list.append(href_text)
        prev = href_text
    return path_list

In [56]:
def get_category_urls(category_path_list):
    """Visit every category page and gather the workout paths it links to.

    Uses the module-level ``driver`` to load each category URL, parses the
    page with BeautifulSoup and passes the ``view-content-button`` divs to
    ``get_category_workout_urls``.

    Args:
        category_path_list: site-relative category paths.

    Returns:
        A new list with the hrefs collected across all categories.
    """
    path_list = []

    for category_path in category_path_list:
        driver.get("https://www.muscleandstrength.com" + category_path)
        page_source = driver.page_source

        soup = BeautifulSoup(page_source, 'html.parser')
        content = soup.find_all("div", {"class":"view-content-button"})
        # Accumulate into the local list; the previous code passed an
        # undefined name (URL_list) here instead.
        path_list = get_category_workout_urls(content, path_list)

    return path_list

In [57]:
def get_path_list(driver):
    """Build the list of workout paths starting from the workout-routines page.

    Loads the landing page with ``driver``, extracts the category links from
    the ``mainpage-category-list`` div via ``get_category_list`` and returns
    whatever ``get_category_urls`` collects for those categories.
    """
    driver.get("https://www.muscleandstrength.com/workout-routines")
    landing_soup = BeautifulSoup(driver.page_source, 'html.parser')

    category_list = get_category_list(
        landing_soup.find("div", {"class":"mainpage-category-list"})
    )
    return get_category_urls(category_list)

In [52]:
#workout summary
def get_workout_summary(items):
    """Turn the ``<li>`` rows of the stats block into a label -> value dict.

    Args:
        items: iterable of elements exposing ``find`` and ``get_text``.

    Returns:
        dict keyed by the text of each row's ``row-label`` span; the value
        is the row text with the label removed and surrounding whitespace
        stripped. The "Workout PDF" row is left out, and rows without a
        label span are skipped.
    """
    data = {}
    for li in items:
        label_span = li.find("span", {"class": "row-label"})
        if label_span is None:
            # Without a label there is no key to store the value under,
            # and calling get_text() on None would raise AttributeError.
            continue
        label = label_span.get_text()
        # The row text contains the label itself; remove it to keep the value.
        value = li.get_text().replace(label, "").strip()
        if label != "Workout PDF":
            data[label] = value

    return data

In [66]:
# Rebuild the summary dict from the <li> rows of `items` and print it.
# NOTE(review): `items` is not defined in this cell; presumably it is the
# value left over from the scraping loop -- confirm before re-running.
workout_summary = get_workout_summary(items.find_all("li"))
print(workout_summary)

{'Main Goal': 'Build Muscle', 'Workout Type': 'Full Body', 'Training Level': 'Beginner', 'Program Duration': '8 weeks', 'Days Per Week': '3', 'Time Per Workout': '25-30 minutes', 'Equipment Required': 'Dumbbells', 'Target Gender': 'Male & Female', 'Recommended Supps': 'Whey Protein Protein Bars (Optional) Multivitamin Fish Oil'}


In [51]:
def update_workout_description(description_html):
    """Collapse every run of whitespace in the text to a single space.

    Leading and trailing whitespace is removed as a consequence of the
    split/join round trip.
    """
    return " ".join(description_html.split())

In [67]:
# Store the normalised description on the summary dict and print the result.
# NOTE(review): `updated_description_str` is not defined in this cell;
# presumably it is left over from the scraping loop -- confirm before re-running.
workout_summary["Workout Description"] = updated_description_str
print(workout_summary)

{'Main Goal': 'Build Muscle', 'Workout Type': 'Full Body', 'Training Level': 'Beginner', 'Program Duration': '8 weeks', 'Days Per Week': '3', 'Time Per Workout': '25-30 minutes', 'Equipment Required': 'Dumbbells', 'Target Gender': 'Male & Female', 'Recommended Supps': 'Whey Protein Protein Bars (Optional) Multivitamin Fish Oil', 'Workout Description': "The following workout is designed for those who only have access to a set of dumbbells. It’s perfect for those who work out at home, travel and are on the road a lot, or beginners who are new to weight lifting. It can be performed as a complete workout program for up to 8 weeks. At that point you may want to consider increasing the volume of the workouts or change up your routine to include more of a variety of equipment. This program calls for you to work out 3 times a week. Since these workouts are full body workouts, it is best to have a rest day in between your work out days. For example, you could perform this program on Monday, Wed

In [83]:
# `re` was never imported anywhere in the notebook, so bring it in here.
import re

# Build cleaned copies of each workout dict. A slice copy of the list would
# still share the dicts, so editing them would also rewrite workout_data.
test_workout_data = []
for workout in workout_data:
    cleaned_workout = dict(workout)
    descrip = cleaned_workout["Workout Description"]
    # Keep only ASCII letters, digits and spaces in the description.
    descrip = re.sub(r"[^a-zA-Z0-9 ]", "", descrip)
    cleaned_workout["Workout Description"] = descrip
    test_workout_data.append(cleaned_workout)


In [84]:
import pandas as pd

# Tabulate the cleaned workout dicts and write them to workout_data.csv
# without the index column.
df = pd.DataFrame(test_workout_data)
df.to_csv("workout_data.csv", index=False)