In [1]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Request configuration used by the cells below:
#   headers - sent with each request to mimic a request from a web browser
#   delay   - pause, in seconds, between requests
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}
delay = 2  # seconds

In [2]:
# Firstly, test that the listing URL is reachable before running the full scrape
url = 'https://www.jdsports.ie/men/mens-footwear/'
# Send the same browser-like headers the scraping cells use, so this check is made
# the same way as the real requests; the timeout stops the cell from hanging
# forever if the server never answers.
page = requests.get(url, headers=headers, timeout=30)
# Error Handling
if page.status_code == 200:
    print('Success!')
else:
    print('An error has occurred. Please ensure url is correct')

Success!


In [3]:
def scrape_data(url):
    """Download one listing page and extract the shoe names and prices on it.

    Parameters
    ----------
    url : str
        Address of the listing page to fetch.

    Returns
    -------
    dict
        ``{'Shoe Name': [...], 'Price': [...]}``. Both lists always have the
        same length, so the dict can be passed straight to ``pd.DataFrame``.
        A price is ``None`` when a product container has no price element.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the request times out.
    """
    # Retrieving data from the url; fail loudly on an error response instead of
    # silently parsing an error page into an empty result.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # Lists to store the extracted shoe data (kept index-aligned)
    shoes_names = []
    shoes_prices = []

    # Looping through every product container on the page
    for info in soup.find_all('span', {'class': 'itemContainer'}):
        title_tag = info.find('span', {'class': 'itemTitle'})
        if title_tag is None:
            # A container without a title cannot be identified as a product;
            # skip it rather than crash on `None.text`.
            continue
        price_tag = info.find('div', {'class': 'itemPrice'})

        shoes_names.append(title_tag.text.strip())
        # Keep the row even when the price is missing so names/prices stay aligned.
        shoes_prices.append(price_tag.text.strip() if price_tag is not None else None)

    # Dictionary with the extracted data
    return {
        'Shoe Name': shoes_names,
        'Price': shoes_prices,
    }

In [4]:
def get_next_page(soup, base_url='https://www.jdsports.ie'):
    """Return the absolute URL of the next page of shoe listings, if any.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listing page to search for the pagination element.
    base_url : str, optional
        URL against which a relative ``href`` is resolved. Defaults to the
        site root that was previously hard-coded.

    Returns
    -------
    str or None
        The next page's URL, or ``None`` when the page has no
        ``div.pageLinks`` element, no ``<a rel="next">`` link inside it, or
        that link carries no ``href``.
    """
    # Finding the next page url from the pagination element
    pagination = soup.find('div', {'class': 'pageLinks'})
    if pagination is None:
        return None

    next_page = pagination.find('a', {'rel': 'next'})
    if next_page is None:
        return None

    href = next_page.get('href')
    if not href:
        return None

    # urljoin leaves an already-absolute href untouched and resolves a relative
    # one against base_url, unlike plain string concatenation.
    return urljoin(base_url, href)

In [5]:
# Starting url
url = 'https://www.jdsports.ie/men/mens-footwear/'

# Single source of truth for the output file, so the success message below
# cannot drift from the file that is actually written.
output_path = 'shoes_data.csv'

all_shoes_data = []   # One {'Shoe Name': [...], 'Price': [...]} dict per page
visited_urls = set()  # Pages already scraped; guards against pagination cycles

# Looping through all pages and scraping shoe data
while url and url not in visited_urls:
    visited_urls.add(url)
    shoes_data = scrape_data(url)
    all_shoes_data.append(shoes_data)
    time.sleep(delay)
    # NOTE(review): the page is downloaded a second time here only to read its
    # pagination links, because scrape_data does not expose the parsed page.
    listing_page = requests.get(url, headers=headers, timeout=30)
    url = get_next_page(BeautifulSoup(listing_page.content, 'lxml'))

# Combining all shoe data into a single dataframe; ignore_index gives `df` a
# clean 0..n-1 index instead of repeating each page's own 0..k index.
df = pd.concat([pd.DataFrame(data) for data in all_shoes_data], ignore_index=True)

# Saving dataframe to a CSV file
df.to_csv(output_path, index=False)

print(f'Data successfully scraped and saved to {output_path}')


Data successfully scraped and saved to hotel_data.csv
