In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://capitalwholesalediamonds.com/product-category/cushion/"

# GET request to the URL
response = requests.get(url)
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")
#check status code
response

<Response [200]>

In [2]:
product_list = soup.find_all("div", class_="ftc-product product")
all =[]

# Iterate over each product in the product list
for product in product_list:
    # Extract the product URL
    product_url = product.find("a")["href"]
    
    # GET request to the product URL
    response = requests.get(product_url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the price
    price = soup.find('span', {'class':'woocommerce-Price-amount amount'}).text
    
    # Extract the description and split it into individual lines
    description = soup.find('div', id='tab-description').p.text.split('\n')

    # Extract specific details from the description using split and indexing
    product_id = description[0].split(':')[1]
    shape = description[1].split(':')[1]
    carat = description[2].split(':')[1]
    clarity = description[3].split(':')[1]
    colour = description[4].split(':')[1]
    cut = description[5].split(':')[1]
    polish = description[6].split(':')[1]
    symmetry = description[6].split(':')[1]
    fluorescence = description[7].split(':')[1]
    measurements = description[8].split(':')[1]

    # Append the extracted data to the 'all' list
    all.append([product_id, shape, carat, clarity, colour, cut, polish, symmetry, fluorescence, measurements, price, product_url])


**Pandas Dataframe**

In [6]:
cushion_df = pd.DataFrame(all, columns=['Product_id', 'Shape', 'Weight', 'Clarity', 'Colour', 'Cut', 'Polish',
       'Symmetry', 'Fluorescence', 'Messurements', 'Price', 'Product_url'])
cushion_df.head()

Unnamed: 0,Product_id,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Messurements,Price,Product_url
0,2106452,CUSHION,0.55,SI2,N,VG,EX,EX,VG,N,$673.61,https://capitalwholesalediamonds.com/product/0...
1,2042329,CUSHION,0.52,SI2,Y-Z,EX,EX,EX,VG,F,$735.67,https://capitalwholesalediamonds.com/product/0...
2,2055268,CUSHION,0.5,SI1,L,VG,EX,EX,VG,N,$768.14,https://capitalwholesalediamonds.com/product/0...
3,2128779,CUSHION,0.5,VS2,M,EX,EX,EX,VG,F,$770.64,https://capitalwholesalediamonds.com/product/0...
4,2103991,CUSHION,0.51,SI1,M,EX,EX,EX,VG,N,$784.13,https://capitalwholesalediamonds.com/product/0...


In [12]:
#Shape of the dataframe
cushion_df.shape

(15, 12)

The DataFrame cushion_df contains information on 15 products and 12 extracted features from a single page.

**Store output in Excel**

In [28]:
cushion_df.to_excel('single_page_cushion.xlsx', index=False)

**Scraping mulitple pages**

In [17]:

urls = ["https://capitalwholesalediamonds.com/product-category/cushion/page/" + str(i) for i in range(1, 101)]
all_data = []

# Iterate through the URLs
for url in urls:
    # Send a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    # Find all the product divs in the HTML
    product_list = soup.find_all("div", class_="ftc-product product")
    
    # Iterate through each product
    for product in product_list:
        # Extract the product URL
        product_url = product.find("a")["href"]
        # Send a GET request to the product URL
        response = requests.get(product_url)
        # Parse the HTML content of the product page
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract the desired information from the product page
        price = soup.find('span', {'class':'woocommerce-Price-amount amount'}).text
        description = soup.find('div', id='tab-description').p.text.split('\n')
        product_id = description[0].split(':')[1]
        shape = description[1].split(':')[1]
        carat = description[2].split(':')[1]
        clarity = description[3].split(':')[1]
        colour = description[4].split(':')[1]
        cut = description[5].split(':')[1]
        polish = description[6].split(':')[1]
        symmetry = description[6].split(':')[1]
        fluorescence = description[7].split(':')[1]
        measurements = description[8].split(':')[1]
        
        # Append the extracted data to the all_data list
        all_data.append([product_id, shape, carat, clarity, colour, cut, polish, symmetry, fluorescence, measurements, price, product_url])


In [18]:
cushion_multiple_df = pd.DataFrame(all_data, columns=['Product_id', 'Shape', 'Weight', 'Clarity', 'Colour', 'Cut', 'Polish', 'Symmetry', 'Fluorescence', 'Measurements', 'Price', 'Product_url'])
cushion_multiple_df.head()

Unnamed: 0,Product_id,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Measurements,Price,Product_url
0,2106452,CUSHION,0.55,SI2,N,VG,EX,EX,VG,N,$673.61,https://capitalwholesalediamonds.com/product/0...
1,2042329,CUSHION,0.52,SI2,Y-Z,EX,EX,EX,VG,F,$735.67,https://capitalwholesalediamonds.com/product/0...
2,2055268,CUSHION,0.5,SI1,L,VG,EX,EX,VG,N,$768.14,https://capitalwholesalediamonds.com/product/0...
3,2128779,CUSHION,0.5,VS2,M,EX,EX,EX,VG,F,$770.64,https://capitalwholesalediamonds.com/product/0...
4,2103991,CUSHION,0.51,SI1,M,EX,EX,EX,VG,N,$784.13,https://capitalwholesalediamonds.com/product/0...


In [19]:
cushion_multiple_df.shape

(1500, 12)

The DataFrame cushion_multiple_df contains information on 1500 products and 12 extracted features from multiple pages.

**Store results in excel**

In [21]:
cushion_multiple_df.to_excel('multiple_page_cushion.xlsx', index=False)