In [16]:
# importing the necessary libraries
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

In [17]:
driver = webdriver.Chrome()
try:
    driver.get("https://www.ikea.com/se/en/offers/family-offers/")

    # Give the page time to load, then keep scrolling to the bottom until the
    # document height stops growing, so any additional products are loaded
    # before the page source is parsed.
    time.sleep(5)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # wait for content triggered by the scroll
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was added.
            break
        last_height = new_height
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always close the browser, even when navigation or scripting fails,
    # so no browser/driver process is left running.
    driver.quit()

In [18]:
# Collect every <div> whose class is "plp-fragment-wrapper" and count them.
offers = soup.find_all("div", attrs={"class": "plp-fragment-wrapper"})
len(offers)

23

In [19]:
# Check which container type holds the matched elements.
type(offers)

bs4.element.ResultSet

In [20]:
# Inspect the raw HTML of the first match to see which tags and attributes hold the data.
offers[0]

<div class="plp-fragment-wrapper"><div class="plp-mastercard" data-cs-capture="" data-currency="SEK" data-price="995" data-product-compact="" data-product-name="GUNNÖN" data-product-number="20468855" data-ref-id="20468855" data-testid="plp-product-card"><div class="plp-mastercard__item plp-mastercard__compare"></div><div class="plp-mastercard__item plp-mastercard__image"><a aria-disabled="false" aria-hidden="true" class="plp-product__image-link link" href="https://www.ikea.com/se/en/p/gunnoen-gazebo-dark-grey-grey-20468855/"><img class="plp-image plp-product__image" loading="lazy" src="https://www.ikea.com/se/en/images/products/gunnoen-gazebo-dark-grey-grey__0961342_pe807499_s5.jpg?f=xxs"/><img class="image plp-product__image plp-product__image--alt" loading="lazy" src="https://www.ikea.com/se/en/images/products/gunnoen-gazebo-dark-grey-grey__0961345_pe807502_s5.jpg?f=xxs"/></a></div><div class="plp-mastercard__item plp-mastercard__price"><a aria-disabled="false" class="plp-price-link-

In [21]:
# As a demonstration, scrape a few relevant fields for the displayed products.
# Every field is accumulated in its own list while walking over the offers.
product_title, product_currency, product_price = [], [], []
product_id, product_link, product_image = [], [], []

for offer in offers:
    # The first <div> inside the wrapper carries the data-* attributes;
    # .get() yields None for any attribute that is absent.
    card_attrs = offer.find("div").attrs
    product_title.append(card_attrs.get("data-product-name"))
    product_currency.append(card_attrs.get("data-currency"))
    product_price.append(card_attrs.get("data-price"))
    product_id.append(card_attrs.get("data-ref-id"))

    # The first anchor in the wrapper provides the link.
    product_link.append(offer.find("a").attrs.get("href"))

    # Zero or more image sources are added per offer, depending on the matches.
    product_image.extend(
        img["src"]
        for img in offer.find_all("img", class_="plp-image plp-product__image")
    )

In [22]:
# Sanity check: look at the length of each list (and, when needed, a sample
# of its content, e.g. product_link[0:5]).
len(product_image)

22

I realized that different articles share the same product name, so I will add a new column named "product_description" for more clarity.

In [23]:
# Collect the description text found in the <span> nested inside each offer's
# price-module heading.
product_descr = []

for offer in offers:
    heading = offer.find("h3", class_="plp-price-module__name")
    if heading is None:
        # Wrappers without this heading contribute no entry at all.
        continue
    description_span = heading.find("span", class_="plp-price-module__description")
    # Store a real None (not the string "None") when the description is absent,
    # so pandas treats it as a missing value in .info() / .isna().
    product_descr.append(
        description_span.get_text() if description_span is not None else None
    )

product_descr[0:5]

['Gazebo, 238x233 cm',
 'Pergola, 300x300 cm',
 'Gazebo, 300x300 cm',
 'Gazebo with curtains and net',
 'Cabinet in/outdoor, 90x161 cm']

I am satisfied with the scraping part.

Now, I can see that some of the lists have 22 entries, while others have 23. 
Upon checking the webpage, I can see that there are 22 products plus an additional banner in the middle. 
I will remove the banner's entry from the lists that have one element in excess. Then, I will be able to create a dataframe. 

In [24]:
def _without_none(values):
    """Return a new list holding the items of `values` that are not None."""
    return [value for value in values if value is not None]


# Non-product link (login reminder) that has to be dropped from the link list.
login_reminder_url = "https://www.ikea.com/se/en/profile/login?utm_source=family-offers&utm_medium=login-reminder&utm_campaign=login&from=https://www.ikea.com/se/en/offers/family-offers/"

# Lists built from data-* attributes: remove the None placeholders.
product_id = _without_none(product_id)
product_title = _without_none(product_title)
product_currency = _without_none(product_currency)
product_price = _without_none(product_price)

# Link list: remove the non-product URL.
product_link = [link for link in product_link if link != login_reminder_url]

In [25]:
# Combine the lists into a single table; each keyword becomes a column name
# and the columns appear in the order given here.
products = pd.DataFrame(
    dict(
        product_id=product_id,
        product_title=product_title,
        product_description=product_descr,
        currency=product_currency,
        product_price=product_price,
        product_link=product_link,
        image_link=product_image,
    )
)

In [26]:
# Preview the first rows of the assembled table.
products.head()

Unnamed: 0,product_id,product_title,product_description,currency,product_price,product_link,image_link
0,20468855,GUNNÖN,"Gazebo, 238x233 cm",SEK,995,https://www.ikea.com/se/en/p/gunnoen-gazebo-da...,https://www.ikea.com/se/en/images/products/gun...
1,20549239,HAMMARÖN,"Pergola, 300x300 cm",SEK,3595,https://www.ikea.com/se/en/p/hammaroen-pergola...,https://www.ikea.com/se/en/images/products/ham...
2,50468854,HIMMELSÖ,"Gazebo, 300x300 cm",SEK,2795,https://www.ikea.com/se/en/p/himmelsoe-gazebo-...,https://www.ikea.com/se/en/images/products/him...
3,s19414123,HIMMELSÖ,Gazebo with curtains and net,SEK,3895,https://www.ikea.com/se/en/p/himmelsoe-gazebo-...,https://www.ikea.com/se/en/images/products/him...
4,20345633,KOLBJÖRN,"Cabinet in/outdoor, 90x161 cm",SEK,1395,https://www.ikea.com/se/en/p/kolbjoern-cabinet...,https://www.ikea.com/se/en/images/products/kol...


I will now continue with my data inspection and some cleaning.

In [27]:
# Summarize the columns: non-null counts and data types.
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           22 non-null     object
 1   product_title        22 non-null     object
 2   product_description  22 non-null     object
 3   currency             22 non-null     object
 4   product_price        22 non-null     object
 5   product_link         22 non-null     object
 6   image_link           22 non-null     object
dtypes: object(7)
memory usage: 1.3+ KB


In [28]:
# No missing values are reported, but a few data types need to change.
# Some product IDs contain a stray "s" (e.g. "s19414123"), which has to be
# removed before the integer cast.
# Casting through str first keeps this cell re-runnable: after one run the
# column is already numeric and would otherwise have no .str accessor.
# NOTE(review): an integer cast drops leading zeros — confirm IDs never start with 0.
products["product_id"] = (
    products["product_id"]
    .astype(str)
    .str.replace("s", "", regex=False)  # literal match, independent of the pandas `regex` default
    .astype("int")
)
products["product_price"] = products["product_price"].astype("float")
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   product_id           22 non-null     int32  
 1   product_title        22 non-null     object 
 2   product_description  22 non-null     object 
 3   currency             22 non-null     object 
 4   product_price        22 non-null     float64
 5   product_link         22 non-null     object 
 6   image_link           22 non-null     object 
dtypes: float64(1), int32(1), object(5)
memory usage: 1.2+ KB


Now the dataframe is ready to be analysed. 