In [2]:
import pandas as pd

# Try loading each file
files = [
    "IKEA_Full_Dataset.csv",
    "IKEA_Products.csv",
    "ikea_all_products.csv",
    "ikea_all_regions.csv",
    "ikea_furniture_data.csv",
    "ikea_saudi_all_products.csv",
    "ikea_saudi_furniture_data.csv",
    "scraped_furniture_data.csv"
]

for file in files:
    try:
        df = pd.read_csv(file)
        print(f"✅ Loaded {file} successfully!")
        print(df.head(), "\n")  # Show first few rows
    except Exception as e:
        print(f"❌ Could not load {file}: {e}\n")

❌ Could not load IKEA_Full_Dataset.csv: No columns to parse from file

✅ Loaded IKEA_Products.csv successfully!
   Name  Price                                          Image URL  \
0   NaN    199  https://www.ikea.com/us/en/images/products/hae...   
1   NaN    299  https://www.ikea.com/us/en/images/products/ton...   
2   NaN     40  https://www.ikea.com/us/en/images/products/hau...   
3   NaN    160  https://www.ikea.com/us/en/images/products/tos...   
4   NaN    460  https://www.ikea.com/us/en/images/products/bes...   

                                                Link  
0  https://www.ikea.com/us/en/p/haegernaes-table-...  
1  https://www.ikea.com/us/en/p/tonstad-sideboard...  
2  https://www.ikea.com/us/en/p/hauga-chair-white...  
3  https://www.ikea.com/us/en/p/tossberg-chair-me...  
4  https://www.ikea.com/us/en/p/besta-storage-com...   

✅ Loaded ikea_all_products.csv successfully!
  Category Name  Price                                          Image URL  \
0   Chairs  NaN    

In [3]:
import pandas as pd

# Load datasets
df_products = pd.read_csv("IKEA_Products.csv")
df_regions = pd.read_csv("ikea_all_regions.csv")

# Check first few rows
print(df_products.head())
print(df_regions.head())

# Merge on 'Link' column (assuming this column uniquely identifies products)
df_merged = df_regions.merge(df_products, on="Link", how="left")

# Save the cleaned dataset
df_merged.to_csv("ikea_cleaned_data.csv", index=False)
print("✅ Cleaned dataset saved as ikea_cleaned_data.csv")

   Name  Price                                          Image URL  \
0   NaN    199  https://www.ikea.com/us/en/images/products/hae...   
1   NaN    299  https://www.ikea.com/us/en/images/products/ton...   
2   NaN     40  https://www.ikea.com/us/en/images/products/hau...   
3   NaN    160  https://www.ikea.com/us/en/images/products/tos...   
4   NaN    460  https://www.ikea.com/us/en/images/products/bes...   

                                                Link  
0  https://www.ikea.com/us/en/p/haegernaes-table-...  
1  https://www.ikea.com/us/en/p/tonstad-sideboard...  
2  https://www.ikea.com/us/en/p/hauga-chair-white...  
3  https://www.ikea.com/us/en/p/tossberg-chair-me...  
4  https://www.ikea.com/us/en/p/besta-storage-com...  
  Region Category Name Price  \
0    USA   Chairs  NaN   199   
1    USA   Chairs  NaN   299   
2    USA   Chairs  NaN    40   
3    USA   Chairs  NaN   160   
4    USA   Chairs  NaN   460   

                                           Image URL  \
0  htt

In [4]:
print(df_merged.isnull().sum())  # Count missing values per column

Region           0
Category         0
Name_x         355
Price_x          0
Image URL_x      0
Link             0
Name_y         388
Price_y        368
Image URL_y    368
dtype: int64


In [5]:
import pandas as pd

# Load merged dataset
df = pd.read_csv("ikea_cleaned_data.csv")

# Fill missing product names
df["Name_x"].fillna(df["Name_y"], inplace=True)

# Fill missing prices
df["Price_x"].fillna(df["Price_y"], inplace=True)

# Fill missing images
df["Image URL_x"].fillna(df["Image URL_y"], inplace=True)

# Drop redundant columns
df_cleaned = df.drop(columns=["Name_y", "Price_y", "Image URL_y"])

# Rename columns for clarity
df_cleaned.rename(columns={"Name_x": "Product Name", "Price_x": "Price", "Image URL_x": "Image URL"}, inplace=True)

# Save the cleaned dataset
df_cleaned.to_csv("ikea_final_data.csv", index=False)
print("✅ Cleaned dataset saved as ikea_final_data.csv")

✅ Cleaned dataset saved as ikea_final_data.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Name_x"].fillna(df["Name_y"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Price_x"].fillna(df["Price_y"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

In [6]:
import pandas as pd

# Load merged dataset
df = pd.read_csv("ikea_cleaned_data.csv")

# Correct way to fill missing values
df["Name_x"] = df["Name_x"].fillna(df["Name_y"])
df["Price_x"] = df["Price_x"].fillna(df["Price_y"])
df["Image URL_x"] = df["Image URL_x"].fillna(df["Image URL_y"])

# Drop redundant columns
df_cleaned = df.drop(columns=["Name_y", "Price_y", "Image URL_y"])

# Rename columns for clarity
df_cleaned.rename(columns={"Name_x": "Product Name", "Price_x": "Price", "Image URL_x": "Image URL"}, inplace=True)

# Save the cleaned dataset
df_cleaned.to_csv("ikea_final_data.csv", index=False)
print("✅ Cleaned dataset saved as ikea_final_data.csv")

✅ Cleaned dataset saved as ikea_final_data.csv


In [7]:
print(df_cleaned.isnull().sum())

Region            0
Category          0
Product Name    355
Price             0
Image URL         0
Link              0
dtype: int64


In [16]:
import pandas as pd
import time
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# ✅ Load dataset
df = pd.read_csv("ikea_final_data.csv")

# ✅ Filter rows where product names are missing
missing_names = df[df["Product Name"].isnull()]

# ✅ Set up Selenium (Headless Chrome)
chrome_options = Options()
chrome_options.add_argument("--headless")  
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-dev-shm-usage")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# ✅ Function to scrape product names from IKEA
def get_product_name(url):
    try:
        driver.get(url)
        time.sleep(5)  # Wait for page to load
        name = driver.find_element(By.CLASS_NAME, "pip-header-section__title").text.strip()
        return name
    except:
        return None

# ✅ Scrape missing product names
for index, row in missing_names.iterrows():
    product_name = get_product_name(row["Link"])
    if product_name:
        df.at[index, "Product Name"] = product_name
    time.sleep(2)  # Avoid getting blocked

# ✅ Close Selenium
driver.quit()

# ✅ Save updated dataset
df.to_csv("ikea_final_fixed.csv", index=False)
print("✅ Missing product names scraped and saved as ikea_final_fixed.csv")

✅ Missing product names scraped and saved as ikea_final_fixed.csv


In [18]:
print(df.isnull().sum())

Region            0
Category          0
Product Name    355
Price             0
Image URL         0
Link              0
dtype: int64


In [None]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# ✅ Load dataset
df = pd.read_csv("ikea_final_data.csv")

# ✅ Filter rows where product names are missing
missing_names = df[df["Product Name"].isnull()]

# ✅ Set up Selenium (Headless Chrome)
chrome_options = Options()
chrome_options.add_argument("--headless")  
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-dev-shm-usage")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# ✅ Function to scrape product names from IKEA
def get_product_name(url):
    try:
        print(f"🔄 Fetching: {url}")
        driver.get(url)
        time.sleep(7)  # Wait for page to fully load

        # Debug: Print first 500 characters of page source
        print(driver.page_source[:500])

        # ✅ Try different selectors
        try:
            name = driver.find_element(By.CLASS_NAME, "pip-header-section__title").text.strip()
        except:
            name = driver.find_element(By.CSS_SELECTOR, "h1.pip-header-section__title").text.strip()

        print(f"✅ Product Name Found: {name}")
        return name
    except Exception as e:
        print(f"❌ Failed to get name for {url}: {e}")
        return None

# ✅ Scrape missing product names
for index, row in missing_names.iterrows():
    product_name = get_product_name(row["Link"])
    if product_name:
        df.at[index, "Product Name"] = product_name
    time.sleep(3)  # Avoid blocking

# ✅ Close Selenium
driver.quit()

# ✅ Save updated dataset
df.to_csv("ikea_final_fixed.csv", index=False)
print("✅ Missing product names scraped and saved as ikea_final_fixed.csv")

🔄 Fetching: https://www.ikea.com/us/en/p/haegernaes-table-and-4-chairs-antique-stain-pine-70575947/
<html lang="en-US" dir="ltr" class="js-focus-visible js" data-js-focus-visible=""><head data-optly-286029aecfdd4012a678eb19bd35e0a6="" data-optly-5d71a0e8-349f-4cb8-8340-353245be15fd=""><script src="https://ct.pinterest.com/static/ct/token_create.js"></script><script type="text/javascript" async="" src="https://analytics.tiktok.com/i18n/pixel/static/identify_9d76dc36.js"></script><script async="" src="https://s.pinimg.com/ct/lib/main.8821a9da.js"></script><script type="text/javascript" async="" 
❌ Failed to get name for https://www.ikea.com/us/en/p/haegernaes-table-and-4-chairs-antique-stain-pine-70575947/: Message: no such element: Unable to locate element: {"method":"css selector","selector":"h1.pip-header-section__title"}
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-su

In [1]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# ✅ Set up Selenium
chrome_options = Options()
chrome_options.add_argument("--headless")  
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-dev-shm-usage")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# ✅ IKEA Product Page (Use any product link)
url = "https://www.ikea.com/us/en/p/haegernaes-table-and-4-chairs-antique-stain-pine-70575947/"

# ✅ Load Page
driver.get(url)
time.sleep(5)

# ✅ Print Full Page Source (Check for Product Name Element)
print(driver.page_source)

# ✅ Close Selenium
driver.quit()

<html lang="en-US" dir="ltr" class="js-focus-visible js" data-js-focus-visible=""><head data-optly-286029aecfdd4012a678eb19bd35e0a6="" data-optly-5d71a0e8-349f-4cb8-8340-353245be15fd=""><script src="https://ct.pinterest.com/static/ct/token_create.js"></script><script type="text/javascript" async="" src="https://analytics.tiktok.com/i18n/pixel/static/identify_9d76dc36.js"></script><script type="text/javascript" async="" src="https://analytics.tiktok.com/i18n/pixel/static/main.MTIwNTdiZjNmMA.js" data-id="CR6808BC77UBVEOEFGAG"></script><script async="" src="https://s.pinimg.com/ct/lib/main.8821a9da.js"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=DC-6739053&amp;l=mtDataLayer&amp;cx=c&amp;gtm=45be52q0v898469414za200&amp;tag_exp=101732282~101732284~102067808~102482433~102539968~102558064~102587591~102605417~102640600~102658453~102717422~102732003"></script><script type="text/javascript" async="" src="https://analytics.tiktok.com/i18n/pixe

In [None]:
import time
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# ✅ Load dataset
df = pd.read_csv("ikea_final_data.csv")

# ✅ Filter rows where product names are missing
missing_names = df[df["Product Name"].isnull()]

# ✅ Set up Selenium (Headless Chrome)
chrome_options = Options()
chrome_options.add_argument("--headless")  
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-dev-shm-usage")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# ✅ Function to scrape product names from JSON-LD
def get_product_name(url):
    try:
        print(f"🔄 Fetching: {url}")
        driver.get(url)
        time.sleep(5)  # Wait for page to fully load

        # ✅ Find JSON-LD script with product details
        script_tag = driver.find_element(By.ID, "pip-range-json-ld")
        json_data = json.loads(script_tag.get_attribute("innerText"))

        # ✅ Extract product name
        product_name = json_data.get("name", "N/A")
        print(f"✅ Product Name Found: {product_name}")
        return product_name
    except Exception as e:
        print(f"❌ Failed to get name for {url}: {e}")
        return None

# ✅ Scrape missing product names
for index, row in missing_names.iterrows():
    product_name = get_product_name(row["Link"])
    if product_name:
        df.at[index, "Product Name"] = product_name
    time.sleep(3)  # Prevent blocking

# ✅ Close Selenium
driver.quit()

# ✅ Save updated dataset
df.to_csv("ikea_final_fixed.csv", index=False)
print("✅ Missing product names scraped and saved as ikea_final_fixed.csv")