# Wayfair Data Cleaning Automation Tool

## Overview
This notebook automates the process of cleaning and transforming data for Wayfair product listings. The objective is to ensure the data meets required quality standards before analysis and reporting. 

Key tasks include:
- Removing duplicates
- Handling missing values
- Normalizing product names
- Ensuring consistent date formatsimport pandas as pd


# Importing necessary libraries

In [2]:
import pandas as pd
import json
import os
import re
from datetime import datetime
from ast import literal_eval
from openpyxl import Workbook, load_workbook
from openpyxl.styles import PatternFill, Border, Side, Alignment, Font, numbers

# Define paths for files

In [3]:
json_filename = "../data/raw/snap_m82yajn42myeb4szeo.json"
json_file_path = os.path.join(json_filename)

# Data loading and preprocessing

In [4]:
df = pd.read_json(json_file_path)

In [6]:
def extract_sku_from_url(url: str):
    try:
        sku = url.split('?')[0].split('-')[-1].split('.')[0]
        return sku
    except Exception as e:
        print(f"Error extracting SKU from URL: {e}")
        return 'none'

In [7]:
# Apply SKU extraction function
df['wayfair_sku'] = df['url'].map(extract_sku_from_url)
# Drop unnecessary columns and remove duplicates
df.drop(['reviews'], axis=1, inplace=True)
# Convert any columns with lists or dictionaries to strings for drop_duplicates to work
df = df.applymap(lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x)
# Drop duplicates
df.drop_duplicates(inplace=True)
# Rename columns for consistency
df = df.rename(columns={'promo_price': 'discounted_retail_price', 'breadcrumbs': 'category'})
df['regular_price'].fillna('{"value": 0, "currency": "USD", "symbol": "$"}', inplace=True)

  df = df.applymap(lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['regular_price'].fillna('{"value": 0, "currency": "USD", "symbol": "$"}', inplace=True)


# Extracting product specifications and features

In [8]:
# Handle regular price column (parsing JSON safely)
def safe_json_loads(x):
    if pd.isna(x):  
        return {"value": 0, "currency": "USD", "symbol": "$"}
    try:
        return json.loads(x)
    except json.JSONDecodeError:
        return {}

In [9]:
# Apply safe_json_loads to the 'specifications' column
df['specifications_dict'] = df['specifications'].apply(safe_json_loads)

In [10]:
# Apply the safe json loading function
df['regular_price_dict'] = df['regular_price'].apply(safe_json_loads)
# Normalize the 'regular_price_dict' column and drop unused columns
regular_price_df = pd.json_normalize(df['regular_price_dict'])
df = pd.concat([df, regular_price_df], axis=1)
df.drop(['regular_price', 'currency', 'symbol'], axis=1, inplace=True)


In [11]:
# Parse 'specifications_details' from the 'specifications' column
def extract_specifications_details(specifications):
    if isinstance(specifications, dict):
        return specifications.get('specifications_details', None)
    else:
        return None


In [12]:
# Apply the function to extract 'specifications_details'
df['specifications_details'] = df['specifications_dict'].apply(extract_specifications_details)

In [13]:
def extract_features(s):
    if not isinstance(s, str):  
        return {}  
    
    # Extract the 'Features' section
    features_str = re.search(r'Features:\s*\[(.*?)\],', s)
    
    if features_str:
        features_str = features_str.group(1)
        
        # Split into individual feature key-value pairs
        features_list = features_str.split(', ')
        
        # Create a dictionary by splitting on the first ': ' for each pair
        features_dict = {}
        for feature in features_list:
            key_value = feature.split(': ', 1)  
            if len(key_value) == 2:
                key, value = key_value
                features_dict[key.strip()] = value.strip()  
        
        return features_dict
    else:
        return {}

In [14]:
def dollar_remover(record:str):
    try:
        price =  record.replace("$","")
        return price
    except:
        return 'none'

In [15]:
# Apply the extract_features function
df['parsed_features'] = df['specifications_details'].apply(extract_features)

# Normalize parsed_features to combine
df_features = pd.json_normalize(df['parsed_features'])

# Concatenate the parsed features back to the original DataFrame
df_combined = pd.concat([df, df_features], axis=1)


In [16]:
# Remove $ sign
df_combined['discounted_retail_price'] = df_combined['discounted_retail_price'].map(lambda x: dollar_remover(x),na_action='ignore')

In [17]:
df_combined = df_combined.rename(columns={'value':'regular_retail_price'})

In [18]:
# Drop unnecessary columns
df_combined.drop(['regular_price_dict', 'specifications_details', 'specifications_dict','parsed_features','specifications','input_url'], axis=1, inplace=True)

In [19]:
# Essential columns
essential_columns = ['wayfair_sku', 'availability', 'average_ratings', 'category', 'combinations', 'description','regular_retail_price','discounted_retail_price']

# The remaining columns by excluding the essential ones
remaining_columns = [col for col in df_combined.columns if col not in essential_columns]

# Combine essential columns with the remaining columns
selected_columns = essential_columns + remaining_columns

In [20]:
df_final = df_combined[selected_columns]

# Excel file output

In [21]:
# Save the DataFrame to an Excel file in the Downloads folder
current_date = datetime.now().strftime("%Y%m%d")
output_filename = f"../data/processed/wayfair_retail_data_{current_date}.xlsx"
output_path = os.path.join(output_filename)

df_final.to_excel(output_path, index=False)
print(f"Data saved to {output_path}")

Data saved to ../data/processed/wayfair_retail_data_20250313.xlsx


# Excel Formatting and Styling

In [22]:
file_path = os.path.join(output_filename)

In [24]:
# Load the workbook and select the active worksheet
wb = load_workbook(filename=file_path)
ws = wb.active

# Apply auto filter to the worksheet
ws.auto_filter.ref = ws.dimensions

# Define common styles
font = Font(size=15, bold=True)
wrap_alignment = Alignment(wrapText=True)
left_alignment = Alignment(horizontal='left')
fill = PatternFill("solid", fgColor="00CCFFCC")
thin_border = Border(
    top=Side(border_style='thin', color="FF000000"),
    bottom=Side(border_style='thin', color="FF000000"),
    left=Side(border_style='thin', color="FF000000"),
    right=Side(border_style='thin', color="FF000000")
)

# Set row heights and apply left alignment to all rows
last_row = ws.max_row
for i in range(2, last_row + 1):
    ws.row_dimensions[i].height = 15

# Apply number format to specific columns
for col in ["B", "AJ"]:
    for cell in ws[col]:
        cell.number_format = numbers.FORMAT_NUMBER

# Apply alignment, border, and wrapping to all cells
for row in ws.iter_rows(min_row=1, max_row=last_row):
    for cell in row:
        cell.alignment = left_alignment  
        cell.border = thin_border        
        cell.alignment = wrap_alignment  

# Apply font and fill to header (first row)
for cell in ws["1:1"]:
    cell.font = font
    cell.fill = fill

# Freeze the top row and first column
ws.freeze_panes = ws["B2"]

# Set a standard column width for all columns
for col in ws.columns:
    ws.column_dimensions[col[0].column_letter].width = 30

# Save the formatted workbook
styled_file_path = os.path.join(f"../data/processed/wayfair_retail_data_{current_date}_styled.xlsx")
wb.save(styled_file_path)

print(f"Styled Excel file saved to: {styled_file_path}")

Styled Excel file saved to: ../data/processed/wayfair_retail_data_20250313_styled.xlsx


#### Similar Items

In [25]:
def safe_json_loads(item):
    if pd.isna(item):  # Check if the item is NaN or None
        return None
    try:
        return json.loads(item)  # Attempt to parse JSON
    except json.JSONDecodeError:
        return None  # Return None if JSON parsing fails

# Ensure unique column names in the DataFrame
df_final = df_final.loc[:, ~df_final.columns.duplicated()]

# Apply the safe function to the 'similar_items' column
df_final['parsed_similar_items'] = df_final['similar_items'].apply(safe_json_loads)

# Expand the rows of the parsed_similar_items column
expanded_rows = df_final.explode('parsed_similar_items')

# Normalize the JSON data and retain the wayfair_sku column
normalized_df = pd.json_normalize(expanded_rows['parsed_similar_items'])
normalized_df['wayfair_sku'] = expanded_rows['wayfair_sku'].values
normalized_df.to_excel('similar_items_wayfair.xlsx')