# Amazon Data Cleaning Automation Tool

## Overview
This notebook automates the process of cleaning and transforming data for Amazon product listings. The objective is to ensure the data meets required quality standards before analysis and reporting. 

Key tasks include:
- Removing duplicates
- Handling missing values
- Normalizing product names
- Ensuring consistent date formats


## 1. Import Libraries and Load Data

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import ast
from ast import literal_eval
from itertools import repeat
from datetime import datetime
import os
from openpyxl import Workbook
from openpyxl import load_workbook
from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font, Fill
from openpyxl.styles import numbers

In [2]:
#downloads_folder = os.path.join(os.path.expanduser("~"), "Downloads")
# List of CSV file paths
csv_filenames = [
    "../data/raw/www.amazon.com_20250313_153438.csv",
    "../data/raw/www.amazon.com_20250313_201257.csv",
    # Add all file paths here
]

# Initialize an empty list to store DataFrames
dataframes = []
# Generate full file paths for each CSV
csv_file_paths = [os.path.join(filename) for filename in csv_filenames]
# Loop through each file in the provided list and read it into a DataFrame
for csv_file_path in csv_file_paths:
    try:
        dfcon = pd.read_csv(csv_file_path)
        dataframes.append(dfcon)
    except Exception as e:
        print(f"Error reading {csv_file_path}: {e}")

# Concatenate all DataFrames into one
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)
else:
    df = pd.DataFrame()  # Empty DataFrame if no valid files are found


In [3]:
# Get current date in YYYYMMDD format
current_date = datetime.now().strftime('%Y%m%d')

# Section: Data Cleaning and Preprocessing

In [4]:
# Renaming columns
df = df.rename(columns={'Listing_price': 'regular_retail_price', 'promo_price': 'discounted_retail_price'})

# Clean the dataset to fix the issue
def clean_prices(df):
    # If `regular_retail_price` is missing, but `discounted_retail_price` exists, move the value
    df['regular_retail_price'] = df.apply(
        lambda row: row['discounted_retail_price'] if pd.isna(row['regular_retail_price']) and not pd.isna(row['discounted_retail_price']) else row['regular_retail_price'],
        axis=1
    )
    
    # Set `discounted_retail_price` to NaN where there is no discount (regular price == discounted price)
    df['discounted_retail_price'] = df.apply(
        lambda row: np.nan if row['regular_retail_price'] == row['discounted_retail_price'] else row['discounted_retail_price'],
        axis=1
    )
    
    return df

# Apply the cleaning function to the dataset
df = clean_prices(df)

# Handling missing values
df['ASIN'].fillna('None', inplace=True)
df['sellers'].fillna('[]', inplace=True)

# Dropping unnecessary columns
columns_to_drop = ['images', 'warning', 'error', 'error_code', 'job_id', 'collector_id']
df = df.drop(columns=columns_to_drop, axis=1)

# Cleaning specific columns
for i in range(len(df)):
    try:
        df.loc[i, 'bullet_points'] = df.loc[i, 'bullet_points'].replace(']', "").replace('[', "")
        df.loc[i, 'product_information'] = df.loc[i, 'product_information'].replace('{', "").replace('}', "")
        df.loc[i, 'combination'] = df.loc[i, 'combination'].replace('{', "").replace('}', "").replace(']', "").replace('[', "").replace("name", "Option")
        df.loc[i, 'sellers'] = df.loc[i, 'sellers'].replace('UnbeatableSale, Inc', "UnbeatableSale")
    except:
        continue

# Filling NaN values in specific columns
df['product_information'].fillna('None', inplace=True)
df['bullet_points'].fillna('None', inplace=True)
df['combination'].fillna('None', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ASIN'].fillna('None', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sellers'].fillna('[]', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behav

# Section: Processing Product Information Column

In [5]:
# Extracting and cleaning 'product_information' column
product_information = df[['ASIN', 'product_information']]
product_information['product_information'].fillna('{}', inplace=True)

# Standardizing certain key terms in the product information
product_information['product_information'] = product_information['product_information'].replace("Assembly required", "Assembly Required", regex=True)
product_information['product_information'] = product_information['product_information'].replace("Number of pieces", "Number of Pieces", regex=True)
product_information['product_information'] = product_information['product_information'].replace("Seat height", "Seat Height", regex=True)

# Initialize a list to hold the processed dataframes
dataframes = []

# Iterate over rows to convert product information into individual dataframes
for index, row in product_information.iterrows():
    try:
        # Clean any non-dictionary-like formats (if necessary)
        product_info = row['product_information']
        
        # Enclose the string in curly braces if they are missing
        if not (product_info.startswith('{') and product_info.endswith('}')):
            product_info = '{' + product_info.strip() + '}'
        
        # Safely evaluate the string representation of a dictionary
        data_dict = literal_eval(product_info)
        
        # If data_dict is a dictionary, proceed to create a DataFrame
        if isinstance(data_dict, dict):
            dF = pd.DataFrame(data_dict, index=[row['ASIN']])
            dataframes.append(dF)
        else:
            print(f"Non-dict data at index {index}")
    
    except (ValueError, SyntaxError) as e:
        print(f"Error converting to dictionary at index {index}: {e}")
        # Handle the error (optional: append NaN or empty DataFrame)

# Concatenate all the DataFrames into one combined DataFrame
if dataframes:
    pi_combined = pd.concat(dataframes)
else:
    print("No valid product information found.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  product_information['product_information'].fillna('{}', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_information['product_information'].fillna('{}', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pro

Non-dict data at index 7014
Non-dict data at index 7015
Non-dict data at index 7016
Non-dict data at index 7017
Non-dict data at index 7018
Non-dict data at index 7019
Non-dict data at index 7020
Non-dict data at index 7021
Non-dict data at index 7022
Non-dict data at index 7023
Non-dict data at index 7024
Non-dict data at index 7025
Non-dict data at index 7026
Non-dict data at index 7027
Non-dict data at index 7028
Non-dict data at index 7029
Non-dict data at index 7030
Non-dict data at index 7031
Non-dict data at index 7032
Non-dict data at index 7033
Non-dict data at index 7034
Non-dict data at index 7035
Non-dict data at index 7036
Non-dict data at index 7037
Non-dict data at index 7038
Non-dict data at index 7039
Non-dict data at index 7040
Non-dict data at index 7041
Non-dict data at index 7042
Non-dict data at index 7043
Non-dict data at index 7044
Non-dict data at index 7045
Non-dict data at index 7046
Non-dict data at index 7047
Non-dict data at index 7048
Non-dict data at ind

# Section: Processing Sellers and Product Information

In [6]:
# Extract relevant columns and define the converter function for 'sellers'
df_new = df[['sellers', 'ASIN']]

# Define the converter function with improved error handling
def converter_with_error_handling(record: str):
    try:
        if isinstance(record, str):
            # Correctly handle boolean values and parse the JSON
            record = record.replace("true", "True").replace("false", "False")
            parsed_data = ast.literal_eval(record)
            if isinstance(parsed_data, list):
                return parsed_data
            elif isinstance(parsed_data, dict):
                return [parsed_data]
            else:
                return []  # Return empty list for unexpected data structures
        else:
            return []  # Return empty list for non-string records
    except Exception:
        return []  # Return empty list for rows with errors


# Apply the converter function to clean the 'sellers' column
df_new['sellers'] = df_new['sellers'].map(lambda x: converter_with_error_handling(x), na_action='ignore')

# Filter rows with valid 'sellers' data
valid_rows = df_new[df_new['sellers'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

# Process the valid rows to expand the sellers data
dataframeS = []
for i in valid_rows.index:
    try:
        # Repeat ASINs for the number of sellers
        asin_list = list(repeat(valid_rows.loc[i, 'ASIN'], len(valid_rows.loc[i, 'sellers'])))
        # Convert sellers list into a DataFrame
        target = pd.DataFrame(valid_rows.loc[i, 'sellers'], index=asin_list).iloc[::-1]  # Reverse order
        dataframeS.append(target)
    except Exception as e:
        print(f"Error processing row {i}: {e}")

# Concatenate all processed DataFrames
if dataframeS:
    total = pd.concat(dataframeS)
    total.index.name = 'ASIN'
    total = total.rename(columns={"name": "seller_name"})
    total = total.reset_index()
else:
    total = pd.DataFrame()  # Empty DataFrame if no valid rows

# Display the final processed DataFrame
#print(total)

# Merging with main DataFrame and product information
df = pd.merge(df, total, on='ASIN', how='left')
df = pd.merge(df, pi_combined, on='ASIN', how='left')

# Selecting and renaming columns
df = df.rename(columns={'price': 'seller_price'})
df.drop_duplicates(inplace=True)

# Exporting to Excel
filename = f"amazon_retail_data_{current_date}.xlsx"
# Save the DataFrame to Excel with the new filename
df.to_excel(f"../data/processed/{filename}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['sellers'] = df_new['sellers'].map(lambda x: converter_with_error_handling(x), na_action='ignore')


# Excel Formatting and Styling

In [7]:
# Load the workbook and select the active worksheet
wb = load_workbook(filename=f"../data/processed/{filename}")
ws = wb.active

# Apply auto filter to the worksheet
ws.auto_filter.ref = ws.dimensions

# Define common styles
font = Font(size=15, bold=True)
wrap_alignment = Alignment(wrapText=True)
left_alignment = Alignment(horizontal='left')
fill = PatternFill("solid", fgColor="00CCFFCC")
thin_border = Border(
    top=Side(border_style='thin', color="FF000000"),
    bottom=Side(border_style='thin', color="FF000000"),
    left=Side(border_style='thin', color="FF000000"),
    right=Side(border_style='thin', color="FF000000")
)

# Set row heights and apply left alignment to all rows
last_row = ws.max_row
for i in range(2, last_row + 1):
    ws.row_dimensions[i].height = 15

# Apply number format to specific columns
for col in ["B", "AJ"]:
    for cell in ws[col]:
        cell.number_format = numbers.FORMAT_NUMBER

# Apply alignment, border, and wrapping to all cells
for row in ws.iter_rows(min_row=1, max_row=last_row):
    for cell in row:
        cell.alignment = left_alignment  # Left alignment for all cells
        cell.border = thin_border        # Thin border for all cells
        cell.alignment = wrap_alignment  # Enable text wrapping for all cells

# Apply font and fill to header (first row)
for cell in ws["1:1"]:
    cell.font = font
    cell.fill = fill

# Freeze the top row and first column
ws.freeze_panes = ws["B2"]

# Set a standard column width for all columns
for col in ws.columns:
    ws.column_dimensions[col[0].column_letter].width = 30

# Save the formatted workbook
styled_file_path = os.path.join(f"../data/processed/amazon_retail_data_{current_date}_styled.xlsx")
wb.save(styled_file_path)

print(f"Styled Excel file saved to: {styled_file_path}")

Styled Excel file saved to: ../data/processed/amazon_retail_data_20250313_styled.xlsx
