# Amazon Product Data Cleaning
This notebook cleans the scraped Amazon product data by handling missing values, converting data types, removing duplicates, and standardizing text.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np


## Load the Data

In [3]:
# Load the scraped product listing into a DataFrame.
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here; keeping every import in that one cell keeps dependencies visible.
# The raw string keeps the Windows backslashes literal.
# NOTE(review): this is an absolute, machine-specific path — anyone else running the
# notebook has to edit it; consider a path relative to the notebook.
file_path = r'D:\Assignment\amazon_sponsored_soft_toys.csv'
df = pd.read_csv(file_path)

df.head()  # Preview the first 5 rows


Unnamed: 0,Title,Brand,Rating,Reviews,Price (₹),Image URL,Product URL
0,,,0.0,0,,,
1,Storio Toy Octopus Plushie Reversible Soft Toy...,,4.2,1585,178.0,https://m.media-amazon.com/images/I/51nqXz7iWr...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
2,Storio Toy Cute Penguin Plushie Soft Toys for ...,,4.3,145,199.0,https://m.media-amazon.com/images/I/51i6bQTaMH...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
3,ADORA Adorable Pou Alien Cute Stuffed Toy - 22...,,2.8,7,789.0,https://m.media-amazon.com/images/I/51JRfo+dlD...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
4,Meera The Blue Astronaut Soft Plush Toy - A St...,,5.0,2,1499.0,https://m.media-amazon.com/images/I/51NcFOWL8I...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...


## Handling Missing Values

In [4]:
# Fill gaps in the non-essential columns, then drop rows missing an essential field.
#
# The fill values are real numbers rather than the string '0', so Rating, Reviews and
# the price column do not end up holding a mix of strings and numbers.
# The price column in this dataset is named 'Price (₹)'. DataFrame.fillna silently
# skips dict keys that are not columns, so a 'Selling Price' key fills nothing.
fill_defaults = {'Brand': 'Unknown', 'Reviews': 0, 'Rating': 0.0, 'Price (₹)': 0.0}
df = df.fillna(fill_defaults)

# Title and Product URL are treated as essential: rows lacking either are removed.
df = df.dropna(subset=['Title', 'Product URL'])


## Convert Data Types

In [5]:
# Make Rating a float column: values that cannot be parsed as numbers are coerced
# to NaN first, and those NaNs are then replaced with 0.
rating_numeric = pd.to_numeric(df['Rating'], errors='coerce')
df['Rating'] = rating_numeric.fillna(0).astype(float)


## Remove Duplicates

In [6]:
# Two rows count as the same product when both Title and Product URL match;
# the first occurrence is kept and later ones are dropped in place.
# NOTE(review): the URLs shown above are sponsored click links ('/sspa/click?...');
# confirm they are stable per product, otherwise this key will rarely match.
dedupe_keys = ['Title', 'Product URL']
df.drop_duplicates(subset=dedupe_keys, keep='first', inplace=True)

## Standardize Text Formatting

In [7]:
# Apply title case to the free-text columns. Note that str.title() also lowercases
# the remaining letters of each word, so all-caps tokens do not stay all-caps.
for text_column in ('Title', 'Brand'):
    df[text_column] = df[text_column].str.title()

## Save the Cleaned Data

In [8]:
# Write the cleaned frame to a CSV next to the notebook, without the index column.
# 'utf-8-sig' prepends a byte-order mark to the file — presumably so spreadsheet
# applications detect UTF-8 for the '₹' in the price header (confirm if it matters).
cleaned_file_path = 'cleaned_products.csv'
df.to_csv(
    path_or_buf=cleaned_file_path,
    encoding='utf-8-sig',
    index=False,
)

print(f'Cleaned data saved to {cleaned_file_path}')


Cleaned data saved to cleaned_products.csv
