# Cosmetics Products Data Cleaning Project
This notebook demonstrates the process of cleaning and preparing product data for analysis.
## Steps covered
1. Handling missing values in categories and numerical data.
2. Feature engineering (Calculating Revenues).
3. Data type optimization and formatting.

## Load Dataset

In [None]:
import pandas as pd
# Load the raw product table from the CSV file in the working directory.
df = pd.read_csv("product_info_start.csv")

# Preview the first rows, then the dtype pandas inferred for each column.
print(df.head(), df.dtypes, sep="\n")

## Select Relevant Columns

In [None]:
# Columns carried forward into the working frame; everything else in `df`
# is left behind.
columns_to_keep = [
    'product_id', 'product_name',
    'brand_id', 'brand_name',
    'loves_count', 'rating', 'reviews', 'price_usd',
    'primary_category', 'secondary_category', 'tertiary_category',
]

# .copy() detaches the subset so edits to `df_cleaned` never touch `df`.
df_cleaned = df.loc[:, columns_to_keep].copy()
print(df_cleaned)

## Handle Missing Values

In [None]:
# Missing-value count per column before any cleaning.
print(df_cleaned.isna().sum())

df_cleaned = (
    df_cleaned
    # Rows without a rating or a review count are removed outright.
    .dropna(subset=['rating', 'reviews'])
    # assign() evaluates its keyword arguments in order, so the tertiary
    # fallback below already sees the filled secondary_category.
    .assign(
        secondary_category=lambda frame: frame['secondary_category'].fillna("other"),
        tertiary_category=lambda frame: frame['tertiary_category'].fillna(
            frame['secondary_category']
        ),
    )
)

# Same count after cleaning, to confirm what is still missing.
print(df_cleaned.isna().sum())

In [None]:
# Remove rows that are exact duplicates across every column, keeping the
# first occurrence of each.
df_cleaned = df_cleaned.drop_duplicates(keep="first")

## Create Revenue Metric

In [None]:
# Revenue proxy: price multiplied by the number of reviews. Only these two
# columns feed the metric, so it approximates rather than measures revenue.
df_cleaned["revenues"] = df_cleaned["price_usd"].mul(df_cleaned["reviews"])

In [None]:
# Final column order for the dataset, with 'revenues' placed directly after
# 'loves_count'.
final_columns = [
    'product_id', 'product_name',
    'brand_id', 'brand_name',
    'loves_count', 'revenues', 'rating', 'reviews', 'price_usd',
    'primary_category', 'secondary_category', 'tertiary_category',
]

df_cleaned = df_cleaned.loc[:, final_columns]
print(df_cleaned)

In [None]:
# Write the cleaned table to a CSV file; index=False leaves the row index
# out of the file.
df_cleaned.to_csv(path_or_buf="cosmetics cleaned dataset.csv", index=False)

In [None]:
# Row/column counts of the raw table versus the cleaned one.
print(f"Original shape: {df.shape}")
print(f"Final shape: {df_cleaned.shape}")

# A notebook cell renders only its final bare expression, so a bare
# `df_cleaned.describe()` here would be computed and then silently discarded.
# Print it explicitly so the summary statistics actually appear.
print(df_cleaned.describe())

# Final expression of the cell: rendered as the cell's output.
df_cleaned.head()