In [2]:
# Import required libraries
import pandas as pd
import numpy as np

# Relative path of the sales CSV that feeds the rest of the notebook
file_path = "Sales_April_2019.csv"

# Read the file; rows pandas cannot parse are skipped instead of raising
df = pd.read_csv(filepath_or_buffer=file_path, on_bad_lines='skip')

# Quick look at what was loaded: dimensions first, then the leading rows
print("Initial shape:", df.shape)
print(df.head())

Initial shape: (18383, 6)
  Order ID                     Product Quantity Ordered Price Each  \
0   176558        USB-C Charging Cable                2      11.95   
1      NaN                         NaN              NaN        NaN   
2   176559  Bose SoundSport Headphones                1      99.99   
3   176560                Google Phone                1        600   
4   176560            Wired Headphones                1      11.99   

       Order Date                      Purchase Address  
0  04/19/19 08:46          917 1st St, Dallas, TX 75001  
1             NaN                                   NaN  
2  04/07/19 22:30     682 Chestnut St, Boston, MA 02215  
3  04/12/19 14:38  669 Spruce St, Los Angeles, CA 90001  
4  04/12/19 14:38  669 Spruce St, Los Angeles, CA 90001  


In [3]:
def fill_missing_with_mean(df):
    """
    Return a copy of ``df`` whose numeric NaN values are replaced by the
    mean of their column.

    The input frame is left untouched, so the cell that calls this function
    can be re-run without the raw data having been altered underneath it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Only columns with a NumPy numeric dtype are filled;
        every other column (strings, objects, datetimes, ...) is copied as is.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, order and index as ``df``.
        A numeric column that is entirely NaN has a NaN mean and therefore
        stays NaN.
    """
    # Work on a copy: assigning into `df` directly would mutate the caller's frame.
    result = df.copy()

    numeric_cols = result.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # Nothing numeric to fill (e.g. every column was read as text).
        return result

    numeric_part = result[numeric_cols]
    result[numeric_cols] = numeric_part.fillna(numeric_part.mean())
    return result

In [4]:
def drop_duplicates(df):
    """
    Return ``df`` without repeated rows.

    When several rows are identical across all columns, only the earliest
    one is retained. The result gets a fresh 0..n-1 index; the old index
    is discarded rather than kept as a column.
    """
    unique_rows = df.drop_duplicates(keep="first")
    renumbered = unique_rows.reset_index(drop=True)
    return renumbered

In [5]:
# Cleaning pipeline, innermost call first:
#   1. mean-fill NaNs in numeric columns
#   2. drop duplicate rows and renumber the index
# NOTE(review): only numeric-dtype columns are filled, so NaNs in any column
# that is not numeric will still be present afterwards - confirm the dtypes
# of `df` if the report below shows leftover missing values.
df_clean = drop_duplicates(fill_missing_with_mean(df))

# Report the resulting dimensions and the per-column count of missing values
print("Cleaned shape:", df_clean.shape)
print("Missing values remaining:\n", df_clean.isnull().sum())

Cleaned shape: (18269, 6)
Missing values remaining:
 Order ID            1
Product             1
Quantity Ordered    1
Price Each          1
Order Date          1
Purchase Address    1
dtype: int64


In [6]:
# Column labels the frame should carry, in positional order
expected_columns = [
    'Order ID',
    'Product',
    'Quantity Ordered',
    'Price Each',
    'Order Date',
    'Purchase Address',
]

# Relabel positionally, but only when the current header differs from the list.
# NOTE(review): this targets `df`, not `df_clean` - confirm which frame the
# following cells are meant to use.
if df.columns.tolist() != expected_columns:
    df.columns = expected_columns