# Data Cleaning & Preprocessing

### 1. Import Required Libraries

In [8]:
import pandas as pd
import numpy as np

### 2. Load the Dataset

In [9]:
# Read the raw transactions CSV, decoding it as ISO-8859-1 (Latin-1).
# Identifier columns are forced to `str` so they are never parsed as numbers.
# The dtype keys must match real column names: the invoice column in this
# dataset is "InvoiceNo" (there is no "InvoiceID" column), so that key is used.
# TODO(review): the path is absolute and machine-specific; consider a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv(
    '/Users/naveenapaleti/Projects/ShopTrack360/data/data.csv',
    encoding="ISO-8859-1",
    dtype={'CustomerID': str, 'InvoiceNo': str},
)

### 3. Initial Data Overview

In [10]:
# Show (row count, column count) of the frame as loaded, before any cleaning.
print("Initial Shape:", (len(df.index), len(df.columns)))

Initial Shape: (541909, 8)


In [13]:
# Per-column count of null cells, printed under a heading line.
print("Missing Values:", df.isnull().sum(), sep="\n")

Missing Values:
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64


In [15]:
# Column dtypes as inferred (or forced) at load time, printed under a heading line.
print("Data Types:", df.dtypes, sep="\n")

Data Types:
InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID      object
Country         object
dtype: object


In [16]:
# Number of rows that exactly repeat an earlier row across all columns.
print("Duplicate Rows:", df.duplicated().sum(), sep="\n")

Duplicate Rows:
5268


### 4. Convert 'InvoiceDate' to datetime

In [17]:
# Parse the InvoiceDate column into pandas datetimes (no explicit format is
# given, so pandas infers it), then write the parsed values back over the column.
invoice_timestamps = pd.to_datetime(df["InvoiceDate"])
df["InvoiceDate"] = invoice_timestamps

### 5. Drop rows with missing CustomerID (essential for customer-level analysis)

In [18]:
# Discard every row whose CustomerID is null; all other columns are ignored
# when deciding which rows to drop.
df = df.dropna(
    axis="index",
    how="any",
    subset=["CustomerID"],
)

### 6. Remove Duplicates

In [19]:
# Collapse rows that are identical across all columns, keeping the first
# occurrence of each and leaving the surviving index labels untouched.
df = df.drop_duplicates(keep="first", ignore_index=False)

### 7. Remove rows with Quantity <= 0 or UnitPrice <= 0 (treated as invalid transactions for this analysis)

In [21]:
# Keep only rows where both the quantity and the unit price are strictly positive.
has_positive_quantity = df["Quantity"] > 0
has_positive_price = df["UnitPrice"] > 0
df = df[has_positive_quantity & has_positive_price]

### 8. Reset Index after cleaning

In [23]:
# The filters above leave gaps in the row labels; replace them with a fresh
# 0..n-1 range and discard the old labels rather than keeping them as a column.
df = df.set_axis(pd.RangeIndex(len(df)), axis="index")

### 9. Final Overview

In [24]:
# Show (row count, column count) of the frame after all cleaning steps.
print("Cleaned Shape:", (len(df.index), len(df.columns)))

Cleaned Shape: (392692, 8)


In [25]:
# Re-check the per-column null counts on the cleaned frame.
print("Missing Values After Cleaning:", df.isnull().sum(), sep="\n")

Missing Values After Cleaning:
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64


In [27]:
# Re-count fully duplicated rows (first occurrence not counted) on the cleaned frame.
print(
    "Duplicate Rows After Cleaning:",
    df.duplicated(keep="first").sum(),
)

Duplicate Rows After Cleaning: 0


In [28]:
# Persist the cleaned frame as CSV without writing the row index as a column.
df.to_csv(
    path_or_buf="/Users/naveenapaleti/Projects/ShopTrack360/data/cleaned_ecommerce_data.csv",
    index=False,
)