In [1]:
import numpy as np
import pandas as pd

### Mini-Project Overview: Retail Sales Data Wrangling
- Let's walk through the entire data wrangling process on a mock retail sales dataset.
- We’ll cover steps such as loading, cleaning, transforming, and documenting the data.

**Dataset:**
- Here, we’ll simulate a dataset of retail sales containing product and transaction information with columns such as:
    - 'OrderID', 'Product', 'Category', 'Quantity', 'Price', 'Discount', 'Total', 'OrderDate', and 'CustomerID'.
- This sample dataset will have missing values, inconsistent formats, and other issues to address in the data wrangling process.

**Loading the Data**

In [2]:
# Sample data
data = {
    'OrderID': [1001, 1002, 1003, 1004, 1005, 1006, 1007, None, 1009, 1010],
    'Product': ['Laptop', 'Tablet', 'Laptop', 'Phone', 'Tablet', 'Laptop', 'Phone', 'Tablet', 'Phone', 'Laptop'],
    'Category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', None, 'Electronics', 'Electronics', 'Electronics', 'Electronics'],
    'Quantity': [1, 2, None, 1, 2, 1, None, 3, 1, 1],
    'Price': [1000, 600, 1100, 500, 600, 1200, 550, 600, None, 1150],
    'Discount': [0.1, 0.05, None, 0.1, 0.05, 0.1, 0.05, None, 0.1, 0.1],
    'Total': [900, 1140, None, 450, None, 1080, 522.5, 1710, None, None],
    'OrderDate': ['2023-07-01', '2023/07/05', 'July 6, 2023', '2023-07-10', '2023/07/12', '2023-07-15', '2023-07-17', '07/18/2023', '2023-07-20', None],
    'CustomerID': [101, 102, 103, 104, 105, 106, 101, 107, 108, 109]
}

# Creating the DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,OrderID,Product,Category,Quantity,Price,Discount,Total,OrderDate,CustomerID
0,1001.0,Laptop,Electronics,1.0,1000.0,0.1,900.0,2023-07-01,101
1,1002.0,Tablet,Electronics,2.0,600.0,0.05,1140.0,2023/07/05,102
2,1003.0,Laptop,Electronics,,1100.0,,,"July 6, 2023",103
3,1004.0,Phone,Electronics,1.0,500.0,0.1,450.0,2023-07-10,104
4,1005.0,Tablet,Electronics,2.0,600.0,0.05,,2023/07/12,105
5,1006.0,Laptop,,1.0,1200.0,0.1,1080.0,2023-07-15,106
6,1007.0,Phone,Electronics,,550.0,0.05,522.5,2023-07-17,101
7,,Tablet,Electronics,3.0,600.0,,1710.0,07/18/2023,107
8,1009.0,Phone,Electronics,1.0,,0.1,,2023-07-20,108
9,1010.0,Laptop,Electronics,1.0,1150.0,0.1,,,109


**Quick Exploration**

In [3]:
# Basic exploration
print(df.head())         # Display first few rows
print(df.info())         # Column data types and null counts
print(df.describe())     # Summary statistics

   OrderID Product     Category  Quantity   Price  Discount   Total  \
0   1001.0  Laptop  Electronics       1.0  1000.0      0.10   900.0   
1   1002.0  Tablet  Electronics       2.0   600.0      0.05  1140.0   
2   1003.0  Laptop  Electronics       NaN  1100.0       NaN     NaN   
3   1004.0   Phone  Electronics       1.0   500.0      0.10   450.0   
4   1005.0  Tablet  Electronics       2.0   600.0      0.05     NaN   

      OrderDate  CustomerID  
0    2023-07-01         101  
1    2023/07/05         102  
2  July 6, 2023         103  
3    2023-07-10         104  
4    2023/07/12         105  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   OrderID     9 non-null      float64
 1   Product     10 non-null     object 
 2   Category    9 non-null      object 
 3   Quantity    8 non-null      float64
 4   Price       9 non-null      float64
 5   

**Handling Missing Values**

In [4]:
# Fill missing Quantity with median value
df['Quantity'].fillna(df['Quantity'].median(), inplace=True)
df

Unnamed: 0,OrderID,Product,Category,Quantity,Price,Discount,Total,OrderDate,CustomerID
0,1001.0,Laptop,Electronics,1.0,1000.0,0.1,900.0,2023-07-01,101
1,1002.0,Tablet,Electronics,2.0,600.0,0.05,1140.0,2023/07/05,102
2,1003.0,Laptop,Electronics,1.0,1100.0,,,"July 6, 2023",103
3,1004.0,Phone,Electronics,1.0,500.0,0.1,450.0,2023-07-10,104
4,1005.0,Tablet,Electronics,2.0,600.0,0.05,,2023/07/12,105
5,1006.0,Laptop,,1.0,1200.0,0.1,1080.0,2023-07-15,106
6,1007.0,Phone,Electronics,1.0,550.0,0.05,522.5,2023-07-17,101
7,,Tablet,Electronics,3.0,600.0,,1710.0,07/18/2023,107
8,1009.0,Phone,Electronics,1.0,,0.1,,2023-07-20,108
9,1010.0,Laptop,Electronics,1.0,1150.0,0.1,,,109


In [5]:
# Drop rows where essential fields (OrderID, Product, Price) are missing
df.dropna(subset=['OrderID', 'Product', 'Price'], inplace=True)
df

Unnamed: 0,OrderID,Product,Category,Quantity,Price,Discount,Total,OrderDate,CustomerID
0,1001.0,Laptop,Electronics,1.0,1000.0,0.1,900.0,2023-07-01,101
1,1002.0,Tablet,Electronics,2.0,600.0,0.05,1140.0,2023/07/05,102
2,1003.0,Laptop,Electronics,1.0,1100.0,,,"July 6, 2023",103
3,1004.0,Phone,Electronics,1.0,500.0,0.1,450.0,2023-07-10,104
4,1005.0,Tablet,Electronics,2.0,600.0,0.05,,2023/07/12,105
5,1006.0,Laptop,,1.0,1200.0,0.1,1080.0,2023-07-15,106
6,1007.0,Phone,Electronics,1.0,550.0,0.05,522.5,2023-07-17,101
9,1010.0,Laptop,Electronics,1.0,1150.0,0.1,,,109


In [6]:
# Fill Discount with 0 for missing values
df['Discount'].fillna(0, inplace=True)
df

Unnamed: 0,OrderID,Product,Category,Quantity,Price,Discount,Total,OrderDate,CustomerID
0,1001.0,Laptop,Electronics,1.0,1000.0,0.1,900.0,2023-07-01,101
1,1002.0,Tablet,Electronics,2.0,600.0,0.05,1140.0,2023/07/05,102
2,1003.0,Laptop,Electronics,1.0,1100.0,0.0,,"July 6, 2023",103
3,1004.0,Phone,Electronics,1.0,500.0,0.1,450.0,2023-07-10,104
4,1005.0,Tablet,Electronics,2.0,600.0,0.05,,2023/07/12,105
5,1006.0,Laptop,,1.0,1200.0,0.1,1080.0,2023-07-15,106
6,1007.0,Phone,Electronics,1.0,550.0,0.05,522.5,2023-07-17,101
9,1010.0,Laptop,Electronics,1.0,1150.0,0.1,,,109


**Correcting Data Formats**

In [7]:
# Convert OrderDate to a datetime format
df['OrderDate'] = pd.to_datetime(df['OrderDate'], errors='coerce')

In [8]:
# Check for any remaining NaT (not-a-time) values
df['OrderDate'].fillna(method='ffill', inplace=True)
df

Unnamed: 0,OrderID,Product,Category,Quantity,Price,Discount,Total,OrderDate,CustomerID
0,1001.0,Laptop,Electronics,1.0,1000.0,0.1,900.0,2023-07-01,101
1,1002.0,Tablet,Electronics,2.0,600.0,0.05,1140.0,2023-07-01,102
2,1003.0,Laptop,Electronics,1.0,1100.0,0.0,,2023-07-01,103
3,1004.0,Phone,Electronics,1.0,500.0,0.1,450.0,2023-07-10,104
4,1005.0,Tablet,Electronics,2.0,600.0,0.05,,2023-07-10,105
5,1006.0,Laptop,,1.0,1200.0,0.1,1080.0,2023-07-15,106
6,1007.0,Phone,Electronics,1.0,550.0,0.05,522.5,2023-07-17,101
9,1010.0,Laptop,Electronics,1.0,1150.0,0.1,,2023-07-17,109


**Creating Calculated Columns (Enrichment)**

In [9]:
# Calculate total with discount applied
df['Total'] = df['Quantity'] * df['Price'] * (1 - df['Discount'])
df

Unnamed: 0,OrderID,Product,Category,Quantity,Price,Discount,Total,OrderDate,CustomerID
0,1001.0,Laptop,Electronics,1.0,1000.0,0.1,900.0,2023-07-01,101
1,1002.0,Tablet,Electronics,2.0,600.0,0.05,1140.0,2023-07-01,102
2,1003.0,Laptop,Electronics,1.0,1100.0,0.0,1100.0,2023-07-01,103
3,1004.0,Phone,Electronics,1.0,500.0,0.1,450.0,2023-07-10,104
4,1005.0,Tablet,Electronics,2.0,600.0,0.05,1140.0,2023-07-10,105
5,1006.0,Laptop,,1.0,1200.0,0.1,1080.0,2023-07-15,106
6,1007.0,Phone,Electronics,1.0,550.0,0.05,522.5,2023-07-17,101
9,1010.0,Laptop,Electronics,1.0,1150.0,0.1,1035.0,2023-07-17,109


**Validating Data**

In [10]:
# Validate Quantity to ensure no negative values
df = df[df['Quantity'] > 0]

# Validate that Total is calculated correctly
df['Total'] = df['Total'].round(2)
df

Unnamed: 0,OrderID,Product,Category,Quantity,Price,Discount,Total,OrderDate,CustomerID
0,1001.0,Laptop,Electronics,1.0,1000.0,0.1,900.0,2023-07-01,101
1,1002.0,Tablet,Electronics,2.0,600.0,0.05,1140.0,2023-07-01,102
2,1003.0,Laptop,Electronics,1.0,1100.0,0.0,1100.0,2023-07-01,103
3,1004.0,Phone,Electronics,1.0,500.0,0.1,450.0,2023-07-10,104
4,1005.0,Tablet,Electronics,2.0,600.0,0.05,1140.0,2023-07-10,105
5,1006.0,Laptop,,1.0,1200.0,0.1,1080.0,2023-07-15,106
6,1007.0,Phone,Electronics,1.0,550.0,0.05,522.5,2023-07-17,101
9,1010.0,Laptop,Electronics,1.0,1150.0,0.1,1035.0,2023-07-17,109


**Consolidating Data**

In [11]:
# Aggregate total sales by Product and Category
consolidated_df = df.groupby(['Product', 'Category']).agg({
    'Quantity': 'sum',
    'Total': 'sum'
}).reset_index()

consolidated_df

Unnamed: 0,Product,Category,Quantity,Total
0,Laptop,Electronics,3.0,3035.0
1,Phone,Electronics,2.0,972.5
2,Tablet,Electronics,4.0,2280.0


**Documenting Data**

In [12]:
# Adding column descriptions
data_dict = {
    'OrderID': 'Unique ID for each order',
    'Product': 'Name of the product',
    'Category': 'Product category',
    'Quantity': 'Number of items ordered',
    'Price': 'Price per unit of product',
    'Discount': 'Discount applied to the order',
    'Total': 'Total price after discount',
    'OrderDate': 'Date when the order was placed',
    'CustomerID': 'Unique ID for each customer'
}

# Printing data dictionary for reference
for col, desc in data_dict.items():
    print(f"{col}: {desc}")

OrderID: Unique ID for each order
Product: Name of the product
Category: Product category
Quantity: Number of items ordered
Price: Price per unit of product
Discount: Discount applied to the order
Total: Total price after discount
OrderDate: Date when the order was placed
CustomerID: Unique ID for each customer


**Result**
- Final Output: A structured, enriched, and validated DataFrame.
- Documentation: Data dictionary describing each column.
- This project walks through real-world data wrangling steps, preparing the dataset for effective analysis by applying common techniques in data cleaning, enrichment, and validation with Python and Pandas.