# Data cleaning with pandas

In [2]:
import pandas as pd

In [3]:
sales = pd.read_csv('sales_data.csv')

In [4]:
product = pd.read_csv('product_data.csv')

## Explore the data 

In [6]:
sales.head(3)

Unnamed: 0,sale_id,date,store_id,product_id,units_sold,unit_price,sales_rep
0,1,2023-01-01,3,102,15.0,39.0,
1,2,2023-01-02,4,103,12.0,21.27,Charlie
2,3,2023-01-03,1,101,,52.69,Alice


In [7]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sale_id     1000 non-null   int64  
 1   date        1000 non-null   object 
 2   store_id    1000 non-null   int64  
 3   product_id  1000 non-null   int64  
 4   units_sold  900 non-null    float64
 5   unit_price  900 non-null    float64
 6   sales_rep   715 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 54.8+ KB


* fix missing info in unit_sold, unit_price and sales_rep column
* Date column needs to be standardized

In [9]:
# how many stores present 
sales['store_id'].unique()

array([3, 4, 1, 2], dtype=int64)

In [10]:
# how many products present 
sales['product_id'].unique()

array([102, 103, 101, 104], dtype=int64)

In [11]:
sales['sales_rep'].unique()

array([nan, 'Charlie', 'Alice', 'Bob', 'David'], dtype=object)

In [12]:
sales['date'].min()

'2023-01-01'

In [13]:
sales['date'].max()

'2025-09-26'

In [14]:
print(f"The dataset contains sales from '{sales['date'].min()}' to '{sales['date'].min()}.")

print(f'The dataset contains {
    len(sales['product_id'].unique())} products, {
    len(sales['store_id'].unique())} stores and {
    len(sales['sales_rep'].unique())} sales rep.')

The dataset contains sales from '2023-01-01' to '2023-01-01.
The dataset contains 4 products, 4 stores and 5 sales rep.


In [42]:
# check for duplicates 

duplicate_ids =sales['sale_id'].value_counts()
for id, counts in duplicate_ids.items():
    if counts > 1:
        print(id)

* No id was printed: no duplicates in the dataset

In [16]:
list(sales.columns)

['sale_id',
 'date',
 'store_id',
 'product_id',
 'units_sold',
 'unit_price',
 'sales_rep']

## Data Cleaning

In [74]:
# format/standardize date column 
sales['date']= pd.to_datetime(sales['date'])
sales.dtypes

sale_id                int64
date          datetime64[ns]
store_id               int64
product_id             int64
units_sold           float64
unit_price           float64
sales_rep             object
dtype: object

In [62]:
# Drop missng values (nulls) across rows: units_sold, unit_price, sales_rep 
sales.dropna(subset= ['units_sold', 'unit_price', 'sales_rep'], axis=0, inplace=True)

## Data Transformation

In [68]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 578 entries, 1 to 999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   sale_id     578 non-null    int64         
 1   date        578 non-null    datetime64[ns]
 2   store_id    578 non-null    int64         
 3   product_id  578 non-null    int64         
 4   units_sold  578 non-null    float64       
 5   unit_price  578 non-null    float64       
 6   sales_rep   578 non-null    object        
dtypes: datetime64[ns](1), float64(2), int64(3), object(1)
memory usage: 36.1+ KB
