# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

# Toy employee table used by every cell below. It deliberately contains the
# problems this notebook cleans up: a missing value in each column, two 'Bob'
# rows that differ only in age, and salaries/dates stored as strings.
df = pd.DataFrame(
    data=dict(
        name=['Alice', 'Bob', 'Bob', 'Charlie', None],
        age=[25, np.nan, 30, 35, 40],
        salary=['50000', '60000', '60000', None, '70000'],
        join_date=['2021-01-01', '2021-02-15', '2021-02-15', None, '2022-03-10'],
    ),
    columns=['name', 'age', 'salary', 'join_date'],
)

df

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000.0,2021-01-01
1,Bob,,60000.0,2021-02-15
2,Bob,30.0,60000.0,2021-02-15
3,Charlie,35.0,,
4,,40.0,70000.0,2022-03-10


## Missing data

In [2]:
# Element-wise mask: True wherever a cell is missing (None / NaN).
df.isna()

Unnamed: 0,name,age,salary,join_date
0,False,False,False,False
1,False,True,False,False
2,False,False,False,False
3,False,False,True,True
4,True,False,False,False


In [3]:
# Inverse mask: True wherever a cell holds an actual value.
df.notna()

Unnamed: 0,name,age,salary,join_date
0,True,True,True,True
1,True,False,True,True
2,True,True,True,True
3,True,True,False,False
4,False,True,True,True


In [4]:
# Keep only fully populated rows: a row is dropped as soon as any one of its
# columns is missing. Returns a new frame; `df` itself is unchanged.
df.dropna(axis='index', how='any')

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000,2021-01-01
2,Bob,30.0,60000,2021-02-15


In [5]:
# Only the salary column decides whether a row survives; gaps in the other
# columns (age on row 1, name on row 4) are tolerated.
df[df['salary'].notna()]

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000,2021-01-01
1,Bob,,60000,2021-02-15
2,Bob,30.0,60000,2021-02-15
4,,40.0,70000,2022-03-10


In [6]:
# Blanket fill: every missing cell, whatever its column, becomes 0.
# Note this also writes 0 into the text columns (name, join_date).
df.fillna(value=0)

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000,2021-01-01
1,Bob,0.0,60000,2021-02-15
2,Bob,30.0,60000,2021-02-15
3,Charlie,35.0,0,0
4,0,40.0,70000,2022-03-10


In [7]:
# Per-column fill values: the mean of the known ages for `age`, and the
# string '0' for `salary` (the column still holds strings at this point).
# Columns not listed here (name, join_date) keep their missing values.
df.fillna(value=dict(age=df['age'].mean(), salary='0'))

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000,2021-01-01
1,Bob,32.5,60000,2021-02-15
2,Bob,30.0,60000,2021-02-15
3,Charlie,35.0,0,
4,,40.0,70000,2022-03-10


In [8]:
# Forward fill: each gap takes the most recent value above it in the same
# column (e.g. row 4's missing name becomes 'Charlie').
df.ffill(axis='index')

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000,2021-01-01
1,Bob,25.0,60000,2021-02-15
2,Bob,30.0,60000,2021-02-15
3,Charlie,35.0,60000,2021-02-15
4,Charlie,40.0,70000,2022-03-10


In [9]:
# Backward fill: each gap takes the next value below it in the same column.
# Row 4's name stays missing because there is no later row to borrow from.
df.bfill(axis='index')

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000,2021-01-01
1,Bob,30.0,60000,2021-02-15
2,Bob,30.0,60000,2021-02-15
3,Charlie,35.0,70000,2022-03-10
4,,40.0,70000,2022-03-10


## Duplicates

In [10]:
# Flag rows that repeat an earlier row across *all* columns. The two 'Bob'
# rows differ in age (NaN vs 30), so nothing is flagged.
df.duplicated(subset=None, keep='first')

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [11]:
# Compare on name + join_date only: the second 'Bob' row (index 2) now counts
# as a repeat of the first.
df.duplicated(subset=['name', 'join_date'], keep='first')

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [12]:
# Remove rows that are exact copies of an earlier row. No two rows match on
# every column here, so all five rows come back.
df.loc[~df.duplicated()]

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000.0,2021-01-01
1,Bob,,60000.0,2021-02-15
2,Bob,30.0,60000.0,2021-02-15
3,Charlie,35.0,,
4,,40.0,70000.0,2022-03-10


In [13]:
# De-duplicate on name, keeping the first occurrence: of the two 'Bob' rows,
# index 1 is kept and index 2 is discarded.
df.loc[~df.duplicated(subset=['name'], keep='first')]

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000.0,2021-01-01
1,Bob,,60000.0,2021-02-15
3,Charlie,35.0,,
4,,40.0,70000.0,2022-03-10


In [14]:
# De-duplicate on name, keeping the last occurrence: of the two 'Bob' rows,
# index 2 is kept and index 1 is discarded.
df.loc[~df.duplicated(subset=['name'], keep='last')]

Unnamed: 0,name,age,salary,join_date
0,Alice,25.0,50000.0,2021-01-01
2,Bob,30.0,60000.0,2021-02-15
3,Charlie,35.0,,
4,,40.0,70000.0,2022-03-10


## Data types

In [15]:
# Inspect the column dtypes before converting anything: salary and join_date
# are still generic `object` columns because they were created from strings.
df.dtypes

name          object
age          float64
salary        object
join_date     object
dtype: object

In [16]:
# Option 1: a plain astype() cast. It is applied to a throwaway copy via
# assign() instead of being written back into `df`; otherwise salary would
# already be float64 when the pd.to_numeric() cell that follows runs, and
# that cell would have nothing left to convert.
# The dtypes shown are those of the converted copy (salary -> float64).
df.assign(salary=df['salary'].astype('float')).dtypes

name          object
age          float64
salary       float64
join_date     object
dtype: object

In [17]:
# Store salary as numbers. errors='coerce' turns anything that cannot be
# parsed into NaN instead of raising, so the missing entry on row 3 simply
# stays missing.
df['salary'] = pd.to_numeric(arg=df['salary'], errors='coerce')
df.loc[:, 'salary']

0    50000.0
1    60000.0
2    60000.0
3        NaN
4    70000.0
Name: salary, dtype: float64

In [18]:
# Parse the date strings into real timestamps. With errors='coerce' an
# unparseable or missing entry becomes NaT (row 3) rather than an exception.
df['join_date'] = pd.to_datetime(arg=df['join_date'], errors='coerce')
df.loc[:, 'join_date']

0   2021-01-01
1   2021-02-15
2   2021-02-15
3          NaT
4   2022-03-10
Name: join_date, dtype: datetime64[ns]

In [19]:
# Re-encode the repeated name strings as a categorical column, then confirm
# the dtype of every column after the conversions above.
df['name'] = df['name'].astype(pd.CategoricalDtype())
df.dtypes

name               category
age                 float64
salary              float64
join_date    datetime64[ns]
dtype: object

In [20]:
# List the distinct labels behind the categorical column. The missing name on
# row 4 is not a category of its own, so only three labels appear.
df['name'].dtype.categories

Index(['Alice', 'Bob', 'Charlie'], dtype='object')