#Data Cleaning

**How find missing value**

In [5]:
import pandas as pd
import numpy as np

In [6]:
data = {
    "Name": ["John", "Anna", "Mike", "Sara", "Tom"],
    "Age": [28, np.nan, 35, 29, np.nan],
    "Salary": [50000, 60000, np.nan, 58000, 70000]
}
df = pd.DataFrame(data)

df

Unnamed: 0,Name,Age,Salary
0,John,28.0,50000.0
1,Anna,,60000.0
2,Mike,35.0,
3,Sara,29.0,58000.0
4,Tom,,70000.0


In [7]:
df.isnull().sum()

Unnamed: 0,0
Name,0
Age,2
Salary,1


**Method 1: Remove Missing Data**

In [None]:

df_drop=df.dropna()
df_drop

Unnamed: 0,Name,Age,Salary
0,John,28.0,50000.0
3,Sara,29.0,58000.0


**Method 2: Replace with Constant**

In [None]:

df_const = df.fillna(0)
df_const

Unnamed: 0,Name,Age,Salary
0,John,28.0,50000.0
1,Anna,0.0,60000.0
2,Mike,35.0,0.0
3,Sara,29.0,58000.0
4,Tom,0.0,70000.0


**Method 3: Replace with Statistical Values**

In [None]:


df_stat = df.copy()
df_stat['Age'].fillna(df_stat['Age'].mean(), inplace=True)
df_stat['Salary'].fillna(df_stat['Salary'].median(), inplace=True)

df_stat

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_stat['Age'].fillna(df_stat['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_stat['Salary'].fillna(df_stat['Salary'].median(), inplace=True)


Unnamed: 0,Name,Age,Salary
0,John,28.0,50000.0
1,Anna,30.666667,60000.0
2,Mike,35.0,59000.0
3,Sara,29.0,58000.0
4,Tom,30.666667,70000.0


**Method 4 & 5 Forward / Backward Fill**

In [None]:

df_ffill = df.fillna(method="ffill")
df_ffill

  df_ffill = df.fillna(method="ffill")


Unnamed: 0,Name,Age,Salary
0,John,28.0,50000.0
1,Anna,28.0,60000.0
2,Mike,35.0,60000.0
3,Sara,29.0,58000.0
4,Tom,29.0,70000.0


In [4]:
#bacward fill
df_bfill = df.fillna(method="bfill")
df_bfill

  df_bfill = df.fillna(method="bfill")


Unnamed: 0,Name,Age,Salary
0,John,28.0,50000.0
1,Anna,35.0,60000.0
2,Mike,35.0,58000.0
3,Sara,29.0,58000.0
4,Tom,,70000.0


**Method 6 : Interpolation**

In [None]:

df_interp = df.interpolate()
df_interp

  df_interp = df.interpolate()


Unnamed: 0,Name,Age,Salary
0,John,28.0,50000.0
1,Anna,31.5,60000.0
2,Mike,35.0,59000.0
3,Sara,29.0,58000.0
4,Tom,29.0,70000.0


# Duplicacy Handle

**How to Check for Duplicates**




In [9]:
data = {
    "Name": ["John", "Anna", "Mike", "Sara", "John"],
    "Age": [28, 22, 35, 29, 28],
    "City": ["Dhaka", "Chittagong", "Dhaka", "Khulna", "Dhaka"]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,John,28,Dhaka
1,Anna,22,Chittagong
2,Mike,35,Dhaka
3,Sara,29,Khulna
4,John,28,Dhaka


In [None]:
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,True


In [None]:
print(df.duplicated().sum())

1


**Remove All Duplicate Rows**

In [None]:
df_no_dup = df.drop_duplicates()
df_no_dup

Unnamed: 0,Name,Age,City
0,John,28,Dhaka
1,Anna,22,Chittagong
2,Mike,35,Dhaka
3,Sara,29,Khulna


**Keep Last Occurrence Instead of First**

In [None]:
df_keep_last = df.drop_duplicates(keep="last")
df_keep_last

Unnamed: 0,Name,Age,City
1,Anna,22,Chittagong
2,Mike,35,Dhaka
3,Sara,29,Khulna
4,John,28,Dhaka


**Mark Duplicates Without Removing**

In [10]:
df["duplicate"] = df.duplicated()
df

Unnamed: 0,Name,Age,City,duplicate
0,John,28,Dhaka,False
1,Anna,22,Chittagong,False
2,Mike,35,Dhaka,False
3,Sara,29,Khulna,False
4,John,28,Dhaka,True
