# ***Data Cleaning***

Data cleaning and preparation are essential steps in the data analysis pipeline.

The Process of data cleaning involves identifying and handling issues within a dataset that could cause inaccurate or biased analysis results.

This involves handling missing values , removing duplicates , scaling data , and encoding categorical data.

In [1]:
import pandas as pd

In [2]:
 #handling Duplicate values and Outliners 
data = {'A' : [1,2,2,3,5,9] , 
       'B' : [4,5,5,7,18,0]}
df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,2,5
3,3,7
4,5,18
5,9,0


In [3]:
print(df.duplicated())

0    False
1    False
2     True
3    False
4    False
5    False
dtype: bool


In [5]:
df1 = df.drop_duplicates()
df1

Unnamed: 0,A,B
0,1,4
1,2,5
3,3,7
4,5,18
5,9,0


In [6]:
# outliers are extreme values that can skew your result

In [7]:
# Quantile method is used to identify the outliers and clip method to clip the particular clip

In [10]:
q = df['A'].quantile(0.99)
q

8.8

In [11]:
df['A'] = df['A'].clip(lower = None , upper = q)
print(df)

     A   B
0  1.0   4
1  2.0   5
2  2.0   5
3  3.0   7
4  5.0  18
5  8.8   0


In [12]:
 # Handling of Missing values

In [25]:
# isna method is used to find any missing values and fillna is used to fill the missing values with statistical method 
# for Filling = if the data is numerical and contains  outliers then we use median  
# if the data does not contain outliers then we use mean 
# if the data is categorical we use mode 
import numpy as np
data = {'A' : [1,2,3,np.nan ,np.nan] , 
       'B':[5,np.nan ,6,7,np.nan]}
df = pd.DataFrame(data)


In [26]:
print(df.isna())

       A      B
0  False  False
1  False   True
2  False  False
3   True  False
4   True   True


In [27]:
print(df.fillna(value = 0))

     A    B
0  1.0  5.0
1  2.0  0.0
2  3.0  6.0
3  0.0  7.0
4  0.0  0.0


In [29]:
df['A'] = df['A'].fillna(df['A'].mean())
df['B'] = df['B'].fillna(df['B'].mean())
print(df)

     A    B
0  1.0  5.0
1  2.0  6.0
2  3.0  6.0
3  2.0  7.0
4  2.0  6.0


In [31]:
# data normalization and scaling 
# Technique used to transform the numerical data to common scale this is done to reduce the biasness towards higher values .
# Pandas provides us with min and max methods to compute the min and max values 

In [32]:
max_value = df.max()
min_value = df.min()
print(max_value)
print(min_value)

A    3.0
B    7.0
dtype: float64
A    1.0
B    5.0
dtype: float64


In [71]:
# encoding of categorical data  
# categorical data is the data that takes on limited no of values when we use this in analysis then we have to encode this data 
# in pandas we use get_dummies method which one hot encodes the categorical  data (convert 0 and 1)
# and replace that map categorical values ( 0 ,1) with integer number
data = {'A':['Red', 'Blue','Yellow','Pink','white','Purple'], 
       'B' : [4,5,5,7,18,0]}
df = pd.DataFrame(data)
print(df)

        A   B
0     Red   4
1    Blue   5
2  Yellow   5
3    Pink   7
4   white  18
5  Purple   0


In [72]:
df = pd.get_dummies(df ,columns= ['A'])
print(df)

    B  A_Blue  A_Pink  A_Purple  A_Red  A_Yellow  A_white
0   4   False   False     False   True     False    False
1   5    True   False     False  False     False    False
2   5   False   False     False  False      True    False
3   7   False    True     False  False     False    False
4  18   False   False     False  False     False     True
5   0   False   False      True  False     False    False


In [73]:
df.replace({'Red':0,'Blue':1,'Yellow':18,'Pink':37,'white':3,'Purple' :4 })
print(df)

    B  A_Blue  A_Pink  A_Purple  A_Red  A_Yellow  A_white
0   4   False   False     False   True     False    False
1   5    True   False     False  False     False    False
2   5   False   False     False  False      True    False
3   7   False    True     False  False     False    False
4  18   False   False     False  False     False     True
5   0   False   False      True  False     False    False
