In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Toy employee table used throughout the notebook. It deliberately contains
# one missing value in each of Age, Dept and Gender, plus two repeated
# employees (Arpit and Priya), so that null handling and de-duplication can
# be demonstrated on it.
data = dict(
    Emp_ID=[125, 127, 128, 130, 131, 134, 128, 135, 131, 137],
    Name=[
        'Aman', 'Vipin', 'Arpit', 'Vidya', 'Priya',
        'Ankit', 'Arpit', 'Sneha', 'Priya', 'Hitesh',
    ],
    Age=[32, 30, 34, 35, 28, 39, 34, np.nan, 28, 27],
    Dept=['IT', np.nan, 'FIN', 'INV', 'LOG', 'IT', 'FIN', 'HR', 'LOG', 'IT'],
    Gender=[np.nan, 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'M'],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Emp_ID,Name,Age,Dept,Gender
0,125,Aman,32.0,IT,
1,127,Vipin,30.0,,M
2,128,Arpit,34.0,FIN,M
3,130,Vidya,35.0,INV,M
4,131,Priya,28.0,LOG,F
5,134,Ankit,39.0,IT,F
6,128,Arpit,34.0,FIN,M
7,135,Sneha,,HR,F
8,131,Priya,28.0,LOG,F
9,137,Hitesh,27.0,IT,M


### Copies Of DataFrame:


In [3]:
# Take two independent copies of the raw frame so that later cleaning of `df`
# does not affect them.
df1, df2 = df.copy(), df.copy()

### Detecting null values:

In [4]:
# Number of missing values in each column.
df.isna().sum()

Emp_ID    0
Name      0
Age       1
Dept      1
Gender    1
dtype: int64

### Filling Null values: 

In [5]:
# Frequency of each age, most common first; NaN is not counted.
df['Age'].value_counts(sort=True, dropna=True)

34.0    2
28.0    2
32.0    1
30.0    1
35.0    1
39.0    1
27.0    1
Name: Age, dtype: int64

In [6]:
# 'Age' is numerical, so fill its missing value with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Age']: that form is chained assignment,
# which raises a FutureWarning on pandas 2.x and no longer updates `df`
# under Copy-on-Write (pandas 3.0).
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [7]:
# Re-check the missing-value counts after filling 'Age'.
df.isna().sum()

Emp_ID    0
Name      0
Age       0
Dept      1
Gender    1
dtype: int64

In [8]:
# 'Dept' is a categorical column, so its missing value will be filled with the
# mode (the most frequent category). Inspect the category frequencies first.
df['Dept'].value_counts(sort=True, dropna=True)

IT     3
FIN    2
LOG    2
INV    1
HR     1
Name: Dept, dtype: int64

In [9]:
# 'Dept' is categorical, so fill its missing value with the most frequent
# category. The mode is computed from the data (it is 'IT' here) instead of
# being hard-coded; if several categories tie, the first one returned by
# mode() is used.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Dept']: that form is chained assignment,
# which raises a FutureWarning on pandas 2.x and no longer updates `df`
# under Copy-on-Write (pandas 3.0).
df['Dept'] = df['Dept'].fillna(df['Dept'].mode().iloc[0])

In [10]:
# Re-check the missing-value counts after filling 'Dept'.
df.isna().sum()

Emp_ID    0
Name      0
Age       0
Dept      0
Gender    1
dtype: int64

In [11]:
# Frequency of each gender label, most common first; NaN is not counted.
df['Gender'].value_counts(sort=True, dropna=True)

M    5
F    4
Name: Gender, dtype: int64

In [12]:
# 'Gender' is categorical, so fill its missing value with the most frequent
# label. The mode is computed from the data (it is 'M' here) instead of being
# hard-coded; if several labels tie, the first one returned by mode() is used.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Gender']: that form is chained assignment,
# which raises a FutureWarning on pandas 2.x and no longer updates `df`
# under Copy-on-Write (pandas 3.0).
df['Gender'] = df['Gender'].fillna(df['Gender'].mode().iloc[0])


In [13]:
# Confirm that no missing values remain in any column.
df.isna().sum()

Emp_ID    0
Name      0
Age       0
Dept      0
Gender    0
dtype: int64

### How to Drop a Column:
1. Drop a column that is not needed for the analysis:

In [20]:
# The recorded outputs for df1 (here and in the SimpleImputer section) have no
# 'Emp_ID' column, but no other cell removes it from df1, so a fresh
# Restart & Run All would not reproduce them. Drop it explicitly here.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
df1 = df1.drop(columns='Emp_ID', errors='ignore')
df1.head()

Unnamed: 0,Name,Age,Dept,Gender
0,Aman,32.0,IT,
1,Vipin,30.0,,M
2,Arpit,34.0,FIN,M
3,Vidya,35.0,INV,M
4,Priya,28.0,LOG,F


In [21]:
# Remove the identifier column from `df` in place; it is not used below.
df.drop(columns=['Emp_ID'], inplace=True)

In [22]:
# (rows, columns) of `df` after dropping 'Emp_ID'.
df.shape

(10, 4)

In [26]:
# Remaining column labels of `df`.
df.columns

Index(['Name', 'Age', 'Dept', 'Gender'], dtype='object')

### Using SimpleImputer to fill null values:

In [27]:
# df1 is the untouched copy, so it still has its missing values.
df1.isna().sum()

Name      0
Age       1
Dept      1
Gender    1
dtype: int64

In [30]:
from sklearn.impute import SimpleImputer

# One imputer per column type:
#   numerical columns   -> replace NaN with the column mean
#   categorical columns -> replace NaN with the most frequent value
simple_numerical_cols = SimpleImputer(strategy='mean', missing_values=np.nan)
simple_categorical_cols = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

In [34]:
# Fit the mean imputer on 'Age' and get the filled values back as a 2-D array.
# The column is selected as a one-column DataFrame, not a Series.
age_imp = simple_numerical_cols.fit_transform(df1.loc[:, ['Age']])
age_imp

array([[32.        ],
       [30.        ],
       [34.        ],
       [35.        ],
       [28.        ],
       [39.        ],
       [34.        ],
       [31.88888889],
       [28.        ],
       [27.        ]])

In [38]:
# Fit the most-frequent imputer on 'Dept' and get the filled values back as a
# 2-D array. The column is selected as a one-column DataFrame, not a Series.
dept_imp = simple_categorical_cols.fit_transform(df1.loc[:, ['Dept']])
dept_imp

array([['IT'],
       ['IT'],
       ['FIN'],
       ['INV'],
       ['LOG'],
       ['IT'],
       ['FIN'],
       ['HR'],
       ['LOG'],
       ['IT']], dtype=object)

In [40]:
# Show the 'Dept' column of df1. The imputed values were stored in `dept_imp`
# and not assigned back, so df1 itself still contains the NaN.
df1['Dept']

0     IT
1    NaN
2    FIN
3    INV
4    LOG
5     IT
6    FIN
7     HR
8    LOG
9     IT
Name: Dept, dtype: object

In [41]:
# Re-fit the same most-frequent imputer, this time on 'Gender', and get the
# filled values back as a 2-D array.
gender_imp = simple_categorical_cols.fit_transform(df1.loc[:, ['Gender']])
gender_imp

array([['M'],
       ['M'],
       ['M'],
       ['M'],
       ['F'],
       ['F'],
       ['M'],
       ['F'],
       ['F'],
       ['M']], dtype=object)

In [42]:
# Show the 'Gender' column of df1. The imputed values were stored in
# `gender_imp` and not assigned back, so df1 itself still contains the NaN.
df1['Gender']

0    NaN
1      M
2      M
3      M
4      F
5      F
6      M
7      F
8      F
9      M
Name: Gender, dtype: object

### Handling Duplicates:

In [54]:
# Display the cleaned frame (nulls filled, 'Emp_ID' dropped) before removing
# duplicate rows.
df

Unnamed: 0,Name,Age,Dept,Gender
0,Aman,32.0,IT,M
1,Vipin,30.0,IT,M
2,Arpit,34.0,FIN,M
3,Vidya,35.0,INV,M
4,Priya,28.0,LOG,F
5,Ankit,39.0,IT,F
6,Arpit,34.0,FIN,M
7,Sneha,31.888889,HR,F
8,Priya,28.0,LOG,F
9,Hitesh,27.0,IT,M


In [55]:
# Count rows that repeat an earlier row exactly; the first occurrence of each
# row is not flagged.
df.duplicated(keep='first').sum()

2

In [56]:
# (rows, columns) of `df` before removing duplicates.
df.shape

(10, 4)

### How to remove duplicates

In [60]:
# Remove repeated rows from `df` in place, keeping the first occurrence of
# each. The original index labels are kept, so the index will have gaps.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [61]:
# Display the frame after duplicate removal.
df

Unnamed: 0,Name,Age,Dept,Gender
0,Aman,32.0,IT,M
1,Vipin,30.0,IT,M
2,Arpit,34.0,FIN,M
3,Vidya,35.0,INV,M
4,Priya,28.0,LOG,F
5,Ankit,39.0,IT,F
7,Sneha,31.888889,HR,F
9,Hitesh,27.0,IT,M


In [62]:
# (rows, columns) of `df` after removing duplicates.
df.shape

(8, 4)