In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df=pd.read_csv('preprocessing data.csv')
df.head(5)

Unnamed: 0,A,B,C,D,E,F,G,H
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male
1,32.0,5000.0,2.1,26.0,,Blue,No,Male
2,,6000.0,,125.0,,Black,Yes,
3,54.0,70000.0,,140.0,,Red,,
4,64.0,6500.0,3.6,24.0,574.0,,No,Male


In [3]:
df.isnull().sum()

A    3
B    3
C    4
D    3
E    4
F    2
G    4
H    4
dtype: int64

In [4]:
df['A'].mean()

42.285714285714285

## Handling Missing Values

In [5]:
# Imputing mean for missing values

# df['A'].fillna(df['A'].mean())   #this fills the original column by the mean value

In [6]:
#Imputing median for missing values in column A and creating a new column for it
df['A'+'_median'] = df['A'].fillna(df['A'].median())
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_median
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0
1,32.0,5000.0,2.1,26.0,,Blue,No,Male,32.0
2,,6000.0,,125.0,,Black,Yes,,46.0
3,54.0,70000.0,,140.0,,Red,,,54.0
4,64.0,6500.0,3.6,24.0,574.0,,No,Male,64.0
5,46.0,,2.5,,556.0,Red,,Female,46.0
6,,,,,545.0,Blue,,Male,46.0
7,53.0,4500.0,,,586.0,Red,Yes,,53.0
8,,,4.2,26.0,,Black,Yes,Male,46.0
9,24.0,3200.0,2.3,25.0,546.0,,,,24.0


In [7]:
#Imputing column B using random sample values:

b = df['B'].isnull().sum()
b

3

In [8]:
#sample function to randomly select 3 values from the column B , afterremoving null values
#random_state: reproducing the same result, you can assign any integer to it.

# When we do not mention random_state , the sample function will select any three values randomly from the column
# Everytime you run the program the values chosen by sample function will CHANGE
# This will generate different results everytime
# to avoide this we mention the random_state as a fixed number, so that everytime when the program is executed
# we get the same result. This is called 'Reproducibility of the result'


# In the below code, null values are dropped, from the rest of the value 3 values are randomly chosen

random_sample=df['B'].dropna().sample(b, random_state = 0) 
random_sample

#Try using a different random state ( 5 , 50, 500) and re-run the kernel , observe the sample chosen
# try using no random_state, re-run the code 2 to 3 times and observe the same chosen


9    3200.0
2    6000.0
1    5000.0
Name: B, dtype: float64

In [9]:
# The index of those 3 values is matched with null values in original dataframe

random_sample.index = df[df['B'].isnull()].index
random_sample

5    3200.0
6    6000.0
8    5000.0
Name: B, dtype: float64

In [10]:
# Now finally in those index values , the sample values so chosen are filled
df['B']=df['B'].fillna(random_sample)
df['B']

0     8000.0
1     5000.0
2     6000.0
3    70000.0
4     6500.0
5     3200.0
6     6000.0
7     4500.0
8     5000.0
9     3200.0
Name: B, dtype: float64

In [11]:
# Capturing the (missingness) NaN values with a new feature:
# get_dummies (1 for missing value and 0 for filled value) (Assignment)

df['C_NAN'] = np.where(df['C'].isnull(),1,0)
df.head(5)

Unnamed: 0,A,B,C,D,E,F,G,H,A_median,C_NAN
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,0
1,32.0,5000.0,2.1,26.0,,Blue,No,Male,32.0,0
2,,6000.0,,125.0,,Black,Yes,,46.0,1
3,54.0,70000.0,,140.0,,Red,,,54.0,1
4,64.0,6500.0,3.6,24.0,574.0,,No,Male,64.0,0


In [12]:
x = pd.get_dummies(df["C"])
x

Unnamed: 0,1.1,2.1,2.3,2.5,3.6,4.2
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,1,0
5,0,0,0,1,0,0
6,0,0,0,0,0,0
7,0,0,0,0,0,0
8,0,0,0,0,0,1
9,0,0,1,0,0,0


In [13]:
# Filling with end of distribution

df['D']=df['D'].fillna(df.D.mean() + 3*df.D.std())  #df['D']  ~ df.D
df['D']

0     23.000000
1     26.000000
2    125.000000
3    140.000000
4     24.000000
5    213.794598
6    213.794598
7    213.794598
8     26.000000
9     25.000000
Name: D, dtype: float64

In [14]:
#Arbitrary value imputation
df['E']=df['E'].fillna(100)
df['E']

0    555.0
1    100.0
2    100.0
3    100.0
4    574.0
5    556.0
6    545.0
7    586.0
8    100.0
9    546.0
Name: E, dtype: float64

In [15]:
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_median,C_NAN
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,0
1,32.0,5000.0,2.1,26.0,100.0,Blue,No,Male,32.0,0
2,,6000.0,,125.0,100.0,Black,Yes,,46.0,1
3,54.0,70000.0,,140.0,100.0,Red,,,54.0,1
4,64.0,6500.0,3.6,24.0,574.0,,No,Male,64.0,0
5,46.0,3200.0,2.5,213.794598,556.0,Red,,Female,46.0,0
6,,6000.0,,213.794598,545.0,Blue,,Male,46.0,1
7,53.0,4500.0,,213.794598,586.0,Red,Yes,,53.0,1
8,,5000.0,4.2,26.0,100.0,Black,Yes,Male,46.0,0
9,24.0,3200.0,2.3,25.0,546.0,,,,24.0,0


In [16]:
df["F"].value_counts()

Red      4
Blue     2
Black    2
Name: F, dtype: int64

In [17]:
df["F"].value_counts().sort_values()

Blue     2
Black    2
Red      4
Name: F, dtype: int64

In [18]:
#Frequent category imputation for categorical column (mode)

df['F'].value_counts().sort_values(ascending=False).index[0]
# df['F'].value_counts().sort_values(ascending=False).index[0]

'Red'

In [19]:
df['F']=df['F'].fillna((df['F'].value_counts().sort_values(ascending=False).index[0]))
df['F']

0      Red
1     Blue
2    Black
3      Red
4      Red
5      Red
6     Blue
7      Red
8    Black
9      Red
Name: F, dtype: object

In [20]:
# Treat NaN as a new category
df['G']=df['G'].fillna('missing')
df['G']

0       Yes 
1         No
2       Yes 
3    missing
4         No
5    missing
6    missing
7       Yes 
8       Yes 
9    missing
Name: G, dtype: object

## Detecting Outliers

In [21]:
tips_df=sns.load_dataset('tips')
tips_df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [22]:
##IQR method of detecting outliers

def iqr_func(data):
    Q1 = np.percentile(data, 25, interpolation = 'midpoint') 
    Q3 = np.percentile(data, 75, interpolation = 'midpoint')
    IQR=Q3-Q1
    lo=Q1-1.5*IQR
    uo=Q3+1.5*IQR
    outlier=[]
    for x in data:
        if ((x> uo) or (x<lo)):
            outlier.append(x)
            
    print('outlier in the dataset are:', outlier )
    
iqr_func(tips_df['total_bill'])

outlier in the dataset are: [48.27, 44.3, 41.19, 48.17, 50.81, 45.35, 40.55, 43.11, 48.33]


In [23]:
## Z score method of detecting outliers

def norm_func(data):
    outlier=[]
    mean=np.mean(data)
    std=np.std(data)
    for x in data:
        z=(x-mean)/std
        if z>3 or z<-3:
            outlier.append(z)
    
    
    print('outlier in the dataset are: ', outlier)
    final = [i*std+mean for i in outlier]
    print(final)        
        
norm_func(tips_df['total_bill'])

outlier in the dataset are:  [3.2061655335197283, 3.194909533396294, 3.492067936654957, 3.2129191335937883]
[48.27, 48.17, 50.81, 48.33]


## Feature Scaling

In [24]:
#Maximum Absolute Scaling

from sklearn.preprocessing import MaxAbsScaler
mas=MaxAbsScaler()    #instatiating the class (creating the object mas to be used for this dataframe)
column = ['A','B','C','D','E']
mas.fit_transform(df[column])    #fit and transform ()

array([[0.359375  , 0.11428571, 0.26190476, 0.10757989, 0.94709898],
       [0.5       , 0.07142857, 0.5       , 0.12161205, 0.17064846],
       [       nan, 0.08571429,        nan, 0.58467333, 0.17064846],
       [0.84375   , 1.        ,        nan, 0.65483413, 0.17064846],
       [1.        , 0.09285714, 0.85714286, 0.11225728, 0.97952218],
       [0.71875   , 0.04571429, 0.5952381 , 1.        , 0.94880546],
       [       nan, 0.08571429,        nan, 1.        , 0.93003413],
       [0.828125  , 0.06428571,        nan, 1.        , 1.        ],
       [       nan, 0.07142857, 1.        , 0.12161205, 0.17064846],
       [0.375     , 0.04571429, 0.54761905, 0.11693467, 0.93174061]])

In [25]:
# Min Max Scaler

from sklearn.preprocessing import MinMaxScaler
mms=MinMaxScaler()
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
mms.fit_transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [26]:
## Standardization (most commonly used)   these values are between -3 and 3 , it used the Z score formula
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
ss.fit_transform(data)


array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [27]:
## Normalization

from sklearn.preprocessing import normalize
normalize(data)

array([[-0.4472136 ,  0.89442719],
       [-0.08304548,  0.99654576],
       [ 0.        ,  1.        ],
       [ 0.05547002,  0.99846035]])

In [28]:
## Robust scaling

from sklearn.preprocessing import RobustScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
rs=RobustScaler()
rs.fit_transform(data)


array([[-0.85714286, -0.85714286],
       [-0.28571429, -0.28571429],
       [ 0.28571429,  0.28571429],
       [ 1.42857143,  1.42857143]])

## Encoding the categorical variables

In [29]:
df.head(5)

Unnamed: 0,A,B,C,D,E,F,G,H,A_median,C_NAN
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,0
1,32.0,5000.0,2.1,26.0,100.0,Blue,No,Male,32.0,0
2,,6000.0,,125.0,100.0,Black,Yes,,46.0,1
3,54.0,70000.0,,140.0,100.0,Red,missing,,54.0,1
4,64.0,6500.0,3.6,24.0,574.0,Red,No,Male,64.0,0


In [30]:
# It creates a column for each class of that feature. These columns are filled with 0 and 1.
# This is exactly what OneHotEncoding also does

pd.get_dummies(df['F'])

# pd.get_dummies(df,columns=['F','G','H'])

Unnamed: 0,Black,Blue,Red
0,0,0,1
1,0,1,0
2,1,0,0
3,0,0,1
4,0,0,1
5,0,0,1
6,0,1,0
7,0,0,1
8,1,0,0
9,0,0,1


In [43]:
df.G.values.reshape(-1,1)

array([['Yes '],
       ['No'],
       ['Yes '],
       ['missing'],
       ['No'],
       ['missing'],
       ['missing'],
       ['Yes '],
       ['Yes '],
       ['missing']], dtype=object)

In [40]:
# Using OneHotEncoder

from sklearn.preprocessing import OneHotEncoder   #importing
ohe = OneHotEncoder()    #instatiating by dropping one column of the final array

final=ohe.fit_transform(df.G.values.reshape(-1, 1)).toarray()  #fitting and transforming categories using OneHotEncoder
final

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [32]:
#Adding the arrays to the original dataframe

df['No']=final[:,0]
df['Yes']=final[:,1]
df['Missing']=final[:,2]
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_median,C_NAN,No,Yes,Missing
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,0,0.0,1.0,0.0
1,32.0,5000.0,2.1,26.0,100.0,Blue,No,Male,32.0,0,1.0,0.0,0.0
2,,6000.0,,125.0,100.0,Black,Yes,,46.0,1,0.0,1.0,0.0
3,54.0,70000.0,,140.0,100.0,Red,missing,,54.0,1,0.0,0.0,1.0
4,64.0,6500.0,3.6,24.0,574.0,Red,No,Male,64.0,0,1.0,0.0,0.0
5,46.0,3200.0,2.5,213.794598,556.0,Red,missing,Female,46.0,0,0.0,0.0,1.0
6,,6000.0,,213.794598,545.0,Blue,missing,Male,46.0,1,0.0,0.0,1.0
7,53.0,4500.0,,213.794598,586.0,Red,Yes,,53.0,1,0.0,1.0,0.0
8,,5000.0,4.2,26.0,100.0,Black,Yes,Male,46.0,0,0.0,1.0,0.0
9,24.0,3200.0,2.3,25.0,546.0,Red,missing,,24.0,0,0.0,0.0,1.0


In [33]:
#Using LabelEncoder

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
le_H=le.fit_transform(df['H'])
df['le_H']=le_H
df

Unnamed: 0,A,B,C,D,E,F,G,H,A_median,C_NAN,No,Yes,Missing,le_H
0,23.0,8000.0,1.1,23.0,555.0,Red,Yes,Male,23.0,0,0.0,1.0,0.0,1
1,32.0,5000.0,2.1,26.0,100.0,Blue,No,Male,32.0,0,1.0,0.0,0.0,1
2,,6000.0,,125.0,100.0,Black,Yes,,46.0,1,0.0,1.0,0.0,2
3,54.0,70000.0,,140.0,100.0,Red,missing,,54.0,1,0.0,0.0,1.0,2
4,64.0,6500.0,3.6,24.0,574.0,Red,No,Male,64.0,0,1.0,0.0,0.0,1
5,46.0,3200.0,2.5,213.794598,556.0,Red,missing,Female,46.0,0,0.0,0.0,1.0,0
6,,6000.0,,213.794598,545.0,Blue,missing,Male,46.0,1,0.0,0.0,1.0,1
7,53.0,4500.0,,213.794598,586.0,Red,Yes,,53.0,1,0.0,1.0,0.0,2
8,,5000.0,4.2,26.0,100.0,Black,Yes,Male,46.0,0,0.0,1.0,0.0,1
9,24.0,3200.0,2.3,25.0,546.0,Red,missing,,24.0,0,0.0,0.0,1.0,2
