In [1]:
####

In [2]:
import pandas as pd
import numpy as np

<h4 class="text-center"> Nominal Categorical </h4>

<span class="badge">LabelBinarizer </span>

In [3]:
# Sample nominal feature: five country codes arranged as a (5, 1) column vector
data_one = np.array([[code] for code in ('USA', 'UK', 'RU', 'IND', 'BD')])

In [4]:
# Echo the array to confirm it is a single column of string labels
data_one

array([['USA'],
       ['UK'],
       ['RU'],
       ['IND'],
       ['BD']], dtype='<U3')

In [5]:
from sklearn.preprocessing import LabelBinarizer

In [7]:
# Encoder that turns each distinct label into its own 0/1 indicator column.
# NOTE(review): the identifier is misspelled ("biarizer"); later cells use this exact name.
label_biarizer = LabelBinarizer()

In [11]:
label_biarizer.fit_transform(data_one) 
# Five distinct labels -> five output columns, in the order of label_biarizer.classes_.
# Column 0 is the indicator for 'BD', column 1 for 'IND', and so on.
# Each row is one input sample and has a single 1 in the column of its own country
# (e.g. the first row, 'USA', is 1 only in the last column).

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [12]:
# Labels learned during fit; position i here corresponds to column i of the encoded matrix
label_biarizer.classes_

array(['BD', 'IND', 'RU', 'UK', 'USA'], dtype='<U3')

In [13]:
# Round trip: encode the labels to indicator rows, then decode them back to the original strings
label_biarizer.inverse_transform(label_biarizer.transform(data_one))

array(['USA', 'UK', 'RU', 'IND', 'BD'], dtype='<U3')

<span class="badge"> Pandas Method </span>

In [21]:
# pandas equivalent of one-hot encoding. get_dummies needs 1-D input, so take column 0.
# dtype=int keeps the 0/1 indicators: pandas >= 2.0 defaults to bool (True/False) columns.
pd.get_dummies(data_one[:, 0], dtype=int)

Unnamed: 0,BD,IND,RU,UK,USA
0,0,0,0,0,1
1,0,0,0,1,0
2,0,0,1,0,0
3,0,1,0,0,0
4,1,0,0,0,0


<span class="badge"> MultiLabel Binarizer </span>

In [22]:
from sklearn.preprocessing import MultiLabelBinarizer

In [23]:
# Multi-label samples: every observation carries two country codes
data_multiple = [
    ('USA', 'BD'),
    ('BD', 'IND'),
    ('RU', 'UK'),
    ('UK', 'USA'),
]

In [24]:
# Echo the list of label tuples that will be binarized
data_multiple

[('USA', 'BD'), ('BD', 'IND'), ('RU', 'UK'), ('UK', 'USA')]

In [25]:
# Encoder for samples that may carry several labels at once
multilabel_binarizer = MultiLabelBinarizer()

In [26]:
# One row per sample; a row has a 1 in every column whose label appears in that sample's tuple
multilabel_binarizer.fit_transform(data_multiple)

array([[1, 0, 0, 0, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 1, 1, 0],
       [0, 0, 0, 1, 1]])

In [27]:
# Column order of the indicator matrix produced above
multilabel_binarizer.classes_

array(['BD', 'IND', 'RU', 'UK', 'USA'], dtype=object)

<h4 class="text-center"> Ordinal Categorical </h4>

In [34]:
# Start from an empty frame; the columns are added in the next cell
game = pd.DataFrame()

In [35]:
# Five games, each with an ordinal popularity rating (Low < Medium < High)
game = game.assign(
    Name=['Limbo', 'Inside', 'Hades', 'Terraria', 'Ghost'],
    Popularity=['High', 'High', 'Low', 'Medium', 'Low'],
)

In [36]:
# Show the raw frame before any encoding
game

Unnamed: 0,Name,Popularity
0,Limbo,High
1,Inside,High
2,Hades,Low
3,Terraria,Medium
4,Ghost,Low


<span class="badge"> Mapper </span>

In [37]:
# Ordinal mapping: a larger number means a more popular game
popularity_scale = dict(Low=1, Medium=2, High=3)

In [38]:
# Preview the ordinal encoding without modifying `game`.
# Series.map looks each label up in the dictionary and returns a numeric Series.
# Series.replace on a string column depends on an implicit downcast that pandas 2.2
# deprecates (FutureWarning). With map, a label missing from the mapping becomes NaN
# instead of silently staying a string.
game['Popularity'].map(popularity_scale)

0    3
1    3
2    1
3    2
4    1
Name: Popularity, dtype: int64

In [40]:
# Overwrite the first game's rating with a label that sits between the existing levels.
# A single .loc call replaces the chained game['Popularity'][0] = ... form, which triggers
# SettingWithCopyWarning and does not update `game` when pandas Copy-on-Write is enabled.
game.loc[0, 'Popularity'] = 'Very High'

In [41]:
# Confirm that row 0 now reads 'Very High'
game

Unnamed: 0,Name,Popularity
0,Limbo,Very High
1,Inside,High
2,Hades,Low
3,Terraria,Medium
4,Ghost,Low


In [42]:
# Extended ordinal scale: 'Very High' is placed just above 'High'
popularity_scale = dict(zip(
    ('Low', 'Medium', 'High', 'Very High'),
    (1, 2, 3, 3.1),
))

In [44]:
# Store the ordinal encoding as a new numeric column.
# Series.map performs the dictionary lookup directly and returns a float column here
# (the scale contains 3.1). Series.replace would depend on an implicit object -> numeric
# downcast that pandas 2.2 deprecates. Labels missing from the scale become NaN.
game['Numeric_score'] = game['Popularity'].map(popularity_scale)

In [45]:
# Show the frame with the new Numeric_score column
game

Unnamed: 0,Name,Popularity,Numeric_score
0,Limbo,Very High,3.1
1,Inside,High,3.0
2,Hades,Low,1.0
3,Terraria,Medium,2.0
4,Ghost,Low,1.0


<h4 class="text-center"> Encode Dictionary </h4>

<span class="badge"> DictVectorizer </span>

In [46]:
from sklearn.feature_extraction import DictVectorizer

In [47]:
# Each dict is one sample: keys are feature names, values are that sample's numeric values
data_dict = [
    {'Red': 1, 'Blue': 2},
    {'Blue': 3, 'Yellow': 4},
    {'Blue': 1, 'Red': 4},
]

In [48]:
# Echo the list of feature dictionaries
data_dict

[{'Red': 1, 'Blue': 2}, {'Blue': 3, 'Yellow': 4}, {'Blue': 1, 'Red': 4}]

In [57]:
# sparse=False makes the transform return a dense NumPy array;
# without it the vectorizer produces a SciPy sparse matrix.
dictvectorizer = DictVectorizer(sparse=False)

In [58]:
# One row per dict, one column per distinct key; a key absent from a sample is filled with 0
dictvectorizer.fit_transform(data_dict)

array([[2., 1., 0.],
       [3., 0., 4.],
       [1., 4., 0.]])

In [59]:
# Column labels of the matrix above.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement and returns an ndarray, so .tolist()
# gives back a plain Python list of strings.
dictvectorizer.get_feature_names_out().tolist()

['Blue', 'Red', 'Yellow']

In [60]:
# Wrap the dense matrix in a DataFrame labelled with the learned feature names.
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
# Arguments are evaluated left to right, so the vectorizer is fitted before its names are read.
pd.DataFrame(
    dictvectorizer.fit_transform(data_dict),
    columns=dictvectorizer.get_feature_names_out(),
)

Unnamed: 0,Blue,Red,Yellow
0,2.0,1.0,0.0
1,3.0,0.0,4.0
2,1.0,4.0,0.0


<h4 class="text-center"> Impute Missing Values </h4>

In [90]:
# Work on a new frame derived from `game` and add two columns;
# the last Multiplayer entry is deliberately missing (NaN) for the imputation demo.
game_multy = game.assign(
    Review=[8, 9, 7, 8, 8],
    Multiplayer=[1, 0, 1, 1, np.nan],
)

In [91]:
# Keep only the numeric columns for imputation.
# Selecting by name makes the cell idempotent: the positional form iloc[:, 2:]
# drops two more columns every time the cell is re-run.
game_multy = game_multy[['Numeric_score', 'Review', 'Multiplayer']]

In [92]:
# Show the numeric frame; the last Multiplayer value is missing
game_multy

Unnamed: 0,Numeric_score,Review,Multiplayer
0,3.1,8,1.0
1,3.0,9,0.0
2,1.0,7,1.0
3,2.0,8,1.0
4,1.0,8,


<span class="badge"> Most Frequent </span>

In [93]:
from sklearn.impute import SimpleImputer

In [95]:
# 'most_frequent' fills a missing entry with the most common value of its column
imputer = SimpleImputer(strategy='most_frequent')

In [97]:
# Learn the per-column fill values and apply them in one step.
# The column labels are re-attached in the next cell.
impute_data = imputer.fit_transform(game_multy)

In [98]:
# Rebuild a labelled frame from the imputed values, reusing the original column names
pd.DataFrame(impute_data, columns= game_multy.columns)

Unnamed: 0,Numeric_score,Review,Multiplayer
0,3.1,8.0,1.0
1,3.0,9.0,0.0
2,1.0,7.0,1.0
3,2.0,8.0,1.0
4,1.0,8.0,1.0


#### K-Nearest Neighbors is also a good method for imputing missing categorical values

<h4 class="text-center"> Imbalanced Target/Classes </h4>

In [99]:
# Create a dataset with imbalanced classes (target column)

In [100]:
from sklearn.datasets import load_iris

In [101]:
# Load the iris dataset bundled with scikit-learn; the .data and .target attributes are read below
iris = load_iris()

In [105]:
# Feature matrix (one row per flower)
feature_iris = iris.data

In [106]:
# Integer class label for each row of feature_iris
target_iris = iris.target

In [107]:
# Discard the first 40 observations from both arrays so they stay aligned.
# NOTE(review): the bundled iris rows are presumably ordered by class, which would
# leave class 0 under-represented after this slice — confirm with np.bincount(targets).
features, targets = feature_iris[40:], target_iris[40:]

In [111]:
np.unique(targets)  # all three class labels are still present after slicing

array([0, 1, 2])

In [112]:
# Collapse the problem to two classes: label 0 stays 0, every other label becomes 1
targets = np.where(targets != 0, 1, 0)

In [114]:
np.unique(targets)  # only the two labels 0 and 1 remain

array([0, 1])

#### To handle Imbalanced Data
* Step 1 : Collect more observations
* Step 2 : Model Evaluation Metrics
* Step 3 : Class Weighting Parameters
* Step 4 : Downsampling
* Step 5 : Upsampling