# Encoding Nominal Features

* Features which do not follow a natural ordering (e.g. apple, banana, grapes etc.) are called nominal features

* Features which follow a natural ordering (e.g. low, medium, high etc.) are called ordinal features

In [1]:
import numpy as np
from sklearn import preprocessing

# Single categorical feature: one column, one label per observation
feature = np.array([["Texas"], ["California"], ["Texas"], ["Delaware"], ["Texas"]])

# Fit the binarizer and encode in one step; each class in `classes_` gets its own indicator column
one_hot = preprocessing.LabelBinarizer()
encoded = one_hot.fit_transform(feature)

print('One Hot Encoding \n', encoded)
print('Classes {}'.format(one_hot.classes_))

# Encode again with the already-fitted binarizer, then invert to get the labels back
decoded = one_hot.inverse_transform(one_hot.transform(feature))
print('Reversed One-Hot Encoding {}'.format(decoded))

One Hot Encoding 
 [[0 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 0]
 [0 0 1]]
Classes ['California' 'Delaware' 'Texas']
Reversed One-Hot Encoding ['Texas' 'California' 'Texas' 'Delaware' 'Texas']


In [2]:
# Multi-label feature: every observation carries a tuple of labels
# NOTE(review): "Albama" and "Delware" look like misspellings of "Alabama" and
# "Delaware"; they are kept as-is so the recorded output below still matches.
multiclass_feature = [
    ("Texas", "Florida"),
    ("California", "Albama"),
    ("Texas", "Florida"),
    ("Delware", "Florida"),
    ("Texas", "Albama"),
]

# One indicator column per distinct label; a row may have several columns set
one_hot_multiclass = preprocessing.MultiLabelBinarizer()
multilabel_encoded = one_hot_multiclass.fit_transform(multiclass_feature)

print("One Hot Multiclass encoding \n", multilabel_encoded)
print("One Hot Multiclass classes: {}".format(one_hot_multiclass.classes_))


One Hot Multiclass encoding 
 [[0 0 0 1 1]
 [1 1 0 0 0]
 [0 0 0 1 1]
 [0 0 1 1 0]
 [1 0 0 0 1]]
One Hot Multiclass classes: ['Albama' 'California' 'Delware' 'Florida' 'Texas']


# Encoding Ordinal Features

In [3]:
import pandas as pd

df = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

# Ordinal scale: each category is mapped to its rank so the ordering is preserved
scale_mapper = {"Low": 1, "Medium": 2, "High": 3}

# Series.map performs a plain dictionary lookup per value and yields an integer
# Series directly. Series.replace with a str -> int mapping instead relies on an
# implicit object -> int downcast, which recent pandas versions deprecate.
# Caveat: a category missing from scale_mapper becomes NaN with map, whereas
# replace would have left the original string in place.
score_encoded = df["Score"].map(scale_mapper)
score_encoded

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

# Encoding Dictionary of Features

In [4]:
from sklearn.feature_extraction import DictVectorizer

# One dictionary per observation; the keys become the feature columns
data_dict = [
    {"Red": 2, "Blue": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 2, "Yellow": 2},
]

# sparse=False asks for a dense array instead of a sparse matrix
dictvectorizer = DictVectorizer(sparse=False)

# Learn the key -> column mapping and build the feature matrix in one call
features = dictvectorizer.fit_transform(data_dict)
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [5]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() is its replacement; it returns an ndarray, so it is
# converted to a plain list to keep the printed form unchanged.
print(dictvectorizer.get_feature_names_out().tolist())

['Blue', 'Red', 'Yellow']


# Imputing Missing Class Values

In [6]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier


## Solution - 1 --> Applicable for smaller dataset

# Complete observations: column 0 holds the categorical class, the remaining columns are numeric
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Observations whose categorical class (column 0) is missing
X_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Distance-weighted 3-nearest-neighbour classifier: numeric columns in, class out
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
trained_model = clf.fit(X[:, 1:], X[:, 0])

# Predict the missing class from the numeric columns of the incomplete rows
numeric_columns = X_with_nan[:, 1:]
imputed_values = trained_model.predict(numeric_columns)

# Put the predicted class back in front of the numeric columns
X_with_imputed = np.column_stack((imputed_values, numeric_columns))

# Imputed rows first, followed by the originally complete rows
np.vstack((X_with_imputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [7]:
## Solution - 2 --> Applicable for Larger Dataset
from sklearn.impute import SimpleImputer

# Rows with the missing class first, then the complete rows
X_complete = np.concatenate((X_with_nan, X), axis=0)

# Fill every NaN with the most frequent value found in its column
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

# Handling Imbalanced Classes

In [8]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
# Synthetic two-class problem; `weights` requests roughly 10% of one class and 90% of the other
features, targets = datasets.make_classification(
    n_samples=150,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
    weights=[0.10, 0.90],
    random_state=1,  # fixed seed so the generated data is reproducible
)

# Show the labels to make the imbalance visible
targets

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

So you can see how imbalanced the dataset is.
### Solution - 1 
weights = {0:.9,  1: 0.1}

RandomForestClassifier(class_weight = weights)

### Solution - 2
RandomForestClassifier(class_weight = 'balanced')

The `'balanced'` setting assigns each class a weight inversely proportional to its frequency.

### Solution - 3
Use either downsampling of the majority class or upsampling of the minority class.