In [None]:
#Chapter 5. Handling Categorical Data

In [7]:
#5.1 Encoding Nominal Categorical Features

# Import libraries
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

# Single-column array of state names, one row per observation
feature = np.array(["Texas",
                    "California",
                    "Texas",
                    "Delaware",
                    "Texas"]).reshape(-1, 1)

# Encoder that turns each label into a row of indicator values
one_hot = LabelBinarizer()

# Fit the encoder on the feature and show the encoded matrix
one_hot_matrix = one_hot.fit_transform(feature)
print(one_hot_matrix)
print("- - - - - - - - - - - - - - - - ")

# Show the classes the encoder learned
print(one_hot.classes_)
print("- - - - - - - - - - - - - - - - ")

# Encode again with the fitted encoder, then map the rows back to labels
recovered_labels = one_hot.inverse_transform(one_hot.transform(feature))
print(recovered_labels)

[[0 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 0]
 [0 0 1]]
- - - - - - - - - - - - - - - - 
['California' 'Delaware' 'Texas']
- - - - - - - - - - - - - - - - 
['Texas' 'California' 'Texas' 'Delaware' 'Texas']


In [8]:
# Import library
import pandas as pd 

# Take the feature's first column as a 1-D array and build
# one indicator (dummy) column per distinct value
state_names = feature[:, 0]
pd.get_dummies(state_names)

Unnamed: 0,California,Delaware,Texas
0,False,False,True
1,True,False,False
2,False,False,True
3,False,True,False
4,False,False,True


In [10]:
# Observations that each carry two class labels
multiclass_feature = [
    ("Texas", "Florida"),
    ("California", "Alabama"),
    ("Texas", "Florida"),
    ("Delaware", "Florida"),
    ("Texas", "Alabama"),
]

# Encoder for observations with multiple labels
one_hot_multiclass = MultiLabelBinarizer()

# Fit on the observations and show the resulting indicator matrix
multiclass_matrix = one_hot_multiclass.fit_transform(multiclass_feature)
print(multiclass_matrix)
print("- - - - - - - - - - - - - - - - ")

# Show the classes the encoder learned
print(one_hot_multiclass.classes_)

[[0 0 0 1 1]
 [1 1 0 0 0]
 [0 0 0 1 1]
 [0 0 1 1 0]
 [1 0 0 0 1]]
- - - - - - - - - - - - - - - - 
['Alabama' 'California' 'Delaware' 'Florida' 'Texas']


In [None]:
#5.2 Encoding Ordinal Categorical Features

# Create features: one ordinal column of text labels
df = pd.DataFrame({"Score": ["Low",
                             "Low",
                             "Medium",
                             "Medium",
                             "High"]})

# Create mapper from each ordinal label to its numeric rank
scale_mapper = {"Low": 1,
                "Medium": 2,
                "High": 3}

# Map feature values onto the scale.
# Series.map is used rather than Series.replace: replacing strings with
# numbers relies on pandas silently downcasting the object column, which
# raises a FutureWarning in pandas 2.2 and is slated for removal.
# Note: a label missing from scale_mapper becomes NaN instead of being
# left unchanged, so every expected label must appear in the mapper.
score_encoded = df["Score"].map(scale_mapper)
score_encoded


  df["Score"].replace(scale_mapper)


0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [17]:
# Create features, now including a label that sits between Medium and High
df = pd.DataFrame({"Score": ["Low",
                             "Low",
                             "Medium",
                             "Medium",
                             "High",
                             "Barely More Than Medium"]})

# Create mapper giving every label its own integer rank
scale_mapper = {"Low": 1,
                "Medium": 2,
                "Barely More Than Medium": 3,
                "High": 4}

# Map feature values onto the scale.
# Series.map avoids the deprecated silent downcasting that
# Series.replace depends on when swapping strings for numbers.
# Note: a label missing from scale_mapper becomes NaN.
score_encoded = df["Score"].map(scale_mapper)
score_encoded

  df["Score"].replace(scale_mapper)


0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [18]:
# Create mapper where the in-between label gets a fractional value (2.1)
# instead of its own integer rank
scale_mapper = {"Low": 1,
                "Medium": 2,
                "Barely More Than Medium": 2.1,
                "High": 3}

# Map feature values onto the scale.
# Series.map avoids the deprecated silent downcasting that
# Series.replace depends on when swapping strings for numbers.
# Note: a label missing from scale_mapper becomes NaN.
score_encoded = df["Score"].map(scale_mapper)
score_encoded

  df["Score"].replace(scale_mapper)


0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

In [20]:
#5.3 Encoding Dictionaries of Features

# Import library
from sklearn.feature_extraction import DictVectorizer

# One dictionary per observation, mapping a feature name to its value
data_dict = [
    {"Red": 2, "Blue": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 2, "Yellow": 2},
]

# Vectorizer configured to return a dense array (sparse=False)
dicvectorizer = DictVectorizer(sparse=False)

# Fit the vectorizer and turn the dictionaries into a feature matrix
features = dicvectorizer.fit_transform(data_dict)

# Display the feature matrix
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [22]:
# Ask the fitted vectorizer for the names of its output columns
feature_names = dicvectorizer.get_feature_names_out()

# Display the column names
feature_names

array(['Blue', 'Red', 'Yellow'], dtype=object)

In [23]:
# Wrap the feature matrix in a dataframe labelled with the feature names
pd.DataFrame(data=features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [24]:
# Word-count dictionary for each of four documents
doc_1_word_count = dict(Red=2, Blue=4)
doc_2_word_count = dict(Red=4, Blue=3)
doc_3_word_count = dict(Red=1, Yellow=2)
doc_4_word_count = dict(Red=2, Yellow=2)

# Collect the per-document dictionaries in document order
doc_word_counts = [
    doc_1_word_count,
    doc_2_word_count,
    doc_3_word_count,
    doc_4_word_count,
]

# Refit the vectorizer on the word counts and return the feature matrix
dicvectorizer.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [28]:
#5.4 Imputing Missing Class Values

# Load libraries
from sklearn.neighbors import KNeighborsClassifier

# Fully observed rows; column 0 holds the categorical feature
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Rows whose categorical feature (column 0) is missing
X_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Split out the pieces used for training and prediction
known_predictors = X[:, 1:]
known_classes = X[:, 0]
missing_predictors = X_with_nan[:, 1:]

# Fit a 3-neighbor, distance-weighted KNN on the observed rows
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
trained_model = clf.fit(known_predictors, known_classes)

# Predict the class for the rows where it is missing
imputed_values = trained_model.predict(missing_predictors)

# Put the predicted class back in front of its other features
X_with_imputed = np.column_stack((imputed_values, missing_predictors))

# Stack the imputed rows on top of the observed rows
np.concatenate((X_with_imputed, X), axis=0)

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [29]:
from sklearn.impute import SimpleImputer

# Stack the rows with missing values on top of the observed rows
X_complete = np.concatenate((X_with_nan, X), axis=0)

# Imputer using the 'most_frequent' strategy
imputer = SimpleImputer(strategy='most_frequent')

# Fit the imputer and return the filled-in matrix
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [36]:
#5.5 Handling Imbalanced Classes

# Load libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load the iris dataset
iris = load_iris()

# Drop the first 40 observations from the feature matrix and
# from the target vector
features = iris.data[40:]
target = iris.target[40:]

# Recode the target as binary: 0 stays 0, every other class becomes 1
is_class0 = target == 0
target = np.where(is_class0, 0, 1)

# Display the imbalanced target vector
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [65]:
# Create weights
weights = {0: 0.9,
           1: 0.1}

# Create (but do not fit) a random forest classifier with explicit class weights
weighted_forest = RandomForestClassifier(class_weight=weights)

# Create (but do not fit) a random forest classifier with balanced class weights
balanced_forest = RandomForestClassifier(class_weight='balanced')

# Indices of each class's observations
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

# Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# Seeded generator so the same rows are drawn on every run
downsample_rng = np.random.default_rng(0)

# For every observation of class 0, randomly sample
# from class 1 without replacement
i_class1_downsample = downsample_rng.choice(i_class1, size=n_class0, replace=False)

# Join together class 0's target vector with the
# downsampled class 1's target vector
print(np.hstack((target[i_class0], target[i_class1_downsample])))
print("- - - - - - - - - - - - - - - - ")

# Join together class 0's feature matrix with the
# downsampled class 1's feature matrix, showing the first five rows
print(np.vstack((features[i_class0, :], features[i_class1_downsample, :]))[0:5])

[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]
- - - - - - - - - - - - - - - - 
[[5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]]


In [66]:
# Seeded generator so the same rows are drawn on every run
upsample_rng = np.random.default_rng(0)

# For every observation in class 1, randomly sample from class 0 with
# replacement
i_class0_upsample = upsample_rng.choice(i_class0, size=n_class1, replace=True)

# Join together class 0's upsampled target vector with class 1's target vector
np.concatenate((target[i_class0_upsample], target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [70]:
# Stack class 0's upsampled feature rows on top of class 1's feature rows
# and show the first five rows
np.concatenate((features[i_class0_upsample], features[i_class1]), axis=0)[:5]

array([[5.3, 3.7, 1.5, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5. , 3.5, 1.3, 0.3]])