<h1> Chap 05 - Handling Categorical Data</h1>

Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

## 5.1 Encoding Nominal Categorical Features

Dummying, or one-hot encoding: each category becomes its own 0/1 indicator column.

In [7]:
# One nominal feature (a US state name) for five observations, stored as a
# column vector: one row per observation, a single column for the feature.
feature = np.array(
    ['Texas', 'California', 'Texas', 'Delaware', 'Texas']
).reshape(-1, 1)

In [8]:
# Binarizer used in the following cells to one-hot encode `feature`
one_hot = LabelBinarizer()

In [9]:
# Learn the distinct classes and return one indicator column per class
one_hot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [10]:
# Classes learned during fitting; their order matches the indicator columns
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [11]:
# Round trip: encode `feature` with the fitted binarizer, then decode the
# indicator matrix back into the original labels
one_hot.inverse_transform(one_hot.transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [13]:
# pandas alternative: one indicator column per distinct value of the feature.
# dtype=int keeps the indicators as 0/1 integers on every pandas version;
# without it, pandas >= 2.0 returns boolean (True/False) columns instead.
pd.get_dummies(feature[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [14]:
# Each observation belongs to several classes at once (here, a pair of states)
multiclass_feature = [('Texas', 'Florida'),
                      ('California','Alabama'),
                      ('Texas','Florida'),
                      ('Texas','Florida'),
                      ('Delaware','Florida'),
                      ('Texas','Alabama')]

# Binarizer for observations that carry more than one class
one_hot_multiclass = MultiLabelBinarizer()

In [15]:
# One column per distinct class; a row holds a 1 for every class it contains
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [16]:
# Classes discovered during fitting, in the same order as the indicator columns
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delaware', 'Florida', 'Texas'],
      dtype=object)

## 5.2 Encoding Ordinal Categorical Features 

In [18]:
# Ordinal feature: the categories have a natural order (Low < Medium < High)
dataframe = pd.DataFrame({'Score': ['Low', 'Low', 'Medium', 'Medium', 'High']})

# Integer rank for each category, preserving that order
scale_mapper = {'Low': 1, 'Medium': 2, 'High': 3}

# Series.map does a plain per-value lookup and yields an int64 column.
# Series.replace on an object column depends on implicit downcasting, which
# pandas >= 2.2 deprecates with a FutureWarning.
# Note: a category missing from scale_mapper would become NaN.
dataframe['Score'].map(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [19]:
# Same feature with an extra category that sits between Medium and High
dataframe = pd.DataFrame({
    'Score': ['Low', 'Low', 'Medium', 'Medium', 'High', 'Barely More Than Medium'],
})

# Rank the categories 1..4 in ascending order
scale_mapper = {
    level: rank
    for rank, level in enumerate(
        ['Low', 'Medium', 'Barely More Than Medium', 'High'], start=1
    )
}

In [20]:
# Encode by per-value lookup. Series.map avoids the implicit object-dtype
# downcasting that Series.replace relies on (deprecated since pandas 2.2).
# Note: a category missing from scale_mapper would become NaN.
dataframe['Score'].map(scale_mapper)

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [22]:
# The gaps between ranks need not be equal: a fractional value places
# 'Barely More Than Medium' just above 'Medium' instead of a full step away
scale_mapper = {'Low': 1,
                'Medium': 2,
                'Barely More Than Medium': 2.1,
                'High': 3}

# Encode by per-value lookup. Series.map avoids the implicit object-dtype
# downcasting that Series.replace relies on (deprecated since pandas 2.2).
# Note: a category missing from scale_mapper would become NaN.
dataframe['Score'].map(scale_mapper)

0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

## 5.3 Encoding Dictionaries of Features 

In [24]:
# One dictionary per observation, mapping a feature name to its value.
# Not every observation has every key.
data_dict = [
    dict(Red=2, Blue=4),
    dict(Red=4, Blue=3),
    dict(Red=1, Yellow=2),
    dict(Red=2, Yellow=2),
]

In [26]:
# sparse=False makes the vectorizer return a dense NumPy array
# rather than a sparse matrix
dictvectorizer = DictVectorizer(sparse=False)

In [27]:
# Build the feature matrix: one column per distinct key, 0 where a key is absent
features = dictvectorizer.fit_transform(data_dict)

In [28]:
# Show the dense feature matrix (one row per dictionary)
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [29]:
# Column names, in the same order as the columns of the feature matrix.
# The fitted `feature_names_` attribute is read because `get_feature_names()`
# was deprecated in scikit-learn 1.0 and removed in 1.2.
# list(...) copies it so the vectorizer's own list is never mutated.
feature_names = list(dictvectorizer.feature_names_)

In [30]:
# Show the column names
feature_names

['Blue', 'Red', 'Yellow']

In [31]:
# Label the matrix columns with the feature names for readability
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [37]:
# Word counts for four documents: word -> number of occurrences
doc_1_word_count = dict(Red=2, Blue=4)
doc_2_word_count = dict(Red=4, Blue=3)
doc_3_word_count = dict(Red=1, Yellow=2)
doc_4_word_count = dict(Red=2, Yellow=2)

In [38]:
# Collect the per-document dictionaries into one list, one entry per document
doc_word_counts = [doc_1_word_count,
                   doc_2_word_count,
                   doc_3_word_count,
                   doc_4_word_count]

In [39]:
# Refit the same vectorizer on the word counts: one column per distinct word
dictvectorizer.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

## 5.4 Imputing Missing Class Values 

In [41]:
# Fully observed data. Column 0 holds the class (0 or 1);
# columns 1-2 hold two numeric features.
X = np.column_stack((
    [0, 1, 0, 1],          # class
    [[2.1, 1.45],          # features
     [1.18, 1.33],
     [1.22, 1.27],
     [-0.21, -1.19]],
))

In [42]:
# Observations whose class (column 0) is missing; both features are known
X_with_nan = np.column_stack((
    np.full(2, np.nan),
    [[0.87, 1.31],
     [-0.67, -0.22]],
))

In [43]:
# 3-nearest-neighbours classifier; neighbours are weighted by distance
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')

# Learn to predict the class (column 0) from the remaining feature columns
trained_model = clf.fit(X=X[:, 1:], y=X[:, 0])

In [44]:
# Predict the missing class of each incomplete observation from its features
imputed_values = trained_model.predict(X_with_nan[:, 1:])

# Put the predicted class back in front of the feature columns
X_with_imputed = np.column_stack((imputed_values, X_with_nan[:, 1:]))

In [46]:
# Stack the imputed rows on top of the fully observed rows
np.concatenate((X_with_imputed, X), axis=0)

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [54]:
# All observations in one array: rows with a missing class first,
# followed by the fully observed rows
X_complete = np.concatenate((X_with_nan, X), axis=0)

In [57]:
# Simpler alternative: fill each missing value with the most frequent
# value of its column
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

## 5.5 Handling Imbalanced Classes 

- Collect more data
- Change the metrics used to evaluate the model
- Weight parameters
- Downsampling
- Oversampling

In [3]:
# Build an imbalanced binary problem from iris: dropping the first 40
# observations leaves 10 of class 0 against 100 of the other classes
iris = load_iris()
features = iris.data[40:, :]

# Class 0 stays 0; every other class is merged into class 1
target = np.where(iris.target[40:] == 0, 0, 1)
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [8]:
# Alternative (left commented out): pass explicit per-class weights instead of 'balanced'
#weights = {0: .9, 1: 0.1}
#RandomForestClassifier(class_weight = weights) # Instantiate the RandomForestClassifier with the specified class weights
RandomForestClassifier(class_weight= 'balanced') # Let the model create the weights inversely proportional to class frequencies

RandomForestClassifier(class_weight='balanced')

Undersampling (downsampling the majority class)

In [10]:
# Indices of the observations in each class
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

# Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# Seeded generator so the random subset is the same on every run
rng = np.random.default_rng(0)

# Draw as many class-1 observations as there are class-0 observations,
# without replacement, so both classes end up the same size
i_class1_downsampled = rng.choice(i_class1, size=n_class0, replace=False)

# Target vector of the balanced sample: all of class 0, then the class-1 draw
np.hstack((target[i_class0], target[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [12]:
# Feature rows for the downsampled set (class 0 first, then the class-1 draw);
# only the first five rows are shown
np.vstack((features[i_class0,:], features[i_class1_downsampled,:]))[0:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

Oversampling

In [13]:
# Seeded generator so the resampled indices are the same on every run
rng = np.random.default_rng(0)

# Draw class-0 observations with replacement until there are as many as in
# class 1 (despite the "unsampled" name, these are the upsampled indices)
i_class0_unsampled = rng.choice(i_class0, size=n_class1, replace=True)

# Target vector of the balanced sample: resampled class 0, then all of class 1
np.concatenate((target[i_class0_unsampled], target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [14]:
# Feature rows for the oversampled set (resampled class 0 first, then class 1);
# only the first five rows are shown
np.vstack((features[i_class0_unsampled,:], features[i_class1,:]))[0:5]

array([[5. , 3.5, 1.6, 0.6],
       [4.6, 3.2, 1.4, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 3.5, 1.6, 0.6]])