In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

5.1 : Encoding Nominal Categorical Features

In [2]:
# A single nominal feature (state names), shaped as one column with one row
# per observation.
feature = np.array(
    ['Texas', 'California', 'Texas', 'Delaware', 'Texas']
).reshape(-1, 1)

# Learn the distinct classes, then show the one-hot encoded matrix
# (one indicator column per class).
one_hot = LabelBinarizer()
one_hot.fit(feature).transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [3]:
# Display the class labels the fitted binarizer learned from `feature`.
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [4]:
# Round trip: encode the feature, then decode the indicator matrix back
# into the original class labels.
encoded_feature = one_hot.transform(feature)
one_hot.inverse_transform(encoded_feature)

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [5]:
# pandas alternative: take the single feature column as a 1-D array and
# expand it into one indicator column per distinct value.
state_column = feature[:, 0]
pd.get_dummies(state_column)

Unnamed: 0,California,Delaware,Texas
0,False,False,True
1,True,False,False
2,False,False,True
3,False,True,False
4,False,False,True


In [6]:
# Each observation carries several classes at once (here, two state names
# per row), so a plain LabelBinarizer is not enough.
multiclass_feature = [("Texas", "Florida"),
                      ("California", "Alabama"),
                      ("Texas", "Florida"),
                      ("Delaware", "Florida"),
                      ("Texas", "Alabama")]

# Backward-compatible alias for the original, misspelled variable name, in
# case other cells still refer to it.
mulyiclass_feature = multiclass_feature

# Fit the multi-label binarizer and display the indicator matrix; a row can
# contain more than one 1 because an observation can belong to several classes.
one_hot_multiclass = MultiLabelBinarizer()
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [7]:
# Display the class labels learned by the fitted multi-label binarizer.
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delaware', 'Florida', 'Texas'],
      dtype=object)

5.2 : Encoding Ordinal Categorical Features

In [8]:
# Ordinal feature: the categories have a natural order (Low < Medium < High).
dataframe = pd.DataFrame({'Score' : ['Low', 'Low', 'Medium', 'Medium', 'High']})

# Category -> numeric rank; the numbers encode the ordering of the levels.
scal_mapper = {'Low' : 1,
               'Medium' : 2,
               'High' : 3}

# Use Series.map rather than Series.replace: replacing strings with numbers
# relies on pandas silently downcasting the object column, which is
# deprecated and emits a FutureWarning. map() builds the numeric column
# directly. Every category present in the column has a key in the mapper,
# so the resulting values are the same (a category missing from the mapper
# would become NaN with map()).
score_encoded = dataframe['Score'].map(scal_mapper)
score_encoded

  dataframe['Score'].replace(scal_mapper)


0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

Discussion

In [9]:
# Same ordinal feature with an extra level that sits just above 'Medium'.
dataframe = pd.DataFrame({'Score':["Low", 
                                   "Low", 
                                   "Medium", 
                                   "Medium", 
                                   "High",
                                   "Barely More Than Medium",]})

# Evenly spaced integer ranks: this treats the gap Medium -> Barely More Than
# Medium as equal to every other gap between adjacent levels.
scal_mapper = {'Low' : 1,
               'Medium' : 2,
               'Barely More Than Medium' : 3,
               'High' : 4}

# Series.map instead of Series.replace: replace() relies on deprecated silent
# downcasting of the object column and emits a FutureWarning. All categories
# in the column are keys of the mapper, so the values are unchanged
# (a category missing from the mapper would become NaN with map()).
score_encoded = dataframe['Score'].map(scal_mapper)
score_encoded

  dataframe['Score'].replace(scal_mapper)


0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [10]:
# Non-uniform ranks: 'Barely More Than Medium' is placed only slightly above
# 'Medium' (2.1) instead of a full step away from it.
scale_mapper = {'Low' : 1,
                'Medium' : 2,
                "Barely More Than Medium" : 2.1,
                'High' : 3}

# Series.map instead of Series.replace: replace() relies on deprecated silent
# downcasting of the object column and emits a FutureWarning. This applies the
# mapper to the `dataframe` built in the previous cell, whose categories are
# all keys of the mapper (a missing category would become NaN with map()).
dataframe['Score'].map(scale_mapper)

  dataframe['Score'].replace(scale_mapper)


0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

5.3 : Encoding Dictionaries of Features

In [11]:
# One dictionary per observation: keys are feature names, values are the
# feature values. Not every observation has every key.
data_dict = [
    {'Red': 2, 'Blue': 4},
    {'Red': 4, 'Blue': 3},
    {'Red': 1, 'Yellow': 2},
    {'Red': 2, 'Yellow': 2},
]

# sparse=False requests a dense array as the transform result.
dictvectorizer = DictVectorizer(sparse=False)

# Fit on the dictionaries and convert them to a feature matrix in one step,
# then display the matrix.
features = dictvectorizer.fit_transform(data_dict)
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [12]:
# Ask the fitted vectorizer for the names of the columns it produced, keep
# them for labelling the feature matrix, and display them.
feature_names = dictvectorizer.get_feature_names_out()
feature_names

array(['Blue', 'Red', 'Yellow'], dtype=object)

In [13]:
# Wrap the dense feature matrix in a DataFrame so each column is labelled
# with the feature name reported by the vectorizer.
pd.DataFrame(data=features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


Discussion

In [14]:
# Word counts for four documents, one dictionary per document.
doc_word_counts = [
    {"Red": 2, "Blue": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 2, "Yellow": 2},
]

# Keep the individual per-document names available as well.
(doc_1_word_counts,
 doc_2_word_counts,
 doc_3_word_counts,
 doc_4_word_counts) = doc_word_counts

# Re-fit the existing vectorizer on the word counts and display the
# resulting feature matrix.
dictvectorizer.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

5.4 : Imputing Missing Class Values

In [15]:
# Fully observed rows: column 0 holds the class, columns 1-2 the other features.
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Rows whose class (column 0) is missing.
X_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Train a 3-nearest-neighbours classifier on the observed rows, predicting
# the class column from the remaining columns.
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
known_features, known_classes = X[:, 1:], X[:, 0]
trained_model = clf.fit(known_features, known_classes)

# Predict the missing classes from the features of the incomplete rows.
imputed_values = trained_model.predict(X_with_nan[:, 1:])

# Put the predicted class back in front of its features, then stack the
# imputed rows on top of the fully observed rows and display the result.
X_with_imputed = np.column_stack((imputed_values, X_with_nan[:, 1:]))
np.concatenate((X_with_imputed, X), axis=0)

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [16]:
# Alternative strategy: stack the incomplete rows on top of the complete
# ones and fill each missing entry with its column's most frequent value.
X_complete = np.concatenate((X_with_nan, X), axis=0)

imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

5.5 : Handling Imbalanced Classes

In [17]:
iris = load_iris()

# Drop the first 40 observations so that class 0 becomes the minority class.
features = iris.data[40:, :]
target = iris.target[40:]

# Collapse to a binary target: class 0 stays 0, every other class becomes 1.
is_class0 = target == 0
target = np.where(is_class0, 0, 1)
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [18]:
# Explicit per-class weights: class 0 is weighted 0.9 and class 1 is
# weighted 0.1.
weights = {
    0: 0.9,
    1: 0.1,
}

# Construct (but do not fit) a random forest that uses these class weights.
RandomForestClassifier(class_weight=weights)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
# Same estimator, but passing the string 'balanced' instead of an explicit
# weight dictionary; the estimator is only constructed here, not fitted.
RandomForestClassifier(class_weight='balanced')

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# Row indices of each class (np.where returns a tuple; [0] is the index array).
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

n_class0 = len(i_class0)
n_class1 = len(i_class1)

# Seeded generator so the random subsample is identical on every
# Restart & Run All.
rng = np.random.default_rng(0)

# Downsample: draw as many class-1 rows as there are class-0 rows,
# without replacement so no row is picked twice.
i_class1_downsampled = rng.choice(i_class1, size=n_class0, replace=False)

# Keep the balanced target and feature matrix under names instead of
# computing them and throwing the result away.
target_downsampled = np.hstack((target[i_class0], target[i_class1_downsampled]))
features_downsampled = np.vstack((features[i_class0, :],
                                  features[i_class1_downsampled, :]))

# Display the first five rows of the balanced feature matrix.
features_downsampled[0:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

In [21]:
# Seeded generator so the random resample is identical on every
# Restart & Run All.
rng = np.random.default_rng(0)

# Upsample: draw class-0 rows with replacement until there are as many of
# them as class-1 rows.
i_class0_upsampled = rng.choice(i_class0, size=n_class1, replace=True)

# Balanced target vector: the resampled class-0 labels followed by all
# class-1 labels.
np.concatenate((target[i_class0_upsampled], target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [22]:
# Feature rows matching the upsampled target: the resampled class-0 rows
# stacked on top of all class-1 rows; display only the first five.
upsampled_class0_rows = features[i_class0_upsampled, :]
class1_rows = features[i_class1, :]
np.vstack((upsampled_class0_rows, class1_rows))[:5]

array([[5. , 3.3, 1.4, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [4.8, 3. , 1.4, 0.3],
       [4.8, 3. , 1.4, 0.3]])