In [1]:
import pandas as pd
import numpy as np

category_of_service = ["Terrible","Poor", "Ok", "Good","Great"]

quality = ["Bad", "Good", "Really good", "Perfect"]

survey = {"service" : ["Terrible", "Poor", "Ok",          "Good",    "Great", "Terrible" ],
         "food":      ["Bad",      "Good", "Really good", "Perfect", "Bad",   "Good"]}

# 0: occacional customer.  1: frecuent customer
kind_customer = [0,0,1,1,0,1]

pd.DataFrame(survey)


Unnamed: 0,service,food
0,Terrible,Bad
1,Poor,Good
2,Ok,Really good
3,Good,Perfect
4,Great,Bad
5,Terrible,Good


### Ordinal encoder
#### No to dimensionality

In [3]:
from sklearn.preprocessing import OrdinalEncoder

data_ord = pd.DataFrame(survey)

encoder = OrdinalEncoder(categories=[category_of_service, quality])

data_ord = pd.DataFrame(encoder.fit_transform(data_ord),
                       columns=["service", "food"])
data_ord

Unnamed: 0,service,food
0,0.0,0.0
1,1.0,1.0
2,2.0,2.0
3,3.0,3.0
4,4.0,0.0
5,0.0,1.0


In [4]:
# To see the categories:
print(data_ord)
print(encoder.categories_)

   service  food
0      0.0   0.0
1      1.0   1.0
2      2.0   2.0
3      3.0   3.0
4      4.0   0.0
5      0.0   1.0
[array(['Terrible', 'Poor', 'Ok', 'Good', 'Great'], dtype=object), array(['Bad', 'Good', 'Really good', 'Perfect'], dtype=object)]


### ...

### Encoder OneHotEncoder

In [11]:
from sklearn.preprocessing import OneHotEncoder

data_one = pd.DataFrame(survey)
print(data_one)

encoder = OneHotEncoder()

print(encoder.fit_transform(data_one).toarray()) # To create a sparse matrix - Matriz dispersa
print(encoder.categories_)

    service         food
0  Terrible          Bad
1      Poor         Good
2        Ok  Really good
3      Good      Perfect
4     Great          Bad
5  Terrible         Good
[[0. 0. 0. 0. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 1. 0. 0.]]
[array(['Good', 'Great', 'Ok', 'Poor', 'Terrible'], dtype=object), array(['Bad', 'Good', 'Perfect', 'Really good'], dtype=object)]


In [14]:
print(np.concatenate(encoder.categories_))


['Good' 'Great' 'Ok' 'Poor' 'Terrible' 'Bad' 'Good' 'Perfect'
 'Really good']


In [15]:
data_one = pd.DataFrame(encoder.fit_transform(data_one).toarray(),
                       columns=np.concatenate(encoder.categories_))
data_one

Unnamed: 0,Good,Great,Ok,Poor,Terrible,Bad,Good.1,Perfect,Really good
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


### What method is better, OneHotEncoder or OrdinalEncoder?
We need to try 

### Comparative between both encoders
#### We will use LogisticRegression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

print("\n*** Scaling data for OrdinalEncoder model") 
# The OneHotEncoder doesn't need to be scale, because the values are between 0..1
scaler = MinMaxScaler()
print(data_ord)
data_ord = scaler.fit_transform(data_ord)
print(data_ord)

print("\n*** Clasification with encoded data with OrdinalEncoder")
model = LogisticRegression().fit(data_ord, kind_customer)
print("Predicions: ", model.predict(data_ord))
print("Correct_classes: ", kind_customer)
print(model.predict_proba(data_ord))

print("\n*** Clasification with encoded data with OneHotEncoder")
model = LogisticRegression().fit(data_one, kind_customer)
print("Predicions: ", model.predict(data_one))
print("Correct_classes: ", kind_customer)
print(model.predict_proba(data_one))




*** Scaling data for OrdinalEncoder model
[[0.         0.        ]
 [0.25       0.33333333]
 [0.5        0.66666667]
 [0.75       1.        ]
 [1.         0.        ]
 [0.         0.33333333]]
[[0.         0.        ]
 [0.25       0.33333333]
 [0.5        0.66666667]
 [0.75       1.        ]
 [1.         0.        ]
 [0.         0.33333333]]

*** Clasification with encoded data with OrdinalEncoder
Predicions:  [0 0 1 1 0 0]
Correct_classes:  [0, 0, 1, 1, 0, 1]
[[0.56489598 0.43510402]
 [0.50845928 0.49154072]
 [0.45180614 0.54819386]
 [0.39637512 0.60362488]
 [0.57175647 0.42824353]
 [0.50671189 0.49328811]]

*** Clasification with encoded data with OneHotEncoder
Predicions:  [0 0 1 1 0 1]
Correct_classes:  [0, 0, 1, 1, 0, 1]
[[0.62301432 0.37698568]
 [0.57519021 0.42480979]
 [0.32279763 0.67720237]
 [0.32279763 0.67720237]
 [0.70505958 0.29494042]
 [0.45116842 0.54883158]]


#### Conclusion: In this case, the OneHotEncoder did not have error of prediction, otherwise the OrdinalEncoder did.