In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Encoding Ordinal Data

In [28]:
# Toy frame with a single ordinal column of letter grades.
data = pd.DataFrame(['A', 'B', 'A', 'C', 'A'], columns=['Nota'])
data

Unnamed: 0,Nota
0,A
1,B
2,A
3,C
4,A


In [29]:
# Ordinal scale: a better letter grade gets a larger number, so order is preserved.
mapping = dict(A=10, B=8, C=6, D=4, F=2)

In [30]:
# Translate each letter grade into its numeric score via `mapping` and keep the
# result as a new column next to the original one.
data['Notam_Num'] = data['Nota'].map(mapping)
data

Unnamed: 0,Nota,Notam_Num
0,A,10
1,B,8
2,A,10
3,C,6
4,A,10


# Label Encoding

In [31]:
import numpy as np

In [32]:
# Toy frame with a single nominal column of car manufacturers.
data = pd.DataFrame(
    ['Toyota', 'Ford', 'Ford', 'Mercedes', 'Ford'],
    columns=['Fabricante'],
)
data

Unnamed: 0,Fabricante
0,Toyota
1,Ford
2,Ford
3,Mercedes
4,Ford


In [33]:
# Give every distinct manufacturer (missing values excluded) a consecutive integer
# code. The codes follow the order returned by np.unique, which is alphabetical
# in the output below.
mapping = dict(
    (label, idx)
    for idx, label in enumerate(np.unique(data['Fabricante'].dropna()))
)
mapping

{'Ford': 0, 'Mercedes': 1, 'Toyota': 2}

In [34]:
# Replace each manufacturer name with its integer code from `mapping` and store
# the codes in a new column.
data['Fabricante_Label'] = data['Fabricante'].map(mapping)
data

Unnamed: 0,Fabricante,Fabricante_Label
0,Toyota,2
1,Ford,0
2,Ford,0
3,Mercedes,1
4,Ford,0


Via Sklearn

In [35]:
from sklearn.preprocessing import LabelEncoder

In [36]:
# Same label encoding as the manual mapping, delegated to scikit-learn:
# learn the classes and produce the integer codes in a single call.
encoder = LabelEncoder()
data['Fabricante_Label'] = encoder.fit_transform(data['Fabricante'])
data

Unnamed: 0,Fabricante,Fabricante_Label
0,Toyota,2
1,Ford,0
2,Ford,0
3,Mercedes,1
4,Ford,0


# One-Hot Encoding

In [37]:
# Rebuild the manufacturer frame from scratch so it holds only the raw column
# (the label-encoding section added an extra 'Fabricante_Label' column).
data = pd.Series(
    ['Toyota', 'Ford', 'Ford', 'Mercedes', 'Ford'],
    name='Fabricante',
).to_frame()
data

Unnamed: 0,Fabricante
0,Toyota
1,Ford
2,Ford
3,Mercedes
4,Ford


In [38]:
# One indicator column per manufacturer; the generated column names are prefixed
# with the source column name ('Fabricante_<value>').
one_hot_encodings = pd.get_dummies(data, columns=['Fabricante'] )
one_hot_encodings

Unnamed: 0,Fabricante_Ford,Fabricante_Mercedes,Fabricante_Toyota
0,False,False,True
1,True,False,False
2,True,False,False
3,False,True,False
4,True,False,False


In [39]:
# Show the indicator columns side by side with the original column (joined on the
# row index). The result is only displayed, not assigned to a variable.
data.join(one_hot_encodings)

Unnamed: 0,Fabricante,Fabricante_Ford,Fabricante_Mercedes,Fabricante_Toyota
0,Toyota,False,False,True
1,Ford,True,False,False
2,Ford,True,False,False
3,Mercedes,False,True,False
4,Ford,True,False,False


Via Sklearn

In [40]:
from sklearn.preprocessing import OneHotEncoder

In [41]:
fabricante_enc = OneHotEncoder()

# The encoder is fed a 2-D array, so reshape the column to (n_rows, 1) once and
# reuse it. A bare array carries no column name, which is why the generated
# feature names later use the generic 'x0_' prefix.
fabricante_column = data['Fabricante'].values.reshape(-1, 1)
print(fabricante_column)

# Learn the categories and encode in one step, then densify the result.
fabricante_transformed = fabricante_enc.fit_transform(fabricante_column).toarray()

print(fabricante_transformed)

[['Toyota']
 ['Ford']
 ['Ford']
 ['Mercedes']
 ['Ford']]
[[0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [42]:
# Round-trip check: decode a single one-hot row back to its original category.
fabricante_enc.inverse_transform([[1,0,0]])

array([['Ford']], dtype=object)

In [43]:
# Wrap the dense one-hot matrix in a DataFrame, labelling the columns with the
# feature names generated by the fitted encoder.
ohe_df = pd.DataFrame(fabricante_transformed, columns=fabricante_enc.get_feature_names_out())
ohe_df

Unnamed: 0,x0_Ford,x0_Mercedes,x0_Toyota
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0


# Rare Label Encoding

In [44]:
!pip install -U feature-engine




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [45]:
from feature_engine.encoding import RareLabelEncoder

In [46]:
def load_titanic(source='https://www.openml.org/data/get_csv/16826755/phpMYEkMl'):
    """Load the Titanic passenger table and apply light clean-up.

    Parameters
    ----------
    source : str or file-like, optional
        Anything ``pandas.read_csv`` accepts. Defaults to the OpenML CSV export
        of the Titanic dataset, so ``load_titanic()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The raw table with these adjustments:

        * ``'?'`` placeholders are replaced by ``NaN``;
        * ``cabin`` is reduced to its first character, with ``'n'`` marking
          passengers whose cabin is missing;
        * ``pclass`` is cast to object dtype so it is treated as categorical;
        * missing ``embarked`` values are filled with ``'C'``.
    """
    data = pd.read_csv(source)
    data = data.replace('?', np.nan)

    # Keep only the first character of the cabin code. Missing cabins are given
    # the 'n' marker explicitly rather than relying on astype(str) rendering NaN
    # as the text 'nan', which keeps the result stable across pandas versions.
    data['cabin'] = data['cabin'].fillna('n').astype(str).str[0]

    data['pclass'] = data['pclass'].astype('O')

    # Assign the filled column back to the frame. Calling fillna(inplace=True) on
    # `data['embarked']` is chained assignment: it emits a FutureWarning on recent
    # pandas and silently leaves `data` untouched under copy-on-write.
    data['embarked'] = data['embarked'].fillna('C')
    return data

In [47]:
data = load_titanic()

# Hold out 30% of the passengers for testing. The target ('survived') and the
# 'name' and 'ticket' columns are removed from the features.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['survived', 'name', 'ticket']),
    data['survived'],
    test_size=0.3,
    random_state=0,
)

# Cabin label frequencies in the training split, before any grouping.
X_train['cabin'].value_counts()


cabin
n    702
C     71
B     42
E     32
D     32
A     17
F     15
G      4
T      1
Name: count, dtype: int64

In [48]:
# Configure the rare-label encoder.
# tol (float, documented default 0.05) is the minimum frequency a label must
# have to be considered frequent; categories with a frequency below tol are
# grouped together. Here the threshold is 3% and the grouped labels are
# renamed to 'Rare'.
encoder = RareLabelEncoder(
    tol=0.03,
    n_categories=2,
    variables=['cabin', 'pclass', 'embarked'],
    replace_with='Rare',
)

# Learn the frequent categories from the training split only.
encoder.fit(X_train)


In [49]:
# Apply the fitted encoder to both splits; the encoder itself was fitted on
# X_train only, so the test split is encoded with what was learned there.
train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)


In [50]:
# Per variable, the categories the fitted encoder keeps as-is; the cabin labels
# missing from this list are the ones that end up under 'Rare'.
encoder.encoder_dict_

{'cabin': ['n', 'C', 'B', 'E', 'D'],
 'pclass': [3, 1, 2],
 'embarked': ['S', 'C', 'Q']}

In [51]:
# Cabin distribution after encoding: the infrequent labels are now pooled under
# the single 'Rare' label.
train_t.cabin.value_counts()

cabin
n       702
C        71
B        42
Rare     37
E        32
D        32
Name: count, dtype: int64

# Count Frequency

In [52]:
from feature_engine.encoding import CountFrequencyEncoder

In [53]:
# Start again from a fresh copy of the data with a 70/30 train/test split.
data = load_titanic()

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['survived', 'name', 'ticket']),
    data['survived'],
    test_size=0.3,
    random_state=0,
)

# Encoder that replaces each category with its relative frequency.
encoder = CountFrequencyEncoder(
    encoding_method='frequency',
    variables=['cabin', 'pclass', 'embarked'],
)

# Learn the frequencies on the training split, then encode both splits.
encoder.fit(X_train)
train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)

# Learned category -> frequency lookup for each encoded variable.
encoder.encoder_dict_

{'cabin': {'n': 0.7663755458515283,
  'C': 0.07751091703056769,
  'B': 0.04585152838427948,
  'E': 0.034934497816593885,
  'D': 0.034934497816593885,
  'A': 0.018558951965065504,
  'F': 0.016375545851528384,
  'G': 0.004366812227074236,
  'T': 0.001091703056768559},
 'pclass': {3: 0.5436681222707423,
  1: 0.25109170305676853,
  2: 0.2052401746724891},
 'embarked': {'S': 0.7117903930131004,
  'C': 0.19759825327510916,
  'Q': 0.0906113537117904}}

In [54]:
# Inspect the encoded training frame: 'pclass', 'cabin' and 'embarked' now hold
# frequencies instead of labels, while the other columns are unchanged.
train_t

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked,boat,body,home.dest
501,0.205240,female,13,0,1,19.5,0.766376,0.711790,14,,"England / Bennington, VT"
588,0.205240,female,4,1,1,23,0.766376,0.711790,14,,"Cornwall / Akron, OH"
402,0.205240,female,30,1,0,13.8583,0.766376,0.197598,12,,"Barcelona, Spain / Havana, Cuba"
1193,0.543668,male,,0,0,7.725,0.766376,0.090611,,,
686,0.543668,female,22,0,0,7.725,0.766376,0.090611,13,,"Kingwilliamstown, Co Cork, Ireland Glens Falls..."
...,...,...,...,...,...,...,...,...,...,...,...
763,0.543668,female,0.1667,1,2,20.575,0.766376,0.711790,10,,"Devon, England Wichita, KS"
835,0.543668,male,,0,0,8.05,0.766376,0.711790,,,
1216,0.543668,female,,0,0,7.7333,0.766376,0.090611,13,,
559,0.205240,female,20,0,0,36.75,0.766376,0.711790,11,,"Cornwall / Hancock, MI"


# Target Encoding

In [55]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.2-py2.py3-none-any.whl (81 kB)
                                              0.0/81.8 kB ? eta -:--:--
     ---------------------------------------- 81.8/81.8 kB 4.5 MB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.2



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [56]:
# Toy salary records, one row per person:
# [city, years of experience, yearly salary in thousands].
data = [
    ['Salt Lake City', 10, 120],
    ['Seattle', 5, 120],
    ['San Franscisco', 5, 140],
    ['Seattle', 3, 100],
    ['Seattle', 1, 70],
    ['San Franscisco', 2, 100],
    ['Salt Lake City', 1, 60],
    ['San Franscisco', 2, 110],
    ['Seattle', 4, 100],
    ['Salt Lake City', 2, 70],
]
df = pd.DataFrame(
    data,
    columns=['City', 'Years OF Exp', 'Yearly Salary in Thousands'],
)
df

Unnamed: 0,City,Years OF Exp,Yearly Salary in Thousands
0,Salt Lake City,10,120
1,Seattle,5,120
2,San Franscisco,5,140
3,Seattle,3,100
4,Seattle,1,70
5,San Franscisco,2,100
6,Salt Lake City,1,60
7,San Franscisco,2,110
8,Seattle,4,100
9,Salt Lake City,2,70


In [57]:
import category_encoders as ce

In [58]:
# Target encoding: represent each city by a statistic of the salary column.
# With min_samples_leaf=0 and smoothing=0, the encoded values shown in the output
# are the plain per-city mean salaries.
tenc = ce.TargetEncoder(min_samples_leaf=0, smoothing=0)

# Encode and rename in one chained expression instead of renaming in place.
# The bare `df_city` line that used to sit mid-cell displayed nothing, since only
# the last expression of a cell is rendered, so it has been removed.
df_city = (
    tenc.fit_transform(df['City'], df['Yearly Salary in Thousands'])
    .rename(columns={'City': 'City_Encoded'})
)

# Put the encoded column next to the original data for comparison.
df_new = df_city.join(df)
df_new

Unnamed: 0,City_Encoded,City,Years OF Exp,Yearly Salary in Thousands
0,83.333333,Salt Lake City,10,120
1,97.5,Seattle,5,120
2,116.666667,San Franscisco,5,140
3,97.5,Seattle,3,100
4,97.5,Seattle,1,70
5,116.666667,San Franscisco,2,100
6,83.333333,Salt Lake City,1,60
7,116.666667,San Franscisco,2,110
8,97.5,Seattle,4,100
9,83.333333,Salt Lake City,2,70
