In [5]:
from sklearn.datasets import fetch_openml

data = fetch_openml(name='titanic', version=1, as_frame=True)

X, y = data['data'], data['target']

In [6]:
X

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [7]:
X.embarked.value_counts()

embarked
S    914
C    270
Q    123
Name: count, dtype: int64

In [8]:
X.embarked.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1309 entries, 0 to 1308
Series name: embarked
Non-Null Count  Dtype   
--------------  -----   
1307 non-null   category
dtypes: category(1)
memory usage: 1.5 KB


In [9]:
X = X[X.embarked.notna()]

### Label Encoder
Useful for categorical values with two values mostly (but not exclusively).

In [10]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
X['embarked_label_encoder'] = label_encoder.fit_transform(X['embarked'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['embarked_label_encoder'] = label_encoder.fit_transform(X['embarked'])


In [11]:
X.sample(5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,embarked_label_encoder
188,1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51.0,0,1,PC 17592,39.4,D28,S,9.0,,"Paris, France",2
681,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C,,,"Syria Kent, ON",0
1282,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S,,,,2
880,3,"Jalsevac, Mr. Ivan",male,29.0,0,0,349240,7.8958,,C,15.0,,,0
521,2,"Nye, Mrs. (Elizabeth Ramell)",female,29.0,0,0,C.A. 29395,10.5,F33,S,11.0,,"Folkstone, Kent / New York, NY",2


### Ordinal Encoder
For values with inherent order.

In [12]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories=[['S', 'C', 'Q']])
X['embarked_ordinal_encoded'] = ordinal_encoder.fit_transform(X[['embarked']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['embarked_ordinal_encoded'] = ordinal_encoder.fit_transform(X[['embarked']])


In [14]:
X.sample(5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,embarked_label_encoder,embarked_ordinal_encoded
1262,3,"van Billiard, Master. James William",male,,1,1,A/5. 851,14.5,,S,,,,2,0.0
1129,3,"Petterson, Mr. Johan Emil",male,25.0,1,0,347076,7.775,,S,,,,2,0.0
558,2,"Silven, Miss. Lyyli Karoliina",female,18.0,0,2,250652,13.0,,S,16.0,,"Finland / Minneapolis, MN",2,0.0
427,2,"Hamalainen, Master. Viljo",male,0.6667,1,1,250649,14.5,,S,4.0,,"Detroit, MI",2,0.0
413,2,"Gale, Mr. Harry",male,38.0,1,0,28664,21.0,,S,,,"Cornwall / Clear Creek, CO",2,0.0


### One-Hot Encoding
For categorical features that do not have an inherent order and are mutually exclusive. If not too high cardinality.

In [15]:
import pandas as pd

X = X.join(pd.get_dummies(X['embarked'], dtype='int32', prefix='embarked'))

In [16]:
X.sample(5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,embarked_label_encoder,embarked_ordinal_encoded,embarked_C,embarked_Q,embarked_S
412,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,,S,,322.0,"Liverpool / Montreal, PQ",2,0.0,0,0,1
956,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S,,,,2,0.0,0,0,1
581,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0.0,,S,,,Belfast,2,0.0,0,0,1
953,3,"Leeni, Mr. Fahim ('Philip Zenni')",male,22.0,0,0,2620,7.225,,C,6.0,,,0,1.0,1,0,0
936,3,"Kink-Heilmann, Mrs. Anton (Luise Heilmann)",female,26.0,1,1,315153,22.025,,S,2.0,,,2,0.0,0,0,1


### Binary Encoding
Useful for high cardinality features. Especially when using decision trees or random forests.

In [19]:
import category_encoders as ce

binary_encoder = ce.BinaryEncoder()
binary_encoded = binary_encoder.fit_transform(X['embarked'])

X = pd.concat([X, binary_encoded], axis = 1)



In [20]:
X.sample(5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,embarked_label_encoder,embarked_ordinal_encoded,embarked_C,embarked_Q,embarked_S,embarked_0,embarked_1
181,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S,8,,"New York, NY",2,0.0,0,0,1,0,1
882,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S,,,,2,0.0,0,0,1,0,1
709,3,"Carr, Miss. Helen 'Ellen'",female,16.0,0,0,367231,7.75,,Q,16,,"Co Longford, Ireland New York, NY",1,2.0,0,1,0,1,1
170,1,"Ismay, Mr. Joseph Bruce",male,49.0,0,0,112058,0.0,B52 B54 B56,S,C,,Liverpool,2,0.0,0,0,1,0,1
328,2,"Angle, Mr. William A",male,34.0,1,0,226875,26.0,,S,,,"Warwick, England",2,0.0,0,0,1,0,1


### Frequency Encoding
Loses data. Use when strong correlation with target variable

In [21]:
freq_encoding = X['embarked'].value_counts(normalize=True)
X['embarked_freq_encoded'] = X['embarked'].map(freq_encoding)

In [22]:
X.sample(5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,...,body,home.dest,embarked_label_encoder,embarked_ordinal_encoded,embarked_C,embarked_Q,embarked_S,embarked_0,embarked_1,embarked_freq_encoded
513,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,...,,"New York, NY",0,1.0,1,0,0,1,0,0.20658
655,3,"Backstrom, Mr. Karl Alfred",male,32.0,1,0,3101278,15.85,,S,...,,"Ruotsinphytaa, Finland New York, NY",2,0.0,0,0,1,0,1,0.699311
1034,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C,...,,,0,1.0,1,0,0,1,0,0.20658
1233,3,"Sundman, Mr. Johan Julian",male,44.0,0,0,STON/O 2. 3101269,7.925,,S,...,,,2,0.0,0,0,1,0,1,0.699311
333,2,"Ball, Mrs. (Ada E Hall)",female,36.0,0,0,28551,13.0,D,S,...,,"Bristol, Avon / Jacksonville, FL",2,0.0,0,0,1,0,1,0.699311


### Target Encoding (Mean)
Can handle high cardinality. Useful when lots of rows and columns. Can lead to overfitting

In [24]:
df = X.copy()
df['survived'] = y
df['survived'] = df['survived'].astype('int')

target_means = df.groupby('embarked', observed=True)['survived'].mean()

X['embarked_target_encoded'] = X['embarked'].map(target_means)

In [26]:
X

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,...,home.dest,embarked_label_encoder,embarked_ordinal_encoded,embarked_C,embarked_Q,embarked_S,embarked_0,embarked_1,embarked_freq_encoded,embarked_target_encoded
0,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,...,"St Louis, MO",2,0.0,0,0,1,0,1,0.699311,0.332604
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,...,"Montreal, PQ / Chesterville, ON",2,0.0,0,0,1,0,1,0.699311,0.332604
2,1,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,...,"Montreal, PQ / Chesterville, ON",2,0.0,0,0,1,0,1,0.699311,0.332604
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,...,"Montreal, PQ / Chesterville, ON",2,0.0,0,0,1,0,1,0.699311,0.332604
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,...,"Montreal, PQ / Chesterville, ON",2,0.0,0,0,1,0,1,0.699311,0.332604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,...,,0,1.0,1,0,0,1,0,0.206580,0.555556
1305,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,...,,0,1.0,1,0,0,1,0,0.206580,0.555556
1306,3,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,...,,0,1.0,1,0,0,1,0,0.206580,0.555556
1307,3,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,...,,0,1.0,1,0,0,1,0,0.206580,0.555556


### Embeddings (Trainable Representation)
Learns representation for deep learning

In [30]:
import torch

unique_values = 150
embedding_dim = 10

embedding = torch.nn.Embedding(unique_values, embedding_dim)
values = torch.LongTensor([1,2,4,5,66,140]) # but not 150 or higher
output = embedding(values)

print(output)

tensor([[ 0.2950, -0.1805, -2.2300,  1.4923, -0.1795,  0.6752,  0.2929, -0.4946,
          0.5360, -0.5277],
        [ 1.3852, -2.2665, -0.0234, -1.9375, -1.1209, -1.3289, -1.0551, -0.7809,
          0.6957, -0.3872],
        [-0.5064,  0.0152,  0.2171,  1.3709, -1.0681,  2.1291,  0.9408, -0.1876,
         -1.2418, -1.9735],
        [-1.7130,  1.0816,  2.0229,  1.2472,  1.2254, -0.2021, -1.0276, -1.3359,
         -0.9827,  1.5086],
        [ 0.6056,  0.3935,  0.9728, -0.1306, -2.2593,  1.7702,  0.8050,  3.4107,
         -1.0124, -1.6242],
        [-0.9068, -0.9731, -0.4704, -0.6918, -0.1818,  0.1708,  0.2723, -1.7565,
          0.2125, -0.2454]], grad_fn=<EmbeddingBackward0>)


In [38]:
embarked_col = X['embarked']

embarked_mapping = {val: idx for idx, val in enumerate(embarked_col.dropna().unique())}
embarked_indices = embarked_col.map(embarked_mapping).fillna(0).astype(int)

embedding = torch.nn.Embedding(len(embarked_mapping), 8)

embarked_tensor = torch.LongTensor(embarked_indices)
embedded_output = embedding(embarked_tensor)

embedded_output

tensor([[ 0.9119, -0.4015, -0.5908,  ...,  0.2616, -0.1952,  2.1184],
        [ 0.9119, -0.4015, -0.5908,  ...,  0.2616, -0.1952,  2.1184],
        [ 0.9119, -0.4015, -0.5908,  ...,  0.2616, -0.1952,  2.1184],
        ...,
        [ 0.7046,  0.0311,  0.2837,  ...,  2.0730, -0.5195,  0.9095],
        [ 0.7046,  0.0311,  0.2837,  ...,  2.0730, -0.5195,  0.9095],
        [ 0.9119, -0.4015, -0.5908,  ...,  0.2616, -0.1952,  2.1184]],
       grad_fn=<EmbeddingBackward0>)

torch.Size([1307, 8])