# Imports

In [1]:
import pandas as pd
import numpy as np

# Load Data (Categorical)

In [2]:
# Toy t-shirt inventory, written one product per row. 'ShirtSize' is an ordinal
# category (S < M < L) and 'color' a nominal one; both are encoded further below.
df = pd.DataFrame(
    [
        (2343, 'S', 'Red', 15.99, 120),
        (2344, 'M', 'Blue', 29.99, 50),
        (2345, 'L', 'Green', 49.99, 30),
    ],
    columns=['id', 'ShirtSize', 'color', 'price', 'stock'],
)

df.head()

Unnamed: 0,id,ShirtSize,color,price,stock
0,2343,S,Red,15.99,120
1,2344,M,Blue,29.99,50
2,2345,L,Green,49.99,30


# Preprocess Data

### Encode Ordinal Data

In [None]:
# Ordinal encoding: shirt sizes have a natural order (S < M < L), so each label
# is replaced by its integer rank.
size_rank = {'S': 0, 'M': 1, 'L': 2}

# Series.map with a dict turns every value missing from the dict into NaN without
# warning, so unexpected labels are rejected up front. Values that are already
# ranks are tolerated, which keeps the cell safe to re-run.
unexpected = set(df['ShirtSize'].dropna()) - set(size_rank) - set(size_rank.values())
if unexpected:
    raise ValueError(f"Unmapped ShirtSize values: {sorted(unexpected, key=str)}")

# Labels become ranks; existing ranks (and NaN) pass through unchanged.
df['ShirtSize'] = df['ShirtSize'].map(lambda size: size_rank.get(size, size))
df.head()

Unnamed: 0,id,ShirtSize,color,price,stock
0,2343,0,Red,15.99,120
1,2344,1,Blue,29.99,50
2,2345,2,Green,49.99,30


### Encode Nominal Data

In [None]:
# One-hot encoding: 'color' has no inherent order, so it is swapped for one 0/1
# indicator column per distinct value (named 'color_<value>'), appended after
# the remaining columns.
df = pd.concat(
    [
        df.drop(columns=['color']),
        pd.get_dummies(df['color'], prefix='color', dtype=int),
    ],
    axis=1,
)
df.head()

Unnamed: 0,id,ShirtSize,price,stock,color_Blue,color_Green,color_Red
0,2343,0,15.99,120,0,0,1
1,2344,1,29.99,50,1,0,0
2,2345,2,49.99,30,0,1,0


# Load Data (with NaN values)

In [24]:
# Same toy inventory, one product per row, now with gaps: the third product has
# no color and the second has no stock count (which makes 'stock' a float column).
df = pd.DataFrame(
    [
        (2343, 'S', 'Red', 15.99, 120),
        (2344, 'M', 'Blue', 29.99, np.nan),
        (2345, 'L', np.nan, 49.99, 30),
        (2346, 'M', 'Red', 29.99, 50),
    ],
    columns=['id', 'ShirtSize', 'color', 'price', 'stock'],
)
df.head()

Unnamed: 0,id,ShirtSize,color,price,stock
0,2343,S,Red,15.99,120.0
1,2344,M,Blue,29.99,
2,2345,L,,49.99,30.0
3,2346,M,Red,29.99,50.0


### Fill NaN Values (Numerical)

In [25]:
# Mean imputation: only rows whose stock is missing are overwritten, using the
# column average (Series.mean() skips NaN by default, so it is computed from the
# observed counts only).
df.loc[df['stock'].isna(), 'stock'] = df['stock'].mean()
df.head()

Unnamed: 0,id,ShirtSize,color,price,stock
0,2343,S,Red,15.99,120.0
1,2344,M,Blue,29.99,66.666667
2,2345,L,,49.99,30.0
3,2346,M,Red,29.99,50.0


### Fill NaN Values (Categorical)

In [26]:
# Mode imputation: replace missing colors with the most frequent observed color.
# Series.mode() ignores NaN by default and returns its result sorted, so a tie
# resolves to the first label in sort order.
color_modes = df['color'].mode()

# With no non-null colors mode() is empty and indexing it fails with a bare
# KeyError; raise a readable error instead.
if color_modes.empty:
    raise ValueError("Cannot impute 'color': the column has no non-null values")

df['color'] = df['color'].fillna(color_modes.iloc[0])
df.head()

Unnamed: 0,id,ShirtSize,color,price,stock
0,2343,S,Red,15.99,120.0
1,2344,M,Blue,29.99,66.666667
2,2345,L,Red,49.99,30.0
3,2346,M,Red,29.99,50.0


# This looks simple and easy - why might this be a suboptimal encoding approach in some scenarios?