# Ordinal number encoding

In [1]:
import datetime

In [2]:
today_date = datetime.datetime.today()

In [3]:
today_date

datetime.datetime(2021, 8, 11, 12, 33, 24, 786842)

In [4]:
today_date-datetime.timedelta(3)

datetime.datetime(2021, 8, 8, 12, 33, 24, 786842)

In [5]:
# List comprehension
days=[today_date-datetime.timedelta(x) for x in range(0,15)]

In [6]:
import pandas as pd
data=pd.DataFrame(days)
data.columns=['Day']

In [11]:
data.head()

Unnamed: 0,Day
0,2021-08-11 12:33:24.786842
1,2021-08-10 12:33:24.786842
2,2021-08-09 12:33:24.786842
3,2021-08-08 12:33:24.786842
4,2021-08-07 12:33:24.786842


In [13]:
data['weekday'] = data['Day'].dt.day_name()
data.head()

Unnamed: 0,Day,weekday
0,2021-08-11 12:33:24.786842,Wednesday
1,2021-08-10 12:33:24.786842,Tuesday
2,2021-08-09 12:33:24.786842,Monday
3,2021-08-08 12:33:24.786842,Sunday
4,2021-08-07 12:33:24.786842,Saturday


In [14]:
dictionary = {'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}

In [15]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [18]:
data['weekday_ordinal']=data['weekday'].map(dictionary)

In [20]:
data

Unnamed: 0,Day,weekday,weekday_ordinal
0,2021-08-11 12:33:24.786842,Wednesday,3
1,2021-08-10 12:33:24.786842,Tuesday,2
2,2021-08-09 12:33:24.786842,Monday,1
3,2021-08-08 12:33:24.786842,Sunday,7
4,2021-08-07 12:33:24.786842,Saturday,6
5,2021-08-06 12:33:24.786842,Friday,5
6,2021-08-05 12:33:24.786842,Thursday,4
7,2021-08-04 12:33:24.786842,Wednesday,3
8,2021-08-03 12:33:24.786842,Tuesday,2
9,2021-08-02 12:33:24.786842,Monday,1


# Target Guided Ordinal Encoding

In [1]:
import pandas as pd
df = pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])

In [3]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [10]:
df['Cabin'].fillna('Missing',inplace=True)

In [11]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,m
1,1,C
2,1,m
3,1,C
4,0,m


In [12]:
df['Cabin']=df['Cabin'].astype(str).str[0]

In [13]:
df

Unnamed: 0,Survived,Cabin
0,0,m
1,1,C
2,1,m
3,1,C
4,0,m
...,...,...
886,0,m
887,1,B
888,0,m
889,1,C


In [14]:
df.Cabin.unique()

array(['m', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [17]:
df.groupby(['Cabin'])['Survived'].mean().sort_values().index

Index(['T', 'm', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [18]:
ordinal_labels=df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'm', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [19]:
enumerate(ordinal_labels,0)

<enumerate at 0x7ff0350704c0>

In [20]:
ordinal_labels1={k:i for i,k in enumerate(ordinal_labels,0)}

In [21]:
ordinal_labels1

{'T': 0, 'm': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [22]:
df['Cabin_ordinal_labels']=df['Cabin'].map(ordinal_labels1)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,m,1
1,1,C,4
2,1,m,1
3,1,C,4
4,0,m,1


# Mean Encoding

In [24]:
mean_ordinal=df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [25]:
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'T': 0.0,
 'm': 0.29985443959243085}

In [26]:
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,m,1,0.299854
1,1,C,4,0.59322
2,1,m,1,0.299854
3,1,C,4,0.59322
4,0,m,1,0.299854


# Probability Ratio Encoding

In [27]:
import pandas as pd

In [28]:
df = pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [29]:
df['Cabin'].fillna('Missing',inplace=True)
df.head(20)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [31]:
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [32]:
df['Cabin']=df['Cabin'].astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [34]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [35]:
prob_df=df.groupby(['Cabin'])['Survived'].mean()

In [36]:
prob_df

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [37]:
prob_df=pd.DataFrame(prob_df)

In [38]:
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [39]:
 prob_df["Died"]=1-prob_df['Survived']

In [41]:
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [42]:
prob_df['probability_ratio']=prob_df['Survived']/prob_df['Died']
prob_df.head()

Unnamed: 0_level_0,Survived,Died,probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [43]:
probability_encoded=prob_df['probability_ratio'].to_dict()

In [44]:
probability_encoded

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [45]:
df['Cabin_encoded']=df['Cabin'].map(probability_encoded)
df.head(20)

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
5,0,M,0.428274
6,0,E,3.0
7,0,M,0.428274
8,1,M,0.428274
9,1,M,0.428274
