A bit of a play around to see how sklearn's ordinal encoder works

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [391]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import missingno as msno

In [3]:
train = pd.read_csv('../raw_data/train.csv')
test = pd.read_csv('../raw_data/test.csv')

train.shape, test.shape

((8693, 14), (4277, 13))

In [4]:
train.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


In [5]:
X_train = train.drop(columns='Transported')
X_train.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent


In [6]:
X_train[['Cabin_Deck', 'Cabin_Level', 'Cabin_Side']] = X_train['Cabin'].str.split('/', expand=True)
for col in ['Cabin_Deck', 'Cabin_Level', 'Cabin_Side']:
    print(f'feature: {col} \t dtype: {X_train[col].dtype} \t unique values: {len(pd.unique(X_train[col]))}')

feature: Cabin_Deck 	 dtype: object 	 unique values: 9
feature: Cabin_Level 	 dtype: object 	 unique values: 1818
feature: Cabin_Side 	 dtype: object 	 unique values: 3


In [7]:
X_train['Cabin_Level'] = X_train['Cabin_Level'].astype(float)

In [28]:
for col in ['Cabin_Deck', 'Cabin_Level', 'Cabin_Side']:
    print(f'feature: {col: <20}dtype: {str(X_train[col].dtype): <10} unique values: {len(pd.unique(X_train[col])): >5}')

feature: Cabin_Deck          dtype: object     unique values:     9
feature: Cabin_Level         dtype: float64    unique values:  1818
feature: Cabin_Side          dtype: object     unique values:     3


In [9]:
X_train['Cabin_Deck'].value_counts(), X_train['Cabin_Side'].value_counts()

(F    2794
 G    2559
 E     876
 B     779
 C     747
 D     478
 A     256
 T       5
 Name: Cabin_Deck, dtype: int64,
 S    4288
 P    4206
 Name: Cabin_Side, dtype: int64)

In [10]:
feat_ord = ['HomePlanet', 'CryoSleep', 'VIP', 'Cabin_Deck', 'Cabin_Side']

ord_enc = OrdinalEncoder()


ord_preproc = ColumnTransformer([
    ('ord_tr', ord_enc, feat_ord)],
    remainder='passthrough'
)

In [11]:
X_train_trans = ord_preproc.fit_transform(X_train)
X_train_ord = pd.DataFrame(X_train_trans, columns=ord_preproc.get_feature_names_out())
X_train_ord

Unnamed: 0,ord_tr__HomePlanet,ord_tr__CryoSleep,ord_tr__VIP,ord_tr__Cabin_Deck,ord_tr__Cabin_Side,remainder__PassengerId,remainder__Cabin,remainder__Destination,remainder__Age,remainder__RoomService,remainder__FoodCourt,remainder__ShoppingMall,remainder__Spa,remainder__VRDeck,remainder__Name,remainder__Cabin_Level
0,1.0,0.0,0.0,1.0,0.0,0001_01,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0.0
1,0.0,0.0,0.0,5.0,1.0,0002_01,F/0/S,TRAPPIST-1e,24.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,0.0
2,1.0,0.0,1.0,0.0,1.0,0003_01,A/0/S,TRAPPIST-1e,58.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0.0
3,1.0,0.0,0.0,0.0,1.0,0003_02,A/0/S,TRAPPIST-1e,33.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0.0
4,0.0,0.0,0.0,5.0,1.0,0004_01,F/1/S,TRAPPIST-1e,16.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1.0,0.0,1.0,0.0,0.0,9276_01,A/98/P,55 Cancri e,41.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,98.0
8689,0.0,1.0,0.0,6.0,1.0,9278_01,G/1499/S,PSO J318.5-22,18.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,1499.0
8690,0.0,0.0,0.0,6.0,1.0,9279_01,G/1500/S,TRAPPIST-1e,26.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1500.0
8691,1.0,0.0,0.0,4.0,1.0,9280_01,E/608/S,55 Cancri e,32.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,608.0


In [12]:
def missing_values(frame, colskip=[]):
    print('Missing Values:')
    for col in frame.columns:
        if col in colskip:
            continue
        else:
            print(f"{col: <30}{frame[col].isna().sum(): >5} values{round(frame[col].isnull().sum() / len(frame) * 100, 2): >6} %")

In [13]:
missing_values(X_train_ord)

Missing Values:
ord_tr__HomePlanet              201 values  2.31 %
ord_tr__CryoSleep               217 values   2.5 %
ord_tr__VIP                     203 values  2.34 %
ord_tr__Cabin_Deck              199 values  2.29 %
ord_tr__Cabin_Side              199 values  2.29 %
remainder__PassengerId            0 values   0.0 %
remainder__Cabin                199 values  2.29 %
remainder__Destination          182 values  2.09 %
remainder__Age                  179 values  2.06 %
remainder__RoomService          181 values  2.08 %
remainder__FoodCourt            183 values  2.11 %
remainder__ShoppingMall         208 values  2.39 %
remainder__Spa                  183 values  2.11 %
remainder__VRDeck               188 values  2.16 %
remainder__Name                 200 values   2.3 %
remainder__Cabin_Level          199 values  2.29 %


In [147]:
X_train_ord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ord_tr__HomePlanet       8492 non-null   object
 1   ord_tr__CryoSleep        8476 non-null   object
 2   ord_tr__VIP              8490 non-null   object
 3   ord_tr__Cabin_Deck       8494 non-null   object
 4   ord_tr__Cabin_Side       8494 non-null   object
 5   remainder__PassengerId   8693 non-null   object
 6   remainder__Cabin         8494 non-null   object
 7   remainder__Destination   8511 non-null   object
 8   remainder__Age           8514 non-null   object
 9   remainder__RoomService   8512 non-null   object
 10  remainder__FoodCourt     8510 non-null   object
 11  remainder__ShoppingMall  8485 non-null   object
 12  remainder__Spa           8510 non-null   object
 13  remainder__VRDeck        8505 non-null   object
 14  remainder__Name          8493 non-null  

In [14]:
X_train_ord['ord_tr__Cabin_Deck'].value_counts()

5.0    2794
6.0    2559
4.0     876
1.0     779
2.0     747
3.0     478
0.0     256
7.0       5
Name: ord_tr__Cabin_Deck, dtype: int64

In [15]:
X_train_ord['ord_tr__Cabin_Side'].value_counts()

1.0    4288
0.0    4206
Name: ord_tr__Cabin_Side, dtype: int64

In [63]:
str(X_train['HomePlanet'].dtypes)

'object'

In [57]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Cabin_Deck    8494 non-null   object 
 14  Cabin_Level   8494 non-null   float64
 15  Cabin_Side    8494 non-null   object 
dtypes: float64(7), object(9)
memory usage: 1.1+ MB


In [56]:
X_train.index

RangeIndex(start=0, stop=8693, step=1)

In [329]:
def infoplus(frame):
    infod = {'col_ind': [], 'col_name': [], 'non_null': [], 'null': [],
             'null_per': [], 'unique': [],'dtype': []}
    max_len = []

    for i, col in enumerate(frame.columns):
        infod['col_ind'].append(i)
        infod['col_name'].append(col)
        infod['non_null'].append(frame[col].value_counts().sum())
        infod['null'].append(frame[col].isna().sum())
        infod['null_per'].append(round(infod['null'][-1] / len(X_train), 5))
        infod['unique'].append(len(frame[col].unique()))
        infod['dtype'].append(str(frame[col].dtype))

    for key in infod.keys():
        max_len.append(len(max(map(str, infod[key]), key=len)))
    for i, key in enumerate(infod.keys()):
        max_len[i] = max(len(key), max_len[i])

    OFFSET = 1
    print(type(frame))
    print(f'Range Index: {len(frame)} entries, {frame.index[0]} to {frame.index[-1]}')
    print(f'Shape: {frame.shape}')
    print(f'''\
{'#': ^{max_len[0] + OFFSET}}\
{'Column': <{max_len[1] + OFFSET}}\
{'Non-Null': <{max_len[2] + OFFSET}}\
{'Null': >{max_len[3] + OFFSET}}\
{'% Null': >{max_len[4] + OFFSET}}\
{'Unique': >{max_len[5] + OFFSET}}  \
{'Dtype': <{max_len[6] + OFFSET}}\
''')
    print(f'''\
{'-' * (max(3, max_len[0] - 2)): ^{max_len[0] + OFFSET}}\
{'-' * max_len[1]: <{max_len[1] + OFFSET}}\
{'-' * max_len[2]: <{max_len[2] + OFFSET}}\
{'-' * max_len[3]: >{max_len[3] + OFFSET}}\
{'-' * max_len[4]: >{max_len[4] + OFFSET}}\
{'-' * max_len[5]: >{max_len[5] + OFFSET}}  \
{'-' * max_len[6]: <{max_len[6] + OFFSET}}\
''')
    for i in infod['col_ind']:
        print(f'''{str(infod['col_ind'][i]): ^{max(3, max_len[0] + OFFSET)}}\
{infod['col_name'][i]: <{max_len[1] + OFFSET}}\
{infod['non_null'][i]: <{max_len[2] + OFFSET}}\
{infod['null'][i]: >{max_len[3] + OFFSET}}\
{round(infod['null_per'][i] * 100, 2): >{max_len[4] + OFFSET}}\
{infod['unique'][i]: >{max_len[5] + OFFSET}}  \
{infod['dtype'][i]: <{max_len[6] + OFFSET}}\
              ''')
    
    dtypes = [dtype for dtype in set(infod['dtype'])]
    for dtype in dtypes:
        dtype_count = [f"{dtype}({infod['dtype'].count(dtype)})" for dtype in dtypes]
    print(f"dtypes: {', '.join(dtype_count)}")

In [330]:
infoplus(X_train)

<class 'pandas.core.frame.DataFrame'>
Range Index: 8693 entries, 0 to 8692
Shape: (8693, 16)
   #    Column       Non-Null  Null   % Null Unique  Dtype   
 -----  ------------ --------  ---- -------- ------  ------- 
   0    PassengerId  8693         0      0.0   8693  object                
   1    HomePlanet   8492       201     2.31      4  object                
   2    CryoSleep    8476       217      2.5      3  object                
   3    Cabin        8494       199     2.29   6561  object                
   4    Destination  8511       182     2.09      4  object                
   5    Age          8514       179     2.06     81  float64               
   6    VIP          8490       203     2.34      3  object                
   7    RoomService  8512       181     2.08   1274  float64               
   8    FoodCourt    8510       183      2.1   1508  float64               
   9    ShoppingMall 8485       208     2.39   1116  float64               
   10   Spa          85

In [235]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Cabin_Deck    8494 non-null   object 
 14  Cabin_Level   8494 non-null   float64
 15  Cabin_Side    8494 non-null   object 
dtypes: float64(7), object(9)
memory usage: 1.1+ MB


In [334]:
X_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Cabin_Deck,Cabin_Level,Cabin_Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,B,0.0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,F,0.0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,A,0.0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,A,0.0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,F,1.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,A,98.0,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,G,1499.0,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,G,1500.0,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,E,608.0,S


In [337]:
for col in X_train.columns:
    print(len(X_train[col].unique()))

8693
4
3
6561
4
81
3
1274
1508
1116
1328
1307
8474
9
1818
3


In [389]:
def byte_conversion(framemem):
    ending_dict = {1: 'B', 2: 'KB', 3: 'MB', 4: 'GB'}
    power = 1
    while framemem / (1024 ** power) > 1:
        power += 1
        if power == 4:
            break
    return f'{round(framemem / (1024 ** (power - 1)), 2)} {ending_dict[power]}' 

In [390]:
byte_conversion(904200_000_000)

'842.1 GB'