In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import missingno as msno

In [3]:
# Load the train and test CSVs; the paths are relative to the notebook's
# working directory.
train = pd.read_csv('../raw_data/train.csv')
test = pd.read_csv('../raw_data/test.csv')

# Show both shapes as a quick sanity check on what was loaded.
train.shape, test.shape

((8693, 14), (4277, 13))

In [8]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


In [9]:
# Build the feature frame by dropping the `Transported` column
# (presumably the prediction target — test.csv has one column fewer; confirm).
X_train = train.drop(columns='Transported')
# Preview the first three rows of the resulting frame.
X_train.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent


In [22]:
# Columns that are handed to the ordinal encoder.
feat_ord = ['HomePlanet', 'CryoSleep', 'VIP']

ord_enc = OrdinalEncoder()

# Encode only the `feat_ord` columns; every other column is passed through
# unchanged and appended after the encoded ones.
ord_preproc = ColumnTransformer(
    transformers=[('ord_tr', ord_enc, feat_ord)],
    remainder='passthrough',
)

In [23]:
# Fit the column transformer on the training features and transform them.
X_train_trans = ord_preproc.fit_transform(X_train)

# The passthrough columns mix strings and numbers, so the transformed result
# is a single object-typed array. Rebuild a labelled frame from it, keep the
# original row index so rows stay aligned with `X_train`, and let pandas
# re-infer proper dtypes (otherwise every column, including the numeric
# ones, would stay `object`).
X_train_ord = pd.DataFrame(
    X_train_trans,
    columns=ord_preproc.get_feature_names_out(),
    index=X_train.index,
).infer_objects()
X_train_ord

Unnamed: 0,ord_tr__HomePlanet,ord_tr__CryoSleep,ord_tr__VIP,remainder__PassengerId,remainder__Cabin,remainder__Destination,remainder__Age,remainder__RoomService,remainder__FoodCourt,remainder__ShoppingMall,remainder__Spa,remainder__VRDeck,remainder__Name
0,1.0,0.0,0.0,0001_01,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0.0,0.0,0.0,0002_01,F/0/S,TRAPPIST-1e,24.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,1.0,0.0,1.0,0003_01,A/0/S,TRAPPIST-1e,58.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,1.0,0.0,0.0,0003_02,A/0/S,TRAPPIST-1e,33.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0.0,0.0,0.0,0004_01,F/1/S,TRAPPIST-1e,16.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1.0,0.0,1.0,9276_01,A/98/P,55 Cancri e,41.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther
8689,0.0,1.0,0.0,9278_01,G/1499/S,PSO J318.5-22,18.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley
8690,0.0,0.0,0.0,9279_01,G/1500/S,TRAPPIST-1e,26.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon
8691,1.0,0.0,0.0,9280_01,E/608/S,55 Cancri e,32.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre


In [24]:
def missing_values(frame, colskip=None):
    """Print the number and percentage of missing values for each column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are inspected with ``isna()``.
    colskip : container of column labels, optional
        Columns to leave out of the report. By default nothing is skipped.

    Returns
    -------
    None
        The report is written to stdout: a header line followed by one line
        per reported column.
    """
    # Use None rather than a mutable ``[]`` default; callers see no difference.
    if colskip is None:
        colskip = ()

    reported = [col for col in frame.columns if col not in colskip]
    # Keep the historical 20-character name column, but widen it when a label
    # is longer so the count and percentage columns stay aligned.
    name_width = max([20] + [len(str(col)) for col in reported])
    n_rows = len(frame)

    print('Missing Values:')
    for col in reported:
        n_missing = frame[col].isna().sum()
        # Avoid a 0/0 division when the frame has no rows.
        pct = round(n_missing / n_rows * 100, 2) if n_rows else 0.0
        print(f"{str(col): <{name_width}}{n_missing: >5} values{pct: >6} %")

In [25]:
# Print the per-column missing-value report for the encoded training frame.
missing_values(X_train_ord)

Missing Values:
ord_tr__HomePlanet    201 values  2.31 %
ord_tr__CryoSleep     217 values   2.5 %
ord_tr__VIP           203 values  2.34 %
remainder__PassengerId    0 values   0.0 %
remainder__Cabin      199 values  2.29 %
remainder__Destination  182 values  2.09 %
remainder__Age        179 values  2.06 %
remainder__RoomService  181 values  2.08 %
remainder__FoodCourt  183 values  2.11 %
remainder__ShoppingMall  208 values  2.39 %
remainder__Spa        183 values  2.11 %
remainder__VRDeck     188 values  2.16 %
remainder__Name       200 values   2.3 %


In [28]:
# Count rows per encoded HomePlanet value; value_counts() leaves NaN out by default.
X_train_ord['ord_tr__HomePlanet'].value_counts()

0.0    4602
1.0    2131
2.0    1759
Name: ord_tr__HomePlanet, dtype: int64