In [89]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

In [149]:
# Read the two competition splits from the working directory.
train_path, test_path = './train.csv', './test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [150]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [151]:
# Preview the first five rows of the test frame (it has no Transported column).
test.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [152]:
# Summarize column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [153]:
# Number of missing values in each training column.
train.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [154]:
# train['Passenger'] = train['PassengerId'].str.slice(0,-3).astype('object')
# test['Passenger'] = test['PassengerId'].str.slice(0,-3).astype('object')
# train.info()

In [155]:
# Cabin values look like "B/0/P": keep the first character as `deck` and the
# last character as `side`. Rows with a missing Cabin stay missing in both
# derived columns. The same derivation is applied to train and test.
for frame in (train, test):
    cabin = frame['Cabin'].str
    frame['deck'] = cabin[:1]
    frame['side'] = cabin[-1:]
    # frame['num'] = cabin.slice(2, -2)  # middle part of Cabin, currently unused

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  deck          8494 non-null   object 
 15  side          8494 non-null   object 
dtypes: bool(1), float64(6), object(9)
memory usage: 1.0+ MB


In [156]:
# Columns excluded from the model inputs: the passenger identifier, the raw
# Cabin string (already split into deck/side) and the passenger name.
non_feature_columns = ['PassengerId', 'Cabin', 'Name']

train_y = train['Transported']
train_x = train.drop(columns=['Transported'] + non_feature_columns)
test_x = test.drop(columns=non_feature_columns)

# Group the remaining feature names by dtype so that each group can be sent
# through its own preprocessing pipeline.
numeric_features = train_x.select_dtypes(exclude=['object']).columns
object_features = train_x.select_dtypes(include=['object']).columns

for feature_group in (numeric_features, object_features):
    print(feature_group)

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')
Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'side'], dtype='object')


In [157]:
# Numeric columns: impute missing values with SimpleImputer's default
# strategy, then standardize each column.
numeric_steps = [
    ('impute', SimpleImputer()),
    ('scaler', StandardScaler()),
]
numeric_pipe = Pipeline(steps=numeric_steps)

In [158]:
# Object-dtype (categorical) columns: fill missing values with the most
# frequent value of the column, then one-hot encode.
# handle_unknown='ignore' makes a category that was not present during fit
# encode as an all-zero row instead of raising an error at predict time, so
# scoring new data cannot fail on an unexpected label.
object_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [159]:
# Send each column group through its matching preprocessing pipeline and
# concatenate the results into one feature matrix.
column_routes = [
    ('numeric_preprocessing', numeric_pipe, numeric_features),
    ('object_preprocessing', object_pipe, object_features),
]
transformer = ColumnTransformer(transformers=column_routes)

In [160]:
# End-to-end estimator: column preprocessing followed by an XGBoost classifier.
# Step names are the prefixes used for nested parameters and for
# `named_steps` lookups (e.g. `model__max_depth`, `ml_pipe.named_steps['model']`),
# so the classifier step is spelled 'model'.
ml_pipe = Pipeline([
    ('all_preprocessing', transformer),
    ('model', XGBClassifier()),
])

In [161]:
# Fit the preprocessing steps and the classifier on the full training set.
# The fitted pipeline is returned, so its repr is shown as the cell output.
ml_pipe.fit(train_x, train_y)

Pipeline(steps=[('all_preprocessing',
                 ColumnTransformer(transformers=[('numeric_preprocessing',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')),
                                                 ('object_preprocessing',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('enc...
                               feature_types=None, gamma=None, gpu_id=None,
     

In [162]:
# Predicted class labels for the training rows (quick check of the fitted pipeline).
ml_pipe.predict(train_x)

array([1, 0, 0, ..., 1, 0, 1])

In [163]:
# NOTE(review): the model is scored on the same rows it was fitted on, so this
# is an optimistic training accuracy, not an estimate of held-out performance.
train_pred = ml_pipe.predict(train_x)
accuracy_score(y_true=train_y, y_pred=train_pred)

0.8851949844702635

In [174]:
# Keep the raw 0/1 predictions in `pred`, then expose them as booleans under
# the target column name.
test_pred = ml_pipe.predict(test_x)
test['pred'] = test_pred
test['Transported'] = test['pred'].eq(1)

In [175]:
# Write the identifier and the boolean prediction, without the row index.
submission_columns = ['PassengerId', 'Transported']
test[submission_columns].to_csv('submission.csv', index=False)