### 목표
- 2912년 우주항해를 하던 타이타닉호의 승무원, 승선인원들을 찾아라!
- 운행중에 우주 먼지를 만나서 시스템 오류로 강제 전송됨
- 누가 전송되었는지 찾아서 복귀를 시켜야 합니다.

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor

In [29]:
train = pd.read_csv("./data/spaceship-titanic/train.csv")
test = pd.read_csv("./data/spaceship-titanic/test.csv")

#### 모델링 실습
1. 데이터 탐색
    - 전체 데이터 수 확인, 컬럼 수 확인
    - 컬럼 종류 확인(범주형, 수치형)
    - 결측치 처리
    - 인코딩 처리
2. 모델링
     - 모델정의
     - 모델학습
         1. 학습에 사용할 컬럼 선택
         2. 문제와 정답으로 분리
         3. 훈련 데이터 검증데이터로 분리
     - 모델 예측
         - 검증데이터 예측
     - 모델 평가
         - 검증데이터 평가(정확도)
     - 모델 최적화(하이퍼파라미터 튜닝)
3. 테스트 데이터 예측 후 kaggle 사이트에 업로드

In [30]:
space_model = DecisionTreeRegressor()

In [31]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [32]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


#### 1. 데이터 탐색 및 전처리

In [33]:
# 전체 데이터 수 확인, 컬럼 수 확인

display(test.shape) # 훈련용 데이터
display(train.shape) # 평가용 데이터

(4277, 13)

(8693, 14)

In [34]:
# 컬럼 종류 확인= 범주형, 수치형
display(train.info())
display(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


None

In [35]:
# 범주형 - PassengerId, HomePlanet, CryoSleep, Cabin, Destination, VIP , Name 
# 수치형-Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck 

In [36]:
# 결측치 처리
# train 결측치 x 컬럼: PassengerId
# test 결측치 x 컬럼: PassengerId

In [37]:
# 기술통게확인 - 범주형
train[['HomePlanet','CryoSleep', 'Cabin', 'Destination', 'VIP' , 'Name']].describe()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name
count,8492,8476,8494,8511,8490,8493
unique,3,2,6560,3,2,8473
top,Earth,False,G/734/S,TRAPPIST-1e,False,Gollux Reedall
freq,4602,5439,8,5915,8291,2


In [42]:
train['HomePlanet'].fillna('Earth', inplace=True)
train['VIP'].fillna(False,inplace=True)

test['HomePlanet'].fillna('Earth', inplace=True)
test['VIP'].fillna(False,inplace=True)

In [43]:
# 기술통게확인 - 수치형
train[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' ]].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8693.0,8693.0,8510.0,8485.0,8510.0,8505.0
mean,28.790291,224.6733,458.077203,173.729169,311.138778,304.854791
std,14.341404,659.739371,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,78.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [44]:
train['Age'].fillna(27, inplace=True)
train['RoomService'].fillna(224,inplace=True)

test['Age'].fillna(27, inplace=True)
test['RoomService'].fillna(224,inplace=True)

In [46]:
# 평가용 데이터 원핫인코딩
type_onehot = pd.get_dummies(train['VIP'])
type_onehot

# 훈련용 데이터 원핫인코딩
type_onehot_test = pd.get_dummies(test['VIP'])
type_onehot_test

# 원래 데이터에 채워준다. 
train = pd.concat([train,type_onehot], axis=1)
train

test = pd.concat([test, type_onehot_test], axis=1)
test



Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,False,True
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,1,0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,1,0
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,1,0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,1,0
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,1,0
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,1,0
4274,9271_01,Mars,True,D/296/P,55 Cancri e,27.0,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,1,0
4275,9273_01,Europa,False,D/297/P,,27.0,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,1,0


In [47]:
# 평가용 데이터 원핫인코딩
type_onehot1 = pd.get_dummies(train['HomePlanet'])
type_onehot1

# 훈련용 데이터 원핫인코딩
type_onehot_test1 = pd.get_dummies(test['HomePlanet'])
type_onehot_test1

# 원래 데이터에 채워준다. 
train = pd.concat([train,type_onehot1], axis=1)
train

test = pd.concat([test, type_onehot_test1], axis=1)
test


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,False,True,Earth,Europa,Mars
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,1,0,1,0,0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,1,0,1,0,0
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,1,0,0,1,0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,1,0,0,1,0
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,1,0,1,0,0
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,1,0,1,0,0
4274,9271_01,Mars,True,D/296/P,55 Cancri e,27.0,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,1,0,0,0,1
4275,9273_01,Europa,False,D/297/P,,27.0,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,1,0,0,1,0


In [49]:
# 평가용 데이터 원핫인코딩
type_onehot = pd.get_dummies(train['Transported'])
type_onehot

# 원래 데이터에 채워준다. 
train = pd.concat([train,type_onehot], axis=1)
train


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Transported,False,True,Earth,Europa,Mars,False.1,True.1
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0.0,Maham Ofracculy,False,1,0,0,1,0,1,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,44.0,Juanna Vines,True,1,0,1,0,0,0,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,49.0,Altark Susent,False,0,1,0,1,0,1,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,193.0,Solam Susent,False,1,0,0,1,0,1,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,2.0,Willy Santantines,True,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,...,74.0,Gravior Noxnuther,False,0,1,0,1,0,1,0
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,...,0.0,Kurta Mondalley,False,1,0,1,0,0,1,0
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,...,0.0,Fayey Connon,True,1,0,1,0,0,0,1
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,...,3235.0,Celeon Hontichre,False,1,0,0,1,0,1,0


In [50]:
# 원하는 컬럼 선택
X_train= train[['Age','RoomService','False','True','Earth','Europa','Mars']]
y_train= train['False','True']

KeyError: "['False', 'True'] not in index"