Import Libraries

In [152]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score

Load Data

In [153]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [154]:
train.shape

(8693, 14)

In [155]:
test.shape

(4277, 13)

In [156]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [157]:
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [158]:
test.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

As you see, in test dataframe 'transported' feature does not made yet.

Data Preprocessing

In [159]:
imputer = SimpleImputer(strategy='median')
train['Age'] = imputer.fit_transform(train[['Age']])
test['Age'] = imputer.transform(test[['Age']])

Fill missing values

In [160]:
train['HomePlanet'].fillna('Unknown', inplace=True)
test['HomePlanet'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['HomePlanet'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['HomePlanet'].fillna('Unknown', inplace=True)


One-Hot Encoding for Categorical geatures

In [161]:
train = pd.get_dummies(train, columns=['HomePlanet','Destination'], drop_first=True)
test = pd.get_dummies(test, columns=['HomePlanet','Destination'], drop_first=True)

Feature Engineering

In [162]:
train['Total_Billed'] = train[['RoomService',
                               'FoodCourt',
                               'ShoppingMall',
                               'Spa',
                               'VRDeck']].sum(axis=1)
test['Total_Billed'] = test[['RoomService',
                               'FoodCourt',
                               'ShoppingMall',
                               'Spa',
                               'VRDeck']].sum(axis=1)

Features and Target (We focuse to make model from the 'train' dataframe)

In [163]:
X = train.drop(['PassengerId','Name','Cabin','Transported'], axis=1)
y = train['Transported'].astype(int)

In [164]:
X.isna().sum()

CryoSleep                    217
Age                            0
VIP                          203
RoomService                  181
FoodCourt                    183
ShoppingMall                 208
Spa                          183
VRDeck                       188
HomePlanet_Europa              0
HomePlanet_Mars                0
HomePlanet_Unknown             0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
Total_Billed                   0
dtype: int64

In [165]:
X.fillna(0,inplace=True)

Split Data

In [166]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, 
                                                  random_state=42)

Model Training

In [167]:
model = RandomForestClassifier(n_estimators=100, 
                               random_state=42)
model.fit(X_train, y_train)

Making Prediction and Validate it

In [168]:
y_pred = model.predict(X_val)

accuracy_score(y_pred, y_val)

0.7787576687116564

Okay, the model is ready now.

We now check the 'test' dataframe

In [169]:
X_test = test.drop(['PassengerId','Name','Cabin'], axis=1)

X_test = X_test.reindex(columns=X.columns,
                        fill_value=0)

X_test.fillna(0, inplace=True)

As you see, we 'reindex' the X_test to suitable with 'X', which is the original model prediction.

Finally, making our prediction.

In [170]:
test_pred = model.predict(X_test)

Then, submit it into submission.

But, first we look into the submission template answer.

In [171]:
sample_submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


Okay.... let's execute it!!


In [172]:
submission = pd.DataFrame({'PassengerId': test['PassengerId'],
                           'Transported': test_pred})

submission.to_csv('submission.csv', index=False)