In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression 

In [2]:
train = pd.read_csv('/home/necati/machine_learning/Kaggle/Kaggle_datasets/trainSpaceship.csv')
test = pd.read_csv('/home/necati/machine_learning/Kaggle/Kaggle_datasets/testSpaceship.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
train.drop(columns=['PassengerId', 'Name', 'Cabin'], inplace=True)
Id_column = test['PassengerId']
test.drop(columns=['PassengerId', 'Name', 'Cabin'], inplace=True)

In [5]:
print(train.isnull().sum())
print('\nThe shape of the training data is', train.shape)
print(test.isnull().sum())
print(test.shape)

HomePlanet      201
CryoSleep       217
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

The shape of the training data is (8693, 11)
HomePlanet       87
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64
(4277, 10)


In [6]:
train.fillna(train.median(numeric_only=True), inplace=True)
for col in train.select_dtypes(include='object').columns:
    train.fillna({col:train[col].mode()[0]}, inplace=True)
test.fillna(test.median(numeric_only=True), inplace=True)
for col in test.select_dtypes(include=['object']).columns:
    test.fillna({col: test[col].mode()[0]}, inplace=True)

  train.fillna({col:train[col].mode()[0]}, inplace=True)
  test.fillna({col: test[col].mode()[0]}, inplace=True)


In [7]:
scaler = StandardScaler()
numeric_columns = train.select_dtypes(include=['number']).columns
train[numeric_columns] = scaler.fit_transform(train[numeric_columns])
test[numeric_columns] = scaler.transform(test[numeric_columns])
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,TRAPPIST-1e,0.711945,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,False
1,Earth,False,TRAPPIST-1e,-0.334037,False,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,True
2,Europa,False,TRAPPIST-1e,2.036857,True,-0.268001,1.959998,-0.283579,5.695623,-0.219796,False
3,Europa,False,TRAPPIST-1e,0.293552,False,-0.333105,0.52301,0.336851,2.687176,-0.092818,False
4,Earth,False,TRAPPIST-1e,-0.891895,False,0.125652,-0.237159,-0.031059,0.231374,-0.26124,True


In [8]:
train_wo_target = train.drop(columns=['Transported'])
train_wo_target = pd.get_dummies(train_wo_target, columns=train_wo_target.select_dtypes(include=['object']).columns)
bool_columns = train_wo_target.select_dtypes(include=[bool]).columns
train_wo_target[bool_columns] = train_wo_target[bool_columns].astype(int)
train_wo_target.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0,0.711945,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0,1,0,0,0,1
1,0,-0.334037,0,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,1,0,0,0,0,1
2,0,2.036857,1,-0.268001,1.959998,-0.283579,5.695623,-0.219796,0,1,0,0,0,1
3,0,0.293552,0,-0.333105,0.52301,0.336851,2.687176,-0.092818,0,1,0,0,0,1
4,0,-0.891895,0,0.125652,-0.237159,-0.031059,0.231374,-0.26124,1,0,0,0,0,1


In [9]:
train = pd.concat([train_wo_target, train[train.columns[-1]]], axis=1)
train['Transported'] = train['Transported'].astype(int)
train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Transported
0,0,0.711945,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0,1,0,0,0,1,0
1,0,-0.334037,0,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,1,0,0,0,0,1,1
2,0,2.036857,1,-0.268001,1.959998,-0.283579,5.695623,-0.219796,0,1,0,0,0,1,0
3,0,0.293552,0,-0.333105,0.52301,0.336851,2.687176,-0.092818,0,1,0,0,0,1,0
4,0,-0.891895,0,0.125652,-0.237159,-0.031059,0.231374,-0.26124,1,0,0,0,0,1,1


In [10]:
train_set, val_set = train_test_split(train, test_size=0.2, random_state=13)

X_train = train_set[train_set.columns[:-1]]
y_train = train_set[train_set.columns[-1]]

X_val = val_set[val_set.columns[:-1]]
y_val = val_set[val_set.columns[-1]]


In [16]:
test = pd.get_dummies(test, columns= test.select_dtypes(include='object').columns)
bool_columns_test = test.select_dtypes(include=[bool]).columns
test[bool_columns_test] = test[bool_columns_test].astype(int)
test.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,1,-0.124841,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,1,0,0,0,0,1
1,0,-0.682698,0,-0.333105,-0.275387,-0.283579,2.237598,-0.263003,1,0,0,0,0,1
2,1,0.154088,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0,1,0,1,0,0
3,0,0.642213,0,-0.333105,3.88768,-0.283579,-0.109808,0.252842,0,1,0,0,0,1
4,0,-0.612966,0,-0.317964,-0.281027,0.778343,-0.270626,-0.263003,1,0,0,0,0,1


## Random Forest Classifier

In [205]:
model1 = RandomForestClassifier(random_state=13)
model1.fit(X_train, y_train)

In [206]:
train_pred1 = model1.predict(X_train)
val_pred1 = model1.predict(X_val)

In [207]:
print(accuracy_score(y_train, train_pred1))
print(accuracy_score(y_val, val_pred1))

0.9314063848144952
0.7855089131684876


In [210]:
test_pred1 = model1.predict(test)

In [213]:
submission1 = pd.DataFrame({'PassengerId': Id_column,
                            'Transported': test_pred1.astype(bool)})
submission1.to_csv('Spaceship_submission1.csv', index=False)

## Hyper-Parameter Tuning

In [24]:
param_grid = {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 5, 10, 15],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=13), param_grid=param_grid, cv=5, n_jobs=-2, verbose=2)
grid_search.fit(train_wo_target, train['Transported'])

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END max_de

In [27]:
model3 = RandomForestClassifier(n_estimators=50, max_depth=10, min_samples_split=10, random_state=13)
model3.fit(X_train, y_train)

In [None]:
train_pred3 = model3.predict(X_train)
val_pred3 = model3.predict(X_val)
print(accuracy_score(train_pred3,y_train), accuracy_score(val_pred3, y_val))

0.8359217716422203 0.7987349051178838


In [29]:
test_pred3 = model3.predict(test).astype(bool)

In [30]:
submission3 = pd.DataFrame({'PassengerId': Id_column,
                            'Transported': test_pred3})
submission3.to_csv('Spaceship_submission3.csv', index=False)

## Logistic Regression

In [11]:
model2 = LogisticRegression(random_state=13)
model2.fit(X_train, y_train)

In [12]:
train_pred2 = model2.predict(X_train)
val_pred2 = model2.predict(X_val)

In [17]:
test_pred2 = model2.predict(test)

In [19]:
submission2 = pd.DataFrame({'PassengerId': Id_column,
                            'Transported': test_pred2.astype(bool)})
submission2.to_csv('Spaceship_submission2.csv', index=False)