In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, matthews_corrcoef

# Model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

# Load the training data; "train.csv" is resolved relative to the notebook's working directory.
train_frame = pd.read_csv("train.csv")

In [2]:
# Preview the first ten rows of the raw frame.
train_frame.head(n=10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [3]:
# Count missing values per column of the raw frame.
train_frame.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [4]:
# Show the distinct values of every column except the first one (PassengerId).
for column_name in train_frame.columns[1:]:
    print(f"\t\t-------{column_name}-------")
    display(train_frame[column_name].unique())

		-------HomePlanet-------


array(['Europa', 'Earth', 'Mars', nan], dtype=object)

		-------CryoSleep-------


array([False, True, nan], dtype=object)

		-------Cabin-------


array(['B/0/P', 'F/0/S', 'A/0/S', ..., 'G/1499/S', 'G/1500/S', 'E/608/S'],
      dtype=object)

		-------Destination-------


array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

		-------Age-------


array([39., 24., 58., 33., 16., 44., 26., 28., 35., 14., 34., 45., 32.,
       48., 31., 27.,  0.,  1., 49., 29., 10.,  7., 21., 62., 15., 43.,
       47.,  2., 20., 23., 30., 17., 55.,  4., 19., 56., nan, 25., 38.,
       36., 22., 18., 42., 37., 13.,  8., 40.,  3., 54.,  9.,  6., 64.,
       67., 61., 50., 41., 57., 11., 52., 51., 46., 60., 63., 59.,  5.,
       79., 68., 74., 12., 53., 65., 71., 75., 70., 76., 78., 73., 66.,
       69., 72., 77.])

		-------VIP-------


array([False, True, nan], dtype=object)

		-------RoomService-------


array([   0.,  109.,   43., ..., 1569., 8586.,  745.])

		-------FoodCourt-------


array([   0.,    9., 3576., ..., 3208., 6819., 4688.])

		-------ShoppingMall-------


array([   0.,   25.,  371., ..., 1085.,  510., 1872.])

		-------Spa-------


array([   0.,  549., 6715., ..., 2868., 1107., 1643.])

		-------VRDeck-------


array([   0.,   44.,   49., ..., 1164.,  971., 3235.])

		-------Name-------


array(['Maham Ofracculy', 'Juanna Vines', 'Altark Susent', ...,
       'Fayey Connon', 'Celeon Hontichre', 'Propsh Hontichre'],
      dtype=object)

		-------Transported-------


array([False,  True])

In [2]:
def cab_split(x):
    """Split a cabin value on '/' into its parts.

    The value is converted with ``str`` first, so non-string inputs such as
    NaN are accepted. When the split yields fewer than three parts, a list of
    three 'Missing' placeholders is returned instead; otherwise the full list
    of parts is returned unchanged.
    """
    parts = str(x).split('/')
    return parts if len(parts) >= 3 else ['Missing', 'Missing', 'Missing']

In [3]:
def preprocessing(train_frame, age_fill=None):
    """Impute missing values and derive cabin features on ``train_frame`` in place.

    NOTE: this function's name rebinds the notebook-level name
    ``preprocessing`` that the first cell imports from ``sklearn``. The name
    is kept because later cells call ``preprocessing(...)``.

    Steps applied:
      * HomePlanet, CryoSleep, Destination, VIP: missing values become the
        literal category 'Missing'.
      * Cabin: split with ``cab_split``; the first part is stored as ``Deck``
        and the third part as ``Side``. The ``Cabin`` column is then dropped.
      * Age: missing values are replaced by ``age_fill`` or, when that is
        ``None``, by the mean of this frame's own ``Age`` column.
      * RoomService, FoodCourt, ShoppingMall, Spa, VRDeck: missing values
        become 0.
      * Name: dropped.

    Parameters
    ----------
    train_frame : pandas.DataFrame
        Frame containing the columns listed above. It is modified in place.
    age_fill : float, optional
        Value used for missing ages. Pass the training-set mean here when
        cleaning a different frame so both share the same imputation value.
        Defaults to ``None`` (use the mean of ``train_frame['Age']``).

    Returns
    -------
    None
        The frame is mutated; nothing is returned.
    """
    # Results are assigned back to the frame instead of calling
    # ``train_frame[col].fillna(..., inplace=True)``: that form fills a
    # temporary Series and, under pandas Copy-on-Write, never updates the frame.

    # Categorical / boolean columns: keep missingness as its own category.
    for column in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP'):
        train_frame[column] = train_frame[column].fillna('Missing')

    # Cabin is split into its parts; only the first (Deck) and third (Side) are kept.
    cabin_parts = train_frame['Cabin'].apply(cab_split)
    train_frame['Deck'] = cabin_parts.apply(lambda parts: parts[0])
    train_frame['Side'] = cabin_parts.apply(lambda parts: parts[2])

    # Age: fill with the supplied value, or with this frame's mean age.
    age_value = train_frame['Age'].mean() if age_fill is None else age_fill
    train_frame['Age'] = train_frame['Age'].fillna(age_value)

    # Amenity spend columns: a missing amount is treated as no spend.
    for column in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
        train_frame[column] = train_frame[column].fillna(0)

    # Cabin has been replaced by Deck/Side. Name is dropped because, per the
    # original author's note, it is high-cardinality and has no significant
    # correlation.
    train_frame.drop(columns=['Cabin', 'Name'], inplace=True)

In [4]:
# Work on a deep copy so the in-place cleaning in the next cell leaves train_frame untouched.
dupl = train_frame.copy(deep=True)

In [5]:
# Clean the working copy in place, then preview the first five rows.
preprocessing(dupl)
dupl.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S


In [6]:
# Check dtypes and non-null counts of the cleaned frame.
dupl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Destination   8693 non-null   object 
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   object 
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   bool   
 12  Deck          8693 non-null   object 
 13  Side          8693 non-null   object 
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [7]:
# Confirm that no missing values remain after cleaning.
dupl.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Deck            0
Side            0
dtype: int64

# Modelling

In [10]:
# Target column, then the one-hot encoded feature matrix built from every
# column other than the target and PassengerId.
y = dupl['Transported']
x = pd.get_dummies(dupl.drop(columns=['Transported', 'PassengerId']))

Standardized Independent Variables

In [8]:
# Scaler used in the next cell to standardize the feature matrix.
scaler = StandardScaler()

In [13]:
# Standardize every feature column; `x` is rebound to the transformed output.
# NOTE(review): the scaler is fit on all rows before the train/test split done
# in a later cell, so held-out rows influence the scaling statistics.
x = scaler.fit_transform(x)

KMeans Clustering

In [16]:
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 historically, 'auto' in newer versions), which would otherwise
# make the cluster labels depend on the installed version.
km = KMeans(n_clusters=12, n_init=10, random_state=42)

In [17]:
# Fit KMeans on the scaled features and keep the cluster label of each row.
# NOTE(review): clustering is also fit on all rows before the train/test split.
km_cluster = km.fit_predict(x)

In [18]:
# Append the cluster label as one extra column on the right of the feature matrix.
# NOTE(review): the label enters as a single numeric column, not one-hot encoded.
x_cluster = np.hstack((x, km_cluster.reshape(-1, 1)))

In [19]:
# Stratified hold-out split: 20% of rows go to the test set, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_cluster,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Logistic Regression

In [20]:
# Logistic regression with scikit-learn's default hyperparameters.
lr = LogisticRegression()

In [21]:
# Fit the logistic regression on the training split.
lr.fit(x_train, y_train)

In [22]:
# Score the logistic regression on the held-out split.
lr_pred = lr.predict(x_test)
lr_acc = accuracy_score(y_true=y_test, y_pred=lr_pred)
print(f'Logistic Regression Accuracy Score : {lr_acc}')

Logistic Regression Accuracy Score : 0.7901092581943646


In [23]:
# Per-class precision, recall and F1 for the logistic regression.
print(classification_report(y_true=y_test, y_pred=lr_pred))

              precision    recall  f1-score   support

       False       0.79      0.78      0.79       863
        True       0.79      0.80      0.79       876

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



Decision Tree Classifier

In [24]:
# Seed the tree so repeated runs build the same tree and report the same score;
# 42 matches the random_state already used for KMeans and train_test_split.
dt = DecisionTreeClassifier(random_state=42)

In [25]:
# Fit the decision tree on the training split.
dt.fit(x_train, y_train)

In [26]:
# Score the decision tree on the held-out split.
dt_pred = dt.predict(x_test)
dt_acc = accuracy_score(y_true=y_test, y_pred=dt_pred)
print(f'Decision Tree Accuracy Score : {dt_acc}')

Decision Tree Accuracy Score : 0.7343300747556066


In [27]:
# Per-class precision, recall and F1 for the decision tree.
print(classification_report(y_true=y_test, y_pred=dt_pred))

              precision    recall  f1-score   support

       False       0.73      0.74      0.73       863
        True       0.74      0.73      0.73       876

    accuracy                           0.73      1739
   macro avg       0.73      0.73      0.73      1739
weighted avg       0.73      0.73      0.73      1739

