# STARTING PROJECT

### IMPORT NECCESSARY MODULE

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

### IMPORT FILE

In [None]:
original_df = pd.read_csv('train.csv')
original_test_df = pd.read_csv('test.csv')

In [None]:
original_df.head()

### OVERVIEW AND CORRELATION

##### 1. DESCRIBE BASIC STATISTICAL INFO

In [None]:
original_df.describe(percentiles=[i/10 for i in range(1,10)])

##### 2. HEATMAP

In [None]:
corr = original_df.corr()
heatmap = sb.heatmap(corr, cmap="summer_r", annot=True)

##### 3. BOXPLOT

In [None]:
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
sb.boxplot(x=original_df["Age"], hue='Transported', data=original_df,ax = ax1[0])
sb.boxplot(x=original_df["RoomService"], hue='Transported', data=original_df, ax=ax1[1])
sb.boxplot(x=original_df["FoodCourt"], hue='Transported', data=original_df, ax=ax1[2])
sb.boxplot(x=original_df["ShoppingMall"], hue='Transported', data=original_df, ax=ax2[0])
sb.boxplot(x=original_df["Spa"], hue='Transported', data=original_df, ax=ax2[1])
sb.boxplot(x=original_df["VRDeck"], hue='Transported', data=original_df, ax=ax2[2])
plt.show()

##### 4. KDE - Kernel Density Estimate

In [None]:
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
sb.kdeplot(x=original_df["Age"], hue='Transported', data=original_df,ax = ax1[0])
sb.kdeplot(x=original_df["RoomService"], hue='Transported', data=original_df, ax=ax1[1])
sb.kdeplot(x=original_df["FoodCourt"], hue='Transported', data=original_df, ax=ax1[2])
sb.kdeplot(x=original_df["ShoppingMall"], hue='Transported', data=original_df, ax=ax2[0])
sb.kdeplot(x=original_df["Spa"], hue='Transported', data=original_df, ax=ax2[1])
sb.kdeplot(x=original_df["VRDeck"], hue='Transported', data=original_df, ax=ax2[2])
plt.show()

# DATA CLEANSING

### DROP UNNECCESSARY COLUMNS AND SPLIT CABIN

In [2]:
original_df = pd.read_csv('train.csv')
original_df[['Cabin1','Cabin2', 'Cabin3']] = original_df['Cabin'].str.split('/', expand=True)
df = original_df.drop(columns=['Name','PassengerId','VIP','Cabin','Cabin2'])

### TEST ###
original_test_df = pd.read_csv('test.csv')
original_test_df[['Cabin1','Cabin2', 'Cabin3']] = original_test_df['Cabin'].str.split('/', expand=True)
test_df = original_test_df.drop(columns=['Name','PassengerId','VIP','Cabin','Cabin2'])

### HANDLE OUTLIER

In [3]:
def handleoutlier(colname):
    q1 = df[colname].quantile(0.05)
    q3 = df[colname].quantile(0.95)
    iqr = q3-q1
    lowerlimit = q1 - (1.5*iqr)
    upperlimit = q3 + (1.5*iqr)
    df.loc[df[colname] < lowerlimit, [colname]] = lowerlimit
    df.loc[df[colname] > upperlimit, [colname]] = upperlimit
for j in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    handleoutlier(j)
    
### TEST ###
def handleoutlier_test_df(colname):
    q1 = test_df[colname].quantile(0.05)
    q3 = test_df[colname].quantile(0.95)
    iqr = q3-q1
    lowerlimit = q1 - (1.5*iqr)
    upperlimit = q3 + (1.5*iqr)
    test_df.loc[test_df[colname] < lowerlimit, [colname]] = lowerlimit
    test_df.loc[test_df[colname] > upperlimit, [colname]] = upperlimit
for j in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    handleoutlier_test_df(j)

### FILL NULL

##### 1. CHECK NULL

In [4]:
df.isnull().sum()

HomePlanet      201
CryoSleep       217
Destination     182
Age             179
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
Cabin1          199
Cabin3          199
dtype: int64

##### 2. FILL NULL

In [5]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

df["Cabin1"] = df["Cabin1"].fillna("F")
df["Cabin3"] = df["Cabin3"].fillna("S")

x = imputer.fit_transform(df[['Age', 'RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']])
df[['Age', 'RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = x

df["HomePlanet"] = df["HomePlanet"].fillna("Earth")
df["Destination"] = df["Destination"].fillna("TRAPPIST-1e")
df['CryoSleep'] = df['CryoSleep'].fillna(False)


### TEST ###
imputer = KNNImputer(n_neighbors=5)

test_df["Cabin1"] = test_df["Cabin1"].fillna("F")
test_df["Cabin3"] = test_df["Cabin3"].fillna("S")

x = imputer.fit_transform(test_df[['Age', 'RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']])
test_df[['Age', 'RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = x

test_df["HomePlanet"] = test_df["HomePlanet"].fillna("Earth")
test_df["Destination"] = test_df["Destination"].fillna("TRAPPIST-1e")
test_df['CryoSleep'] = test_df['CryoSleep'].fillna(False)

### CREATE NEW COLUMNS (SumSpend from all amount of spending)

In [6]:
df["SumSpend"] = df[['RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
df['AgeCat'] = pd.cut(df.Age,bins=[-1, 5, 12, 18, 50, 150], labels=[0,1,2,3,4])
df['AgeCat'] = df['AgeCat'].astype(int)
df = df.drop(columns=['Age'])

### TEST ###
test_df["SumSpend"] = test_df[['RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
test_df['AgeCat'] = pd.cut(test_df.Age,bins=[-1, 5, 12, 18, 50, 150], labels=[0,1,2,3,4])
test_df['AgeCat'] = test_df['AgeCat'].astype(int)
test_df = test_df.drop(columns=['Age'])

### ENCODE STR AND BOOL TO NUMBER

In [7]:
from sklearn.preprocessing import LabelEncoder

for col in df.select_dtypes(include = ['object','bool']):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

uhp, indiceuhp = np.unique(np.array(df["HomePlanet"]),return_inverse=True)
ud, indiceud = np.unique(np.array(df["Destination"]),return_inverse=True)
    
one_hot_homeplanet = np.zeros((indiceuhp.size, indiceuhp.max()+1))
one_hot_homeplanet[np.arange(indiceuhp.size),indiceuhp] = 1
one_hot_homeplanet = one_hot_homeplanet.astype(int)

one_hot_destination = np.zeros((indiceud.size, indiceud.max()+1))
one_hot_destination[np.arange(indiceud.size),indiceud] = 1
one_hot_destination = one_hot_destination.astype(int)

df[["Earth","Europa","Mars"]] = one_hot_homeplanet
df[['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e']] = one_hot_destination
df = df.drop(columns=["HomePlanet","Destination"])

######## TEST ########
for col in test_df.select_dtypes(include = ['object','bool']):
    le = LabelEncoder()
    test_df[col] = le.fit_transform(test_df[col])
    
uhp, indiceuhp = np.unique(np.array(test_df["HomePlanet"]),return_inverse=True)
ud, indiceud = np.unique(np.array(test_df["Destination"]),return_inverse=True)
    
one_hot_homeplanet = np.zeros((indiceuhp.size, indiceuhp.max()+1))
one_hot_homeplanet[np.arange(indiceuhp.size),indiceuhp] = 1
one_hot_homeplanet = one_hot_homeplanet.astype(int)

one_hot_destination = np.zeros((indiceud.size, indiceud.max()+1))
one_hot_destination[np.arange(indiceud.size),indiceud] = 1
one_hot_destination = one_hot_destination.astype(int)

test_df[["Earth","Europa","Mars"]] = one_hot_homeplanet
test_df[['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e']] = one_hot_destination
test_df = test_df.drop(columns=["HomePlanet","Destination"])

### STANDARD SCALER(not use)

In [None]:
from sklearn.preprocessing import StandardScaler


for i in ['RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','SumSpend']:
    scaler = StandardScaler()
    x = np.array(df[i]).reshape(-1,1)
    scaler.fit(x)
    df[i] = scaler.transform(x).flatten()
    

### TEST ###

for i in ['RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','SumSpend']:
    scaler = StandardScaler()
    x = np.array(test_df[i]).reshape(-1,1)
    scaler.fit(x)
    test_df[i] = scaler.transform(x).flatten()

# MODELING AND EVALUATION

### IMPORT TRAINING MODULE

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split,cross_validate,cross_val_score
from sklearn.preprocessing import StandardScaler

### MAKE TRAIN AND TARGET

In [9]:
train_df = df.drop(columns=["Transported"])
target_df = df["Transported"]

### SELECT MODEL

In [10]:
RandomForestClassifier = RandomForestClassifier()
print("---------- RandomForest ----------")
scores = cross_val_score(RandomForestClassifier, train_df, target_df, cv=5)
print("Mean Score of Cross Validation :", "%.3f"%(np.mean(scores)*100), "%")
print("Max Score of Cross Validation :", "%.3f"%(np.max(scores)*100), "%")


logistic = LogisticRegression()
dfcopy = train_df.copy()
for i in ['RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','SumSpend']:
    scaler = StandardScaler()
    x = np.array(dfcopy[i]).reshape(-1,1)
    scaler.fit(x)
    dfcopy[i] = scaler.transform(x).flatten()

print("---------- Logistic ----------")
scores = cross_val_score(logistic, dfcopy, target_df, cv=5)
print("Mean Score of Cross Validation :", "%.3f"%(np.mean(scores)*100), "%")
print("Max Score of Cross Validation :", "%.3f"%(np.max(scores)*100), "%")


CatBoostClassifier = CatBoostClassifier(verbose = False)
print("---------- CatBoost ----------")
scores = cross_val_score(CatBoostClassifier, train_df, target_df, cv=5)
print("Mean Score of Cross Validation :", "%.3f"%(np.mean(scores)*100), "%")
print("Max Score of Cross Validation :", "%.3f"%(np.max(scores)*100), "%")

---------- RandomForest ----------
Mean Score of Cross Validation : 79.386 %
Max Score of Cross Validation : 80.725 %
---------- Logistic ----------
Mean Score of Cross Validation : 79.179 %
Max Score of Cross Validation : 80.265 %
---------- CatBoost ----------
Mean Score of Cross Validation : 80.605 %
Max Score of Cross Validation : 82.451 %


### FEATURE SELECTION

In [24]:
from sklearn.feature_selection import SequentialFeatureSelector
from catboost import CatBoostClassifier
import time

In [12]:
catboost = CatBoostClassifier(learning_rate = 0.05,iterations = 500,verbose=False)
selector = SequentialFeatureSelector(catboost,direction='backward')

In [13]:
start = time.time()
selector.fit(train_df,target_df)
end = time.time() - start
minute = int(end // 60)
sec = end % 60
print(f"Runtime : {minute}m {sec}s")

Runtime : 9m 57.63s


In [21]:
selector.get_support()

array([ True, False,  True,  True, False, False,  True,  True,  True,
       False, False,  True, False, False,  True, False])

### GET ONLY NECCESSARY FEATURE

In [25]:
train_df = train_df[['CryoSleep', 'RoomService', 'Spa', 'VRDeck', 'Cabin1', 'Cabin3', 'SumSpend']]
### TEST ###
test_df = test_df[['CryoSleep', 'RoomService', 'Spa', 'VRDeck', 'Cabin1', 'Cabin3', 'SumSpend']]

### MAKE TRAIN TEST SPLIT

In [26]:
X_train,X_test,y_train,y_test = train_test_split(train_df,target_df,test_size=0.3,random_state=5981)

### FIT MODEL

In [27]:
#clf = LogisticRegression()
#clf.fit(X_train, y_train)

from catboost import CatBoostClassifier
clf = CatBoostClassifier(verbose = False)
clf.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x1c82917c320>

### PREDICT AND EVALUATE

In [28]:
y_pred = clf.predict(X_test)
print("Accuracy =",accuracy_score(y_test, y_pred)*100, "%")
print("LogLoss =",log_loss(y_test, y_pred))

Accuracy = 82.78374233128835 %
LogLoss = 5.9463678267708895


# TEST

In [None]:
original_test_df

In [None]:
test_df

### PREDICT

In [29]:
test_pred = clf.predict(test_df)
test_pred = test_pred.astype(bool)

### TRANSFORM TO SUBMISSION PATTERN

In [30]:
final = pd.DataFrame({"PassengerId": original_test_df["PassengerId"],
                      "Transported" : test_pred 
})
final

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


### SUBMISSION

In [31]:
final.to_csv('submission.csv',index = False)