# import library

In [48]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

# load data

In [49]:
train_data = pd.read_csv("../preprocessed_data/preprocessed_train.csv", encoding = 'euc-kr')
test_data = pd.read_csv("../preprocessed_data/preprocessed_test.csv", encoding = 'euc-kr')

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,2,1,0,0.027567,2.0
1,1,1,1,3,1,0,0.271039,0.0
2,1,3,1,2,0,0,0.030133,2.0
3,1,1,1,3,1,0,0.201901,2.0
4,0,3,0,3,0,0,0.030608,2.0


In [50]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,0,3,0,0,0.015282,1
1,893,3,1,4,1,0,0.013663,2
2,894,2,0,6,0,0,0.018909,1
3,895,3,0,2,0,0,0.016908,2
4,896,3,1,2,1,1,0.023984,2


# preprocessing data

In [51]:
X = train_data.drop(columns = ['Survived'])
y = train_data["Survived"]

In [52]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2024)

In [53]:
test_data_id = test_data['PassengerId']
x_test = test_data.drop(columns = ['PassengerId'])
x_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,3,0,0,0.015282,1
1,3,1,4,1,0,0.013663,2
2,2,0,6,0,0,0.018909,1
3,3,0,2,0,0,0.016908,2
4,3,1,2,1,1,0.023984,2


# modeling

In [54]:
# 나이브 베이즈 모델
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
naive_bayes_pred = naive_bayes.predict(X_val)
naive_bayes_accuracy = accuracy_score(y_val, naive_bayes_pred)
naive_bayes_MAE = mean_absolute_error(y_val, naive_bayes_pred)
naive_bayes_MSE = mean_squared_error(y_val, naive_bayes_pred)


# 의사결정트리 모델
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
decision_tree_pred = decision_tree.predict(X_val)
decision_tree_accuracy = accuracy_score(y_val, decision_tree_pred)
decision_tree_MAE = mean_absolute_error(y_val, decision_tree_pred)
decision_tree_MSE = mean_squared_error(y_val, decision_tree_pred)

# 랜덤포레스트 모델
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
random_forest_pred = random_forest.predict(X_val)
random_forest_accuracy = accuracy_score(y_val, random_forest_pred)
random_forest_MAE = mean_absolute_error(y_val, random_forest_pred)
random_forest_MSE = mean_squared_error(y_val, random_forest_pred)

# 서포트 벡터 머신 모델
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_val)
svm_accuracy = accuracy_score(y_val, svm_pred)
svm_MAE = mean_absolute_error(y_val, svm_pred)
svm_MSE = mean_squared_error(y_val, svm_pred)

# 로지스틱 회귀 모델
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
logistic_regression_pred = logistic_regression.predict(X_val)
logistic_regression_accuracy = accuracy_score(y_val, logistic_regression_pred)
logistic_regression_MAE = mean_absolute_error(y_val, logistic_regression_pred)
logistic_regression_MSE = mean_squared_error(y_val, logistic_regression_pred)

# K-최근접 이웃 모델
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)
knn_accuracy = accuracy_score(y_val, knn_pred)
knn_MAE = mean_absolute_error(y_val, knn_pred)
knn_MSE = mean_squared_error(y_val, knn_pred)

In [55]:
print(f'''
Naive Bayes
      accuracy : {naive_bayes_accuracy}
      MAE: {naive_bayes_MAE}
      MSE: {naive_bayes_MSE}

Decision Tree
      accuracy : {decision_tree_accuracy}
      MAE: {decision_tree_MAE}
      MSE: {decision_tree_MSE}

Random Forest
      accuracy : {random_forest_accuracy}
      MAE: {random_forest_MAE}
      MSE: {random_forest_MSE}

Support Vector Machine
      accuracy : {svm_accuracy}
      MAE: {svm_MAE}
      MSE: {svm_MSE}

Logistic Regression
      accuracy : {logistic_regression_accuracy}
      MAE: {logistic_regression_MAE}
      MSE: {logistic_regression_MSE}

KNN
      accuracy : {knn_accuracy}
      MAE: {knn_MAE}
      MSE: {knn_MSE}
''')


Naive Bayes
      accuracy : 0.7865168539325843
      MAE: 0.21348314606741572
      MSE: 0.21348314606741572

Decision Tree
      accuracy : 0.7808988764044944
      MAE: 0.21910112359550563
      MSE: 0.21910112359550563

Random Forest
      accuracy : 0.8314606741573034
      MAE: 0.16853932584269662
      MSE: 0.16853932584269662

Support Vector Machine
      accuracy : 0.8370786516853933
      MAE: 0.16292134831460675
      MSE: 0.16292134831460675

Logistic Regression
      accuracy : 0.8146067415730337
      MAE: 0.1853932584269663
      MSE: 0.1853932584269663

KNN
      accuracy : 0.7921348314606742
      MAE: 0.20786516853932585
      MSE: 0.20786516853932585



In [56]:
naive_bayes_test_predict = naive_bayes.predict(x_test)
decision_tree_test_predict = decision_tree.predict(x_test)
random_forest_test_predict = random_forest.predict(x_test)
svm_test_predict = svm.predict(x_test)
logistic_regression_test_predict = logistic_regression.predict(x_test)
knn_test_predict = knn.predict(x_test)

In [57]:
naive_bayes_result_df = pd.DataFrame({'PassengerId': test_data_id, 'Survived': naive_bayes_test_predict})
decision_tree_result_df = pd.DataFrame({'PassengerId': test_data_id, 'Survived': decision_tree_test_predict})
random_forest_result_df = pd.DataFrame({'PassengerId': test_data_id, 'Survived': random_forest_test_predict})
svm_result_df = pd.DataFrame({'PassengerId': test_data_id, 'Survived': svm_test_predict})
logistic_regression_result_df = pd.DataFrame({'PassengerId': test_data_id, 'Survived': logistic_regression_test_predict})
knn_result_df = pd.DataFrame({'PassengerId': test_data_id, 'Survived': knn_test_predict})


In [58]:
naive_bayes_result_df.to_csv('../result/naive_bayes_result.csv', index = False)
decision_tree_result_df.to_csv('../result/decision_tree_result.csv', index = False)
random_forest_result_df.to_csv('../result/random_forest_result.csv', index = False)
svm_result_df.to_csv('../result/svm_result.csv', index = False)
logistic_regression_result_df.to_csv('../result/logistic_regression_result.csv', index = False)
knn_result_df.to_csv('../result/knn_result.csv', index = False)