# Import and Split

In [2]:
from pathlib import Path

import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import scipy.stats as stats
# import numpy as np
# from IPython.display import IFrame

from sklearn.model_selection  import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
# from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# Training data: the first CSV column is used as the row index, so it never
# shows up as a feature column.
df = pd.read_csv("./data/train.csv", index_col=0)

# Kaggle test data: read the same way, then move the index back into an
# ordinary column (the export cell looks up kaggle_test["PassengerId"]).
kaggle_test = pd.read_csv("./data/test.csv", index_col=0).reset_index()

In [3]:
# Features are every column except the target and the "Name", "Ticket" and
# "Cabin" columns, which are discarded here; the target is "Survived".
X = df.drop(columns=["Survived", "Name", "Ticket", "Cabin"])
y = df["Survived"]

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=314,
)

# Feature Engineering

In [4]:
# Per-column preprocessing. Every pipeline fills missing values first and then
# encodes/scales the column(s) it is applied to.

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are encoded as all zeros.
# NOTE: the encoder is no longer given `sparse=False`. That parameter was
# renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so passing
# it raises a TypeError on current releases. Dense output is requested once on
# the ColumnTransformer below instead, which works on old and new versions.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Age: fill gaps with the median, then rescale with MinMaxScaler.
age_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    MinMaxScaler(),
)

# Fare: fill gaps with the median, then discretize into 4 bins whose edges are
# found by 1-D k-means clustering.
bin_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    KBinsDiscretizer(n_bins=4, strategy="kmeans"),
)

# Route each column to its pipeline. "SibSp" and "Parch" are passed through
# unchanged; columns not listed here are dropped (ColumnTransformer's default
# remainder behaviour).
feature_transform = ColumnTransformer(
    transformers=[
        ("age", age_pipe, ["Age"]),
        ("cat", cat_pipe, ["Pclass", "Embarked", "Sex"]),
        ("fare", bin_pipe, ["Fare"]),
        ("do_nothing", "passthrough", ["SibSp", "Parch"]),
    ],
    # 0 means "always return a dense array", regardless of what the individual
    # transformers emit.
    sparse_threshold=0,
)

# Fit and Test Models

In [5]:
# Logistic regression model
#
# Preprocessing and classifier are bundled into one pipeline so the same
# transformations are applied at fit time and at scoring time.

pipeline = make_pipeline(
    feature_transform,
    LogisticRegression(max_iter=400),
).fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out split (one per line).
print(
    pipeline.score(X_train, y_train),
    pipeline.score(X_test, y_test),
    sep="\n",
)

0.7991573033707865
0.8212290502793296




In [6]:
# Random forest model
#
# The forest is seeded (same seed as the train/test split) so that the scores
# printed below and the predictions exported later are reproducible after a
# kernel restart. Without a random_state every run trains a different forest.

forest = make_pipeline(
    feature_transform,
    RandomForestClassifier(max_depth=5, n_estimators=100, random_state=314),
)

forest.fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out split.
print(forest.score(X_train, y_train))
print(forest.score(X_test, y_test))



0.8370786516853933
0.8268156424581006


In [7]:
# Voting Classifier to make an ensemble model
#
# Combines a logistic regression and a random forest behind the shared
# preprocessing step. VotingClassifier is left in its default (hard,
# majority-vote) mode. NOTE(review): with only two voters, disagreements are
# ties, which scikit-learn resolves by ascending class label -- consider
# voting="soft" or a third estimator.

models = [
    # max_iter matches the standalone logistic regression model above so both
    # get the same iteration budget.
    ("logreg", LogisticRegression(max_iter=400)),
    # Seeded so the ensemble's scores and exported predictions are reproducible.
    ("forest", RandomForestClassifier(max_depth=5, n_estimators=120, random_state=314)),
]
m = make_pipeline(feature_transform, VotingClassifier(models))

m.fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out split.
print(m.score(X_train, y_train))
print(m.score(X_test, y_test))



0.8202247191011236
0.8212290502793296


# Export for Upload

In [8]:
# Predict on the Kaggle test set with the random forest and the voting
# ensemble. The raw prediction arrays get their own names so that
# kaggle_forest / kaggle_ensemble only ever refer to the submission frames.
forest_predictions = forest.predict(kaggle_test)
ensemble_predictions = m.predict(kaggle_test)

# Submission frames: one row per test passenger, id plus predicted label.
kaggle_forest = pd.DataFrame({
    "PassengerId": kaggle_test["PassengerId"],
    "Survived": forest_predictions,
})
kaggle_ensemble = pd.DataFrame({
    "PassengerId": kaggle_test["PassengerId"],
    "Survived": ensemble_predictions,
})

# Create the output directory if needed so the export also works on a fresh
# checkout; to_csv does not create missing parent directories.
output_dir = Path("./output")
output_dir.mkdir(parents=True, exist_ok=True)

kaggle_forest.to_csv(output_dir / "kaggle_forest.csv", index=False)
kaggle_ensemble.to_csv(output_dir / "kaggle_ensemble.csv", index=False)