In [69]:
import pandas as pd
from pathlib import Path

# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(Path(csv_name)) for csv_name in ("train.csv", "test.csv")
)


In [70]:
# Peek at the first five rows to see the available columns and sample values.
train_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [71]:
# Use PassengerId as the row index for both splits.
# The guard keeps this cell idempotent: once PassengerId has been moved into
# the index it is no longer a column, so an unguarded second run of
# set_index("PassengerId") would raise a KeyError.
if train_data.index.name != "PassengerId":
    train_data = train_data.set_index("PassengerId")
if test_data.index.name != "PassengerId":
    test_data = test_data.set_index("PassengerId")


In [72]:
# Summarise dtypes and non-null counts to spot attributes with missing values.
train_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


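The **Cabin** attribute is missing for about 77% of the passengers (only 204 of 891 values are present), so we ignore it. For the **Age** attribute, it is reasonable to fill the missing values with the median age. **Embarked** has just two missing values, which we fill with the most frequent value.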

First, let's build pipelines for numerical and categorical attributes.

In [73]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Columns routed to each branch of the preprocessor.
num_attributes = ["Age", "SibSp", "Parch", "Fare"]
cat_attributes = ["Pclass", "Sex", "Embarked"]

# Numerical branch: fill missing values with the column median, then
# standardise each column.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: integer-encode the categories, fill missing values with
# the most frequent code, then one-hot encode into a dense array.
# NOTE(review): presumably the ordinal step is there so the imputer sees a
# purely numeric array -- confirm before reordering or removing it.
cat_pipeline = Pipeline(steps=[
    ("ordinal_encoder", OrdinalEncoder()),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(sparse_output=False)),
])

# Apply each branch to its own column subset; columns not listed above
# (e.g. Name, Ticket, Cabin) are not passed to either branch.
preprocessing_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attributes),
    ("cat", cat_pipeline, cat_attributes),
])



In [74]:
# Target labels come straight from the raw training frame.
y_train = train_data["Survived"]

# Learn the imputation/scaling/encoding statistics on the training split only,
# then reuse the fitted transformer unchanged on the test split.
# NOTE(review): the transformer is fitted on the full training set before any
# cross-validation, so validation folds share its statistics -- confirm this
# is acceptable for the reported scores.
X_train = preprocessing_pipeline.fit_transform(X=train_data)
x_test = preprocessing_pipeline.transform(X=test_data)


In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline: a 100-tree random forest with a fixed seed.
forest_clf = RandomForestClassifier(random_state=17, n_estimators=100)
forest_clf.fit(X=X_train, y=y_train)

# Mean accuracy over 10 cross-validation folds of the training data.
forest_scores = cross_val_score(
    estimator=forest_clf, X=X_train, y=y_train, cv=10,
)
forest_scores.mean()


0.8126716604244694

81% accuracy without tuning the hyperparameters isn't all that bad. Using grid search to tune the hyperparameters turns out to be really slow, so we will try randomized search instead.

In [76]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Search space for the random forest: number of trees and split criterion.
# NOTE(review): scikit-learn documents "entropy" and "log_loss" as the same
# Shannon information gain criterion, so that option is effectively sampled
# twice as often as "gini" -- confirm whether that is intended.
parameters = {"n_estimators": np.arange(50, 500),
              "criterion": ("gini", "entropy", "log_loss"),
              }

# Seed the search itself so the sampled candidates, and therefore the reported
# best_params_, are reproducible on a fresh "Restart & Run All". The value
# matches the seed already used for the estimators in this notebook.
random_search_forest = RandomizedSearchCV(
    forest_clf, parameters, cv=10, random_state=17)
random_search_forest.fit(X_train, y_train)

random_search_forest.best_params_


{'n_estimators': 70, 'criterion': 'entropy'}

In [77]:
# The search was created with the default refit=True, so best_estimator_ has
# already been refitted on the full training set; fitting it again here would
# only repeat that work. Go straight to scoring.
# Mean accuracy of the tuned forest over 10 cross-validation folds.
forest_scores_random = cross_val_score(
    random_search_forest.best_estimator_, X_train, y_train, cv=10)
forest_scores_random.mean()


0.8171535580524345

Randomized search yields only a very slight improvement. Let's try a support vector machine next.

In [78]:
from sklearn.svm import SVC

# Baseline support vector classifier, scored with 10-fold cross-validation.
svm_clf = SVC(random_state=17, gamma="auto")
svm_scores = cross_val_score(
    estimator=svm_clf, X=X_train, y=y_train, cv=10,
)
svm_scores.mean()


0.8249313358302123

In [79]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid for the SVC: 29 integer values of C (1..29) crossed with
# four kernels, i.e. 116 candidates, each evaluated with 10-fold CV.
parameters = {
    "C": np.arange(1, 30),
    "kernel": ("linear", "poly", "rbf", "sigmoid"),
}

grid_search_scv = GridSearchCV(
    estimator=svm_clf, param_grid=parameters, cv=10,
)
grid_search_scv.fit(X=X_train, y=y_train)

grid_search_scv.best_params_


{'C': 11, 'kernel': 'rbf'}

In [80]:
# Mean accuracy of the grid-search winner over 10 cross-validation folds.
svc_scores_grid = cross_val_score(
    estimator=grid_search_scv.best_estimator_,
    X=X_train,
    y=y_train,
    cv=10,
)
svc_scores_grid.mean()


0.832796504369538