In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from features.transformers import ItemSelector
from sklearn.linear_model import LogisticRegression

In [3]:
# Directory holding the input CSV files, given relative to the working directory.
# The trailing slash is required: file names are appended by plain string
# concatenation when the data is loaded.
DATA_PATH = "../data/"

In [4]:
# Load the training set from the data directory into a DataFrame.
train_csv = f"{DATA_PATH}train.csv"
df = pd.read_csv(train_csv)

In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(891, 12)

In [6]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Number of missing values in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [20]:
# Columns of interest: the two model features plus the target.
# NOTE(review): `cols` is not referenced by any other cell visible here; the
# split cell spells the column names out again. Confirm whether it is still needed.
cols = ["Pclass", "Sex", "Survived"]

In [22]:
# Separate the feature columns from the target, then hold out a test set.
# random_state is fixed so the split is reproducible across runs.
feature_cols = ["Pclass", "Sex"]
target_col = "Survived"

X, y = df[feature_cols], df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [24]:
# Keyword arguments forwarded to LogisticRegression when the model is built.
params = dict(
    C=1e-05,  # inverse regularization strength; such a small value regularizes very strongly
    class_weight="balanced",  # reweight classes inversely to their frequency
)

In [30]:
# Feature branch 1: select the "Sex" column and one-hot encode it.
# handle_unknown="ignore" makes categories unseen during fit encode as all
# zeros instead of raising at predict time.
# NOTE(review): the step is named "pid_one_hot" although it encodes Sex; the
# name is kept because it is part of the pipeline's parameter keys.
sex_pipeline = Pipeline(steps=[
    ("sex_selector", ItemSelector(key="Sex")),
    ("pid_one_hot", OneHotEncoder(categories="auto", handle_unknown="ignore")),
])

# Feature branch 2: "Pclass" as produced by ItemSelector, with no further encoding.
# NOTE(review): ItemSelector is project-local (features.transformers); presumably
# it returns the column selected by `key` in a 2-D shape FeatureUnion can stack — confirm.
pclass_selector = ItemSelector(key="Pclass")

transformer_list = [
    ("sex", sex_pipeline),
    ("pclass", pclass_selector),
]

# Concatenate both branches and feed them to the classifier configured by `params`.
feature_union = FeatureUnion(transformer_list=transformer_list)
classifier = LogisticRegression(**params)
model = Pipeline([
    ("features", feature_union),
    ("clf", classifier),
])

# NOTE(review): the saved output of this cell shows `LogisticRegression()` with no
# C / class_weight — confirm it was produced with the current `params`
# (re-run from a fresh kernel).
model.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('sex',
                                                 Pipeline(steps=[('sex_selector',
                                                                  ItemSelector(key='Sex')),
                                                                 ('pid_one_hot',
                                                                  OneHotEncoder(handle_unknown='ignore'))])),
                                                ('pclass',
                                                 ItemSelector(key='Pclass'))])),
                ('clf', LogisticRegression())])

In [31]:
# Predicted labels for the held-out test features, used by the metrics below.
y_pred = model.predict(X_test)

In [32]:
# Report hold-out metrics with exactly two decimal places.
# The explicit `f` presentation type matters: a bare `.2` on a float means two
# significant digits in general format, so 0.5 would print as "0.5" and 0.078
# as "0.078" instead of a fixed two-decimal value.
metric_fns = {
    "Accuracy": accuracy_score,
    "Balanced Accuracy": balanced_accuracy_score,
    "F1": f1_score,
}
for label, metric_fn in metric_fns.items():
    print(f"{label}: {metric_fn(y_test, y_pred):.2f}")

Accuracy: 0.78
Balanced Accuracy: 0.77
F1: 0.72
