In [1]:
import pandas as pd
df = pd.read_csv('data/train.csv', index_col=0)

In [2]:
from sklearn.model_selection import train_test_split
X = df.drop('Survived', axis=1)
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [3]:
def make_dummy(x):
    if x > 0:
        return 1
    return 0

def family_on_board(data):
    data['fam_dum'] = data[data.columns[0]] + data[data.columns[1]]
    data['fam_dum'] = data['fam_dum'].apply(make_dummy)
    return data[['fam_dum']]

def makesq(data):
    return data**2

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import FunctionTransformer

impute_then_scale = make_pipeline(
    SimpleImputer(strategy='median'),
    MinMaxScaler()
)
impute_then_scale_then_sq = make_pipeline(
    SimpleImputer(strategy='median'),
    FunctionTransformer(makesq),
    MinMaxScaler(),
)
impute_then_onehot = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse=False, handle_unknown='ignore')
)

fe = ColumnTransformer([
    ('impute then scale', impute_then_scale,  ['Age', 'Fare']),
    ('impute then scale and sq', impute_then_scale_then_sq,  ['Age']),
    ('impute then onehot', impute_then_onehot, ['Embarked']),
    ('onehot', OneHotEncoder(sparse=False), ['Sex']),
    ('scale', MinMaxScaler(), ['Pclass']),
    ('family', FunctionTransformer(family_on_board), ['Parch', 'SibSp'])

])

In [5]:
fe.fit(X_train)

ColumnTransformer(transformers=[('impute then scale',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 ['Age', 'Fare']),
                                ('impute then scale and sq',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('functiontransformer',
                                                  FunctionTransformer(func=<function makesq at 0x7f69d9181040>)),
                                                 ('minmax...
                                ('impute then onehot',
                                 Pipeline(steps=[('simpleimputer',
                         

In [6]:
Xtrans = fe.transform(X_train)
Xtrans.round(2)

array([[0.57, 0.06, 0.32, ..., 1.  , 0.  , 0.  ],
       [0.28, 0.03, 0.08, ..., 1.  , 0.5 , 0.  ],
       [0.4 , 0.02, 0.16, ..., 1.  , 1.  , 0.  ],
       ...,
       [0.51, 0.03, 0.26, ..., 1.  , 1.  , 1.  ],
       [0.17, 0.23, 0.03, ..., 0.  , 0.  , 1.  ],
       [0.26, 0.15, 0.07, ..., 1.  , 0.  , 1.  ]])

In [7]:
Xtrans_test = fe.transform(X_test)

In [8]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [9]:
m = DecisionTreeClassifier(max_depth=100)
m.fit(Xtrans, y_train)

DecisionTreeClassifier(max_depth=100)

In [10]:
ypred = m.predict(Xtrans)

In [11]:
round(m.score(Xtrans, y_train), 3)

0.979

In [12]:
round(m.score(Xtrans_test, y_test), 3)

0.777

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
n = RandomForestClassifier(max_depth=12, n_estimators=100)
n.fit(Xtrans, y_train)

RandomForestClassifier(max_depth=12)

In [15]:
round(n.score(Xtrans, y_train), 3)

0.969

In [16]:
round(n.score(Xtrans_test, y_test), 3)

0.832

In [17]:
fe.fit(X_train)
X_tr = fe.transform(X)
n.fit(X_tr, y)

RandomForestClassifier(max_depth=12)

In [18]:
round(n.score(X_tr, y), 3)

0.969

In [19]:
pred = n.predict(X_tr)

In [20]:
test = pd.read_csv('test.csv', index_col=0)

In [21]:
fe.fit(test)
test_trans = fe.transform(test)

In [22]:
test['Survived'] = n.predict(test_trans)

In [23]:
test

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1
...,...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


In [24]:
predict= test[['Survived']]

In [25]:
predict

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [26]:
predict.to_csv('predict.csv')