In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
data.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
405,406,0,2,"Gale, Mr. Shadrach",male,34.0,1,0,28664,21.0,,S
239,240,0,2,"Hunt, Mr. George Henry",male,33.0,0,0,SCO/W 1585,12.275,,S
727,728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q
218,219,1,1,"Bazzani, Miss. Albina",female,32.0,0,0,11813,76.2917,D15,C
636,637,0,3,"Leinonen, Mr. Antti Gustaf",male,32.0,0,0,STON/O 2. 3101292,7.925,,S
313,314,0,3,"Hendekovic, Mr. Ignjac",male,28.0,0,0,349243,7.8958,,S
34,35,0,1,"Meyer, Mr. Edgar Joseph",male,28.0,1,0,PC 17604,82.1708,,C
282,283,0,3,"de Pelsmaeker, Mr. Alfons",male,16.0,0,0,345778,9.5,,S
778,779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
244,245,0,3,"Attalah, Mr. Sleiman",male,30.0,0,0,2694,7.225,,C


In [4]:
test.sample()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
95,987,3,"Tenglin, Mr. Gunnar Isidor",male,25.0,0,0,350033,7.7958,,S


In [5]:
data['Family'] = data.SibSp + data.Parch + 1
test['Family'] = test.SibSp + test.Parch + 1

In [6]:
data = data.drop(["Ticket","Parch", "SibSp", "PassengerId"], axis=1)
test = test.drop(["Ticket","Parch", "SibSp"], axis=1)

In [7]:
data.Name = data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
data.Name = data.Name.replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 0)
data.Name = data.Name.replace(['Mlle','Miss','Ms'], 1)
data.Name = data.Name.replace(['Mme', 'Mrs'],2)
data.Name = data.Name.replace(['Mr', 'Master'],3)

test.Name = test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
test.Name = test.Name.replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 0)
test.Name = test.Name.replace(['Mlle','Miss','Ms'], 1)
test.Name = test.Name.replace(['Mme', 'Mrs'],2)
test.Name = test.Name.replace(['Mr', 'Master'],3)

In [8]:
data.Sex.replace(["male", "female"], [0,1], inplace = True)
test.Sex.replace(["male", "female"], [0,1], inplace = True)

In [9]:
data.Embarked = data.Embarked.fillna(data.Embarked.mode()[0])
data.Fare = data.Fare.astype(float)
data.Fare = data.Fare.fillna(value = np.mean(data.Fare))
data.Cabin = data.Cabin.str[0]
data.Cabin.fillna('X', inplace = True)

test.Embarked = test.Embarked.fillna(test.Embarked.mode()[0])
test.Fare = test.Fare.astype(float)
test.Fare = test.Fare.fillna(value = np.mean(test.Fare))
test.Cabin = test.Cabin.str[0]
test.Cabin.fillna('X', inplace = True)

In [10]:
data.Age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
5       NaN
6      54.0
7       2.0
8      27.0
9      14.0
10      4.0
11     58.0
12     20.0
13     39.0
14     14.0
15     55.0
16      2.0
17      NaN
18     31.0
19      NaN
20     35.0
21     34.0
22     15.0
23     28.0
24      8.0
25     38.0
26      NaN
27     19.0
28      NaN
29      NaN
       ... 
861    21.0
862    48.0
863     NaN
864    24.0
865    42.0
866    27.0
867    31.0
868     NaN
869     4.0
870    26.0
871    47.0
872    33.0
873    47.0
874    28.0
875    15.0
876    20.0
877    19.0
878     NaN
879    56.0
880    25.0
881    33.0
882    22.0
883    28.0
884    25.0
885    39.0
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [11]:
for i in range(0,890):
    if data.loc[i, "Age"] <= 15:
        data.loc[i, "Age"] = 0
    elif data.loc[i, "Age"] >= 60:
        data.loc[i, "Age"] = 2
    else:
        data.loc[i, "Age"] =1

In [12]:
for i in range(0,417):
    if test.loc[i, "Age"] <= 15:
        test.loc[i, "Age"] = 0
    elif test.loc[i, "Age"] >= 60:
        test.loc[i, "Age"] = 2
    else:
        test.loc[i, "Age"] =1

In [13]:
X = pd.get_dummies(data)
y = data.Survived
X_test = pd.get_dummies(test)

In [14]:
X.drop(["Survived", "Cabin_X", "Embarked_S"], axis=1, inplace=True)
X_test.drop(["Cabin_X", "Embarked_S"], axis=1, inplace=True)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=200)

In [16]:
models = [
        Pipeline([('scaler', StandardScaler()),('model', LogisticRegression())]),
        Pipeline([('model', DecisionTreeClassifier())]),
        Pipeline([('scaler', StandardScaler()),('model', SVC())]),
        Pipeline([('model', RandomForestClassifier())]),
        Pipeline([('model', BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5, max_features=0.3))])
]

param_grids = [
    {"model__penalty":["l1","l2"], "model__C":[0.1,1,10,100,1000]},
    {"model__min_samples_leaf":[5,10,15]},
    [
        {"model__C":[0.01,0.1,1,10,100],
         "model__kernel":["linear"]},
        {"model__C":[0.01,0.1,1,10,100],
         "model__kernel":["rbf"],
         "model__gamma":["auto",0.001,0.005]}
    ],
    {"model__n_estimators":[10,50,100]}
]

from sklearn.metrics import accuracy_score

for model, param_grid in zip(models, param_grids):
    gs = GridSearchCV(model,param_grid,cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    print(accuracy_score(gs.best_estimator_.predict(X_test),y_test), gs.best_params_)

0.84 {'model__C': 1, 'model__penalty': 'l2'}
0.88 {'model__min_samples_leaf': 5}
0.835 {'model__C': 100, 'model__gamma': 'auto', 'model__kernel': 'rbf'}
0.865 {'model__n_estimators': 100}
