## The second try to improve my score
The result was 77%.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [5]:
# Read the train and test CSV files from the local data/ directory.
trainData, testData = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

# Title

In [7]:
def _title_from_name(full_name):
    """Take the segment after the first comma and return its stripped text before the first period."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()


# Derive a "Title" column from "Name" on both frames.
for frame in (trainData, testData):
    frame["Title"] = frame["Name"].apply(_title_from_name)

In [9]:
# List the distinct titles extracted from the training names.
trainData['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [10]:
# Count how many training rows carry each title.
trainData['Title'].value_counts()

Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64

In [11]:
# Collapse the raw titles into a handful of common groups.
# 'Dona' is included as the feminine counterpart of 'Don': the mapping is also
# applied to the test frame, which may contain titles the training frame lacks.
titleGroups = {
    'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
    'Don': 'Mr', 'Rev' : 'Rev', 'Dr' : 'Dr', 'Mme': 'Mrs',
    'Ms': 'Miss', 'Major': 'Mr', 'Lady': 'Mrs', 'Sir': 'Mr',
    'Mlle': 'Miss', 'Col': 'Mr', 'Capt': 'Mr',
    'the Countess': 'Mrs', 'Jonkheer': 'Mr', 'Dona': 'Mrs',
}

for frame in (trainData, testData):
    grouped_titles = frame['Title'].map(titleGroups)
    # Series.map yields NaN for any title that is not a key of titleGroups.
    # Left alone, those NaNs would be filled with the most frequent title by
    # the categorical imputer in the modelling pipeline, so fail loudly and
    # name the offending titles instead of mislabelling rows silently.
    unmapped = frame.loc[grouped_titles.isna(), 'Title'].dropna().unique()
    if len(unmapped) > 0:
        raise ValueError(
            "Titles missing from titleGroups: {}".format(sorted(unmapped))
        )
    frame['Title'] = grouped_titles

# Every group name is itself a key of titleGroups, so re-running this cell is safe.
trainData['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Rev', 'Dr'], dtype=object)

# Cabin and Ticket

In [13]:
import re

# Deck letter -> integer code. "U" is the letter of the "U0" placeholder that
# replaces missing cabins below.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [trainData, testData]


def _first_letter_run(cabin_value):
    """Return the first run of ASCII letters found in a cabin string."""
    return re.search("([a-zA-Z]+)", cabin_value).group()


for dataset in data:
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    deck_codes = dataset['Cabin'].map(_first_letter_run).map(deck)
    # Letters that are not keys of `deck` come back as NaN from the lookup;
    # they are coded as 0 before the column is cast to int.
    dataset['Deck'] = deck_codes.fillna(0).astype(int)

# Distribution of deck codes in the training frame.
trainData['Deck'].value_counts()

Deck
8    687
3     59
2     47
4     33
5     32
1     15
6     13
7      4
0      1
Name: count, dtype: int64

In [14]:
# The Deck column now carries the cabin information, so remove the raw
# Cabin column from both frames.
trainData, testData = (
    frame.drop(columns=['Cabin']) for frame in (trainData, testData)
)

# Family size

In [15]:
# Fam_size = SibSp + Parch + 1 (the extra 1 presumably counts the passenger
# themself -- confirm against the dataset description).
for frame in (trainData, testData):
    frame['Fam_size'] = frame['SibSp'].add(frame['Parch']).add(1)

# Family Type

In [16]:
# Bucket Fam_size into right-closed intervals (0, 1], (1, 4], (4, 7], (7, 11];
# sizes outside that range get no label (NaN).
fam_size_edges = [0,1,4,7,11]
fam_type_names = ['Solo', 'Small', 'Big', 'Huge']

for frame in (trainData, testData):
    frame["Fam_type"] = pd.cut(frame['Fam_size'], fam_size_edges, labels=fam_type_names)

# Train the models

In [18]:
# import all sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [17]:
# Predictor columns fed to the model; the target is the Survived column.
features = ['Pclass', 'Fare', 'Title', 'Embarked', 'Fam_type', 'Deck']
X, y = trainData[features], trainData['Survived']
X.head()

Unnamed: 0,Pclass,Fare,Title,Embarked,Fam_type,Deck
0,3,7.25,Mr,S,Small,8
1,1,71.2833,Mrs,C,Small,3
2,3,7.925,Miss,S,Solo,8
3,1,53.1,Mrs,S,Small,3
4,3,8.05,Mr,S,Solo,8


## the pipeline

In [19]:
numerical_cols = ['Fare']
categorical_cols = ['Pclass', 'Title', 'Embarked', 'Fam_type', 'Deck']

# Numeric columns: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Chain preprocessing and the classifier into a single estimator.
forest = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=0)
titanic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', forest),
])

# Fit on the full training set; this fitted pipeline is what later cells use.
titanic_pipeline.fit(X, y)

# Mean accuracy over 10 cross-validation folds of the whole pipeline.
fold_scores = cross_val_score(titanic_pipeline, X, y, cv=10)
print('cross_val_score: {:.3f}'.format(fold_scores.mean()))

cross_val_score: 0.824


In [20]:
# Select the same feature columns from the test frame and predict with the
# pipeline fitted on the training data.
X_test = testData[features]
predictions = titanic_pipeline.predict(X_test)

# submission

In [21]:
# Two-column submission frame: PassengerId from the test frame next to the
# predicted Survived value, written without the index column.
output = testData[['PassengerId']].assign(Survived=predictions)
output.to_csv('data/my_sub.csv', index=False)