# Titanic 

[Kaggle](https://www.kaggle.com/c/titanic)

In [2]:
import numpy as np
import pandas as pd

# Directory prefixes for this competition's files. INPUT_PREFIX is where the
# Kaggle CSVs are read from; OUTPUT_PREFIX is not referenced by the cells
# visible here (presumably reserved for generated submissions).
_DATASET = 'titanic'
INPUT_PREFIX = f'input/{_DATASET}'
OUTPUT_PREFIX = f'output/{_DATASET}'

Load train data:

In [72]:
# Read the labelled training set from the input folder.
train_csv_path = f'{INPUT_PREFIX}/train.csv'
train_data = pd.read_csv(train_csv_path)

# Preview the first ten rows.
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Load test data:

In [69]:
# Read the unlabelled test set (same columns as the training set minus
# 'Survived') from the input folder.
test_data = pd.read_csv(f'{INPUT_PREFIX}/test.csv')

# Preview ten rows, matching the training-set preview and the ten-row output
# saved with this cell (a bare head() would only show five).
test_data.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


### Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Target column to predict.
y = train_data["Survived"]

# Baseline predictors; 'Sex' is a string column and gets one-hot encoded.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])

# get_dummies only emits columns for categories present in the frame it is
# given, so the independently encoded test matrix is forced onto the training
# schema: same columns, same order, zeros for categories absent from test.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Submission-shaped table: one predicted label per test passenger.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})

output.head()

0.0


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


Let's evaluate it on a hold-out split of the training data

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the labelled rows. The full frames are split (not just the
# feature columns) so PassengerId and Survived stay available for the
# comparison table below.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_data['Survived'], test_size=0.33, random_state=42
)

features = ["Pclass", "Sex", "SibSp", "Parch"]

X_train_dummies = pd.get_dummies(X_train[features])
# Encode the hold-out rows separately, then align them to the training
# schema: get_dummies derives its columns from the values present, so the two
# encodings are not guaranteed to match on their own.
X_test_dummies = pd.get_dummies(X_test[features]).reindex(
    columns=X_train_dummies.columns, fill_value=0
)

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X_train_dummies, y_train)

predictions = model.predict(X_test_dummies)

# Predicted label next to the true label for each held-out passenger.
output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Survived': predictions, 'Original': X_test.Survived})

output.head()

Unnamed: 0,PassengerId,Survived,Original
709,710,0,1
439,440,0,0
840,841,0,0
720,721,1,1
39,40,0,1


Сколько совпали?

In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Rows where the predicted label equals the true label.
matches = output["Survived"] == output["Original"]
predicted_ok = int(matches.sum())

print(f'Right ({predicted_ok}) / Wrong ({output.shape[0] - predicted_ok}) / Size ({output.shape[0]})')

# Mean absolute error between true and predicted labels.
mae = mean_absolute_error(y_true=output["Original"], y_pred=output["Survived"])
print(f'Mean absolute error = {mae}')

# Mean squared error between true and predicted labels.
mse = mean_squared_error(y_true=output["Original"], y_pred=output["Survived"])
print(f'Mean squared error = {mse}')

# Accuracy as reported by the fitted estimator on the hold-out matrix.
print(f'Model score = {model.score(X_test_dummies, y_test)}')

Right (236) / Wrong (59) / Size (295)
Mean absolute error = 0.2
Mean squared error = 0.2
Model score = 0.8


## Logistic regression

In [112]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Column names used throughout this cell.
idf = 'PassengerId'
surf = 'Survived'

# Base predictors followed by one indicator column per age bucket
# (AG_0 .. AG_6, matching the codes returned by get_age_group).
features = ["Pclass", "Sex", "Family_members", "Embarked_C", "Embarked_Q", "Embarked_S"]
features.extend(f'AG_{x}' for x in range(7))

def get_age_group(age):
    """Map an age to an integer bucket code.

    Buckets: (0, 16) -> 1, [16, 20) -> 2, [20, 30) -> 3, [30, 40) -> 4,
    [40, 50) -> 5, 50 and above -> 6. Anything that is not strictly
    positive (zero, negatives, NaN) falls into bucket 0.
    """
    # `not age > 0` is also True for NaN, which compares False to everything.
    if not age > 0:
        return 0
    for upper_bound, group in ((16, 1), (20, 2), (30, 3), (40, 4), (50, 5)):
        if age < upper_bound:
            return group
    return 6

def _ensure_dummy_columns(dummies, expected):
    """Return `dummies` with any missing `expected` columns appended as zeros."""
    missing = [name for name in expected if name not in dummies.columns]
    if not missing:
        return dummies
    return dummies.reindex(columns=[*dummies.columns, *missing], fill_value=0)


def prepare(df):
    """Return a copy of `df` extended with the engineered model columns.

    Steps:
      * 'male' -> 1, 'female' -> 0 and NaN -> 0 across the whole frame;
      * Family_members = SibSp + Parch;
      * Age_Group = get_age_group(Age) (missing ages were set to 0 above and
        therefore land in bucket 0);
      * one-hot columns Embarked_* and AG_* appended to the frame.

    The indicator columns Embarked_C/Q/S and AG_0..AG_6 are always present,
    filled with zeros when a value does not occur in `df`, so selecting the
    notebook's `features` list cannot fail on a subset that lacks a category.
    The input frame is not modified.
    """
    df = df.replace('male', 1)
    df = df.replace('female', 0)
    df = df.replace(np.nan, 0)
    # Make the object -> numeric conversion of fully replaced columns (e.g.
    # 'Sex') explicit instead of relying on replace()'s implicit downcasting,
    # which pandas has deprecated.
    df = df.infer_objects()

    df['Family_members'] = df['SibSp'] + df['Parch']
    df['Age_Group'] = df["Age"].apply(get_age_group)

    embarked = pd.get_dummies(df['Embarked'], prefix="Embarked")
    embarked = _ensure_dummy_columns(embarked, ['Embarked_C', 'Embarked_Q', 'Embarked_S'])
    df = pd.concat([df, embarked], axis=1)

    age_groups = pd.get_dummies(df['Age_Group'], prefix="AG")
    age_groups = _ensure_dummy_columns(age_groups, [f'AG_{group}' for group in range(7)])
    df = pd.concat([df, age_groups], axis=1)

    return df

# Label of the ground-truth column in the comparison table. idf and surf are
# already defined at the top of this cell.
origf = 'Original'

# Feature-engineered copy of the training data.
data_only = prepare(train_data)

# Hold out 33% of the rows; the full prepared frame is split so PassengerId
# remains available for the report.
X_train, X_test, y_train, y_test = train_test_split(
    data_only, train_data[surf], test_size=0.33, random_state=42
)

X_train_dummies = pd.get_dummies(X_train[features])
X_test_dummies = pd.get_dummies(X_test[features])

model = LogisticRegression(
    solver='saga',
    penalty='l1',
    multi_class='multinomial',
    max_iter=10000,
    verbose=1,
    n_jobs=-1,
)
model.fit(X_train_dummies, y_train)

predicted = model.predict(X_test_dummies)

# Predicted vs. true label for every held-out passenger.
df_tmp = {idf: X_test[idf], surf: predicted, origf: y_test}
output = pd.DataFrame(df_tmp)

matches = output[surf] == output[origf]
predicted_ok = int(matches.sum())

print(f'Right ({predicted_ok}) / Wrong ({output.shape[0] - predicted_ok}) / Size ({output.shape[0]})')

# Mean absolute error between true and predicted labels.
mae = mean_absolute_error(output[origf], output[surf])
print(f'Mean absolute error = {mae}')

# Mean squared error between true and predicted labels.
mse = mean_squared_error(output[origf], output[surf])
print(f'Mean squared error = {mse}')

print(f'Model score = {model.score(X_test_dummies, y_test)}')

output.head()

convergence after 126 epochs took 0 seconds
Right (242) / Wrong (53) / Size (295)
Mean absolute error = 0.17966101694915254
Mean squared error = 0.17966101694915254
Model score = 0.8203389830508474


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


Unnamed: 0,PassengerId,Survived,Original
709,710,0,1
439,440,0,0
840,841,0,0
720,721,1,1
39,40,1,1


### Чистовик

In [201]:
# Column names for the passenger identifier and the target label.
idf, surf = 'PassengerId', 'Survived'

def get_age_group(age):
    """Return the age bucket code (0-6) for `age`.

    0 is used for non-positive or NaN ages; positive ages get 1 plus the
    number of bucket lower bounds (16, 20, 30, 40, 50) they have reached,
    i.e. (0, 16) -> 1, [16, 20) -> 2, ..., 50 and above -> 6.
    """
    # NaN fails every comparison, so it is caught here together with age <= 0.
    if not age > 0:
        return 0
    lower_bounds = (16, 20, 30, 40, 50)
    return 1 + int(sum(age >= bound for bound in lower_bounds))
    
def get_deck(cabin):
    """Return the first character of a cabin string, or 'M' for non-string values (e.g. NaN)."""
    return cabin[0] if isinstance(cabin, str) else 'M'

def get_embarked(em):
    """Pass string values through unchanged; map anything else (e.g. NaN) to 'UNK'."""
    if not isinstance(em, str):
        return 'UNK'
    return em

def get_fare(f):
    """Map a fare to a bucket code.

    Below 30 -> 1, [30, 50) -> 2, [50, 100) -> 3, 100 and above -> 4.
    A value that satisfies none of the comparisons (NaN) yields 0.
    """
    for upper_bound, bucket in ((30, 1), (50, 2), (100, 3)):
        if f < upper_bound:
            return bucket
    # Only NaN reaches this point without being >= 100.
    return 4 if f >= 100 else 0

# Ordinal codes per deck letter: 'A' -> 9 down to 'M' -> 1. Not applied in the
# code shown here: the `.map(deck)` call in prepare() is commented out.
deck = {letter: 9 - position for position, letter in enumerate('ABCDEFGTM')}

# Integer codes per embarkation value ('UNK' is what get_embarked returns for
# non-strings). Likewise unused while `.map(embarked)` stays commented out.
embarked = dict(C=1, Q=2, S=3, UNK=0)

# Integer codes per title; unused while the `.map(title)` line is commented out.
title = dict(zip(("Mr", "Miss", "Mrs", "Master", "Rare"), range(1, 6)))

# Titles that prepare() replaces with 'Rare'.
least_occuring = [
    'Don', 'Rev', 'Dr', 'Mme', 'Ms',
    'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess', 'Dona',
    'Jonkheer',
]

# Numeric encoding that prepare() applies to the 'Sex' column.
sex = dict(male=1, female=0)

def prepare(df):
    """Build the model-ready feature frame from raw passenger rows.

    Transformations (all applied to a copy; the caller's frame is untouched):
      * Sex       -> numeric via the module-level `sex` mapping;
      * Alone     -> 1 when SibSp + Parch == 0, else 0;
      * Age       -> missing values filled with this frame's mean age, then
                     bucketed with get_age_group;
      * Fare      -> missing values filled with this frame's mean fare, then
                     bucketed with get_fare;
      * Cabin     -> deck letter via get_deck, one-hot encoded as Deck_*;
      * Name      -> title (the letters before the first '.'), titles listed
                     in `least_occuring` collapsed to 'Rare', one-hot encoded
                     as Title_*;
      * Embarked  -> missing values filled with 'S', one-hot encoded as
                     Embarked_*.

    Name, Ticket, PassengerId, Cabin, SibSp, Parch and Embarked are dropped.
    Any other input column (e.g. 'Survived') is passed through unchanged.

    Note: the fill values and the set of one-hot columns are derived from the
    frame passed in, so two different frames can yield different columns.
    """
    # Explicit copy: the input is typically a slice produced by
    # train_test_split, and the column assignments below must not touch it.
    df = df.copy()

    df['Sex'] = df['Sex'].map(sex)

    df['Family_members'] = df['SibSp'] + df['Parch']
    df['Alone'] = df['Family_members'].eq(0).astype(int)

    # Assign the filled series back rather than calling
    # df[col].fillna(..., inplace=True): the chained in-place form operates on
    # a temporary and leaves df unchanged under pandas copy-on-write.
    df['Age'] = df['Age'].fillna(df['Age'].mean()).apply(get_age_group)
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean()).apply(get_fare)

    deck_dummies = pd.get_dummies(df['Cabin'].apply(get_deck), prefix='Deck')

    # Raw string so that '\.' reaches the regex engine as an escaped dot
    # without relying on Python tolerating an unknown string escape.
    titles = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    title_dummies = pd.get_dummies(titles.replace(least_occuring, 'Rare'), prefix='Title')

    ports = df['Embarked'].fillna('S').apply(get_embarked)
    embarked_dummies = pd.get_dummies(ports, prefix='Embarked')

    df = df.drop(['Name', 'Ticket', 'PassengerId', 'Family_members', 'Cabin',
                  'SibSp', 'Parch', 'Embarked'], axis=1)

    return pd.concat([df, deck_dummies, title_dummies, embarked_dummies], axis=1)

# Hold out 33% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(train_data, train_data[surf], test_size=0.33, random_state=42)

# Engineer features for each part and remove the target column.
X_train = prepare(X_train).drop(['Survived'], axis=1)
X_test = prepare(X_test).drop(['Survived'], axis=1)

# prepare() derives its one-hot columns from the rows it receives, so the two
# parts can end up with different columns (e.g. a deck that occurs only in the
# training rows). Keep only the columns both parts share, in training order,
# instead of hard-coding which column to drop for one particular split.
shared_columns = [column for column in X_train.columns if column in X_test.columns]
X_train = X_train[shared_columns]
X_test = X_test[shared_columns]

print(list(X_train.columns))
print(list(X_test.columns))

X_train.head(20)

['Pclass', 'Sex', 'Age', 'Fare', 'Alone', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_M', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
['Pclass', 'Sex', 'Age', 'Fare', 'Alone', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_M', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']


Unnamed: 0,Pclass,Sex,Age,Fare,Alone,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,...,Deck_G,Deck_M,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Embarked_C,Embarked_Q,Embarked_S
6,1,1,6,3,1,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
718,3,1,3,1,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
685,2,1,3,2,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
73,3,1,3,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
882,3,0,3,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
328,3,0,4,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
453,1,1,5,3,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
145,2,1,2,2,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
234,2,1,3,1,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
220,3,1,2,1,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1


In [202]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import VotingClassifier

# Label of the ground-truth column in the comparison table. Defined here so
# this cell does not depend on the logistic-regression cell having been run.
origf = 'Original'

KNN = KNeighborsClassifier(5)
NAIVE = GaussianNB()
SVM = SVC()
# Seeded like the random forest below: an unseeded decision tree can produce
# different fits from run to run, which makes the ensemble irreproducible.
DT = DecisionTreeClassifier(random_state=1)
# NOTE(review): recent scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before upgrading.
LR = LogisticRegression(max_iter=1000, multi_class='multinomial')
RF = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)

# Majority vote over the six classifiers' predicted labels.
Ensemble = VotingClassifier(
    estimators=[('KNN', KNN), ('NB', NAIVE), ('SVM', SVM), ('DT', DT), ('LR', LR), ('RF', RF)],
    voting='hard',
)

Ensemble.fit(X_train, y_train)

predicted = Ensemble.predict(X_test)

# Predicted vs. true label for every held-out passenger.
df_tmp = {surf: predicted, origf: y_test}

output = pd.DataFrame(df_tmp)
output.head()

Unnamed: 0,Survived,Original
709,0,1
439,0,0
840,0,0
720,1,1
39,1,1


In [203]:
from sklearn import metrics

# --- Hold-out split: agreement between predicted and true labels ---
matches = output[surf] == output[origf]
predicted_ok = int(matches.sum())

print(f'Right ({predicted_ok}) / Wrong ({output.shape[0] - predicted_ok}) / Size ({output.shape[0]})')

# Mean absolute error between true and predicted labels.
mae = mean_absolute_error(output[origf], output[surf])
print(f'Mean absolute error = {mae}')

# Mean squared error between true and predicted labels.
mse = mean_squared_error(output[origf], output[surf])
print(f'Mean squared error = {mse}')

print(f'Model score = {Ensemble.score(X_test, y_test)}')

print('======')

# --- Training split: the scores below are computed on X_train / y_train,
# i.e. on the rows the ensemble was fitted on, not on the hold-out rows. ---
Y_pred_rand = (Ensemble.predict(X_train) > 0.5).astype(int)

train_scorers = (
    ('Precision : ', metrics.precision_score),
    ('Accuracy : ', metrics.accuracy_score),
    ('Recall : ', metrics.recall_score),
    ('F1 score : ', metrics.f1_score),
    ('AUC : ', metrics.roc_auc_score),
)
for label, scorer in train_scorers:
    print(label, np.round(scorer(y_train, Y_pred_rand)*100,2))

Right (239) / Wrong (56) / Size (295)
Mean absolute error = 0.18983050847457628
Mean squared error = 0.18983050847457628
Model score = 0.8101694915254237
Precision :  89.56
Accuracy :  86.91
Recall :  73.42
F1 score :  80.69
AUC :  84.17
