In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the pre-processed Titanic passenger data from a .csv file
# (the path is relative to the current working directory)
titanic_csv_path = 'titanic_data_treated.csv'
titanic_df = pd.read_csv(titanic_csv_path)

In [6]:
# Function to measure the accuracy of a set of predictions
def accuracy_score(truth, pred):
    """ Returns a message with the accuracy score for input truth and predictions.

    Values are compared by position, not by pandas index label, so the inputs
    may be lists, NumPy arrays, or Series whose indexes differ (for example a
    shuffled column compared against freshly built predictions).

    Parameters
    ----------
    truth : sequence
        The true outcomes.
    pred : sequence
        The predicted outcomes, in the same order as ``truth``.

    Returns
    -------
    str
        The accuracy as a percentage with two decimals, or an error message
        when the two inputs do not have the same length.
    """

    # Ensure that the number of predictions matches number of outcomes
    if len(truth) != len(pred):
        return "Number of predictions does not match number of outcomes!"

    # Converting to arrays drops index labels, which would otherwise make the
    # comparison raise for differently-labelled Series, and gives plain lists
    # an element-wise `==` instead of a single bool
    matches = np.asarray(truth) == np.asarray(pred)

    # Calculate and return the accuracy as a percent
    return "Predictions have an accuracy of {:.2f}%.".format(matches.mean() * 100)

In [8]:
# Now let's try to predict without using any machine learning technique,
# just if/else style rules based on the graphs we saw in the last post
def survived_or_not(data):
    """ Predicts survival for every passenger using hand-written rules.

    Rules (1 = survived, 0 = did not survive):

    * 'male': predicted to survive only when younger than 18 and not in
      class 3.
    * 'female': predicted to survive unless she is in class 3 and has
      ``SibSp > 0`` or ``Parch > 2``.
    * any other ``Sex`` value (missing, or no longer the strings
      'male'/'female'): predicted 0, so that every row always receives
      exactly one prediction.

    A missing ``Age`` never satisfies ``Age < 18``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Sex', 'Age', 'Pclass', 'SibSp' and 'Parch'.

    Returns
    -------
    pandas.Series
        One 0/1 prediction per row of ``data``, labelled with ``data.index``
        so it lines up with columns taken from the same frame.
    """
    is_male = data['Sex'] == 'male'
    is_female = data['Sex'] == 'female'
    in_third_class = data['Pclass'] == 3

    # Males younger than 18 who aren't in class 3 survived more often
    male_survives = is_male & (data['Age'] < 18) & ~in_third_class

    # Females from class 3 died more often when they had more loved ones on board
    female_dies = in_third_class & ((data['SibSp'] > 0) | (data['Parch'] > 2))
    female_survives = is_female & ~female_dies

    # Rows matching neither rule set (including unknown Sex) stay at 0
    survived = male_survives | female_survives

    # Return our predictions
    return pd.Series(survived.values.astype('int64'), index=data.index)
# Run the rule-based predictor on every passenger and report how often it is right
predictions = survived_or_not(titanic_df)
accuracy_report = accuracy_score(titanic_df['Survived'], predictions)
print(accuracy_report)

Predictions have an accuracy of 81.79%.


In [9]:
# Change female and male in the Sex column to 0 and 1, for training.
# Values that are not keys of the mapping (including the 0/1 codes written by
# an earlier run of this cell) are passed through unchanged, so re-running the
# cell does not turn the whole column into NaN.
sex_gender_dict = {'female': 0, 'male': 1}
titanic_df['Sex'] = titanic_df['Sex'].map(lambda sex: sex_gender_dict.get(sex, sex))
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,1,22.0,1,0
1,2,1,1,0,38.0,1,0
2,3,1,3,0,26.0,0,0
3,4,1,1,0,35.0,1,0
4,5,0,3,1,35.0,0,0


In [120]:
# Now let's try with a machine learning algorithm
import sklearn.utils  # also binds the name `sklearn` and guarantees the submodule is loaded
from sklearn import preprocessing

# Fixed seed so the shuffle, and therefore the train/test split and the
# scores below, are the same on every run
RANDOM_STATE = 42

# Shuffle dataframe to avoid bias from the row order of the dataset.
# sort_index() first puts the rows back in index order, so re-running this
# cell reproduces the same shuffle instead of shuffling the previous shuffle.
titanic_df = sklearn.utils.shuffle(titanic_df.sort_index(), random_state=RANDOM_STATE)

# Define input and output of a model.
# 'Survived' is the target and 'PassengerId' is only a row identifier, so
# neither is used as a feature. 'Unnamed: 0' is dropped as well in case the
# CSV carries a saved index column; errors='ignore' skips it when absent.
non_feature_columns = ['Survived', 'PassengerId', 'Unnamed: 0']
X = titanic_df.drop(non_feature_columns, axis=1, errors='ignore').values # Input
y = titanic_df['Survived'].values # Output

# Define test dataset size
test_size = 100

# Standardise every feature (zero mean, unit variance). The mean and variance
# are learned from the training rows only, so nothing about the held-out test
# rows leaks into the inputs the model is trained on.
scaler = preprocessing.StandardScaler().fit(X[:-test_size])
X = scaler.transform(X)

# Defining test and train dataset: the last `test_size` shuffled rows are held out
X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

In [134]:
# Training a support vector machine model
from sklearn import svm

# Cubic polynomial kernel; kernel='rbf' with the same gamma='auto' is an
# alternative worth experimenting with
svc_settings = {'kernel': 'poly', 'degree': 3, 'gamma': 'auto'}
model = svm.SVC(**svc_settings)
model.fit(X_train, y_train)

# Mean accuracy on the held-out test rows (displayed as the cell output)
model.score(X_test, y_test)

0.85

In [130]:
from sklearn import linear_model

# Logistic regression for comparison with the SVM above.
# C is the inverse of the regularisation strength, so C=1e5 leaves the model
# almost unregularised.
# `multi_class` is intentionally left at its default: the target here is a
# 0/1 outcome, and the parameter is deprecated in recent scikit-learn
# releases, so passing it only triggers a warning now and an error once it
# is removed.
log = linear_model.LogisticRegression(solver='lbfgs', C=1e5)
log.fit(X_train, y_train)

# Mean accuracy on the held-out test rows (displayed as the cell output)
log.score(X_test, y_test)

0.86

In [None]:
# Note: the scores above depend on which passengers the shuffle places in the test set.
# To see the predicted value next to the actual value for each test passenger,
# run the loop below.
# All test rows are predicted in one call, and the loop variables are not
# named X / y so that the notebook-level feature matrix `X` and label vector
# `y` are not overwritten by a single row and label.
test_predictions = log.predict(X_test)
for predicted, actual in zip(test_predictions, y_test):
    print(f'Model: {predicted}, Actual: {actual}')