The sinking of the Titanic is one of the most infamous shipwrecks in history.

On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.

In this challenge, we ask you to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (i.e., name, age, gender, socio-economic class, etc.).

In [0]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import OrdinalEncoder
import time
import math
from sklearn.preprocessing import KBinsDiscretizer
import os

Processing train data

In [0]:
def loadTrainData(filename):
    """Load the training CSV, drop unused columns and impute missing values.

    The columns PassengerId, Name, Cabin, Ticket, Parch and SibSp are
    dropped. Rows whose 'Survived' label is missing are removed and the index
    is reset. Whitespace-only cells are treated as missing, and every column
    that still contains gaps is imputed: 'Age' and 'Fare' with the column
    mean, any other column with its most frequent value.

    Parameters
    ----------
    filename : str or path-like
        Comma-delimited file containing the columns named above.

    Returns
    -------
    tuple
        ``(dataframe, attributes, label)``: the cleaned frame (label column
        still included), the list of feature column names, and the label
        column name ('Survived').
    """
    label = 'Survived'
    unused_columns = ['PassengerId', 'Name', 'Cabin', 'Ticket', 'Parch', 'SibSp']

    dataframe = pd.read_csv(filename, delimiter=",")
    dataframe = dataframe.drop(unused_columns, axis=1)

    print('Before removing data with missing label :',dataframe.shape)
    dataframe = dataframe.dropna(axis=0, subset=[label])
    dataframe = dataframe.reset_index(drop=True)
    # This second report runs after the dropna, so it is labelled "After".
    print('After removing data with missing label :',dataframe.shape)

    non_categorical = ['Age', 'Fare']
    for column in dataframe:
        # Treat empty / whitespace-only strings as missing values.
        dataframe[column] = dataframe[column].replace(r'^\s*$', np.nan, regex=True)
        if dataframe[column].isnull().sum():
            if column not in non_categorical:
                # Categorical column: fill gaps with the most frequent value.
                imp = SimpleImputer(strategy="most_frequent")
            else:
                # Numeric column (Age, Fare): fill gaps with the column mean.
                imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            # SimpleImputer works on 2-D input, hence the reshape round-trip.
            dataframe[column] = imp.fit_transform(dataframe[column].values.reshape(-1, 1)).reshape(-1)
            print (column,' has missing values')

    attributes = list(dataframe.columns.values)
    attributes.remove(label)

    return dataframe,attributes,label


Processing test data

In [0]:
def loadTestData(filename):
    """Read the test CSV, drop unused columns and fill in missing values.

    Name, Cabin, Ticket, Parch and SibSp are removed; PassengerId is kept.
    Whitespace-only cells count as missing. A column with gaps is imputed
    with its mean when it is 'Age' or 'Fare', otherwise with its most
    frequent value. The imputation statistics come from this file itself.

    Parameters
    ----------
    filename : str or path-like
        Comma-delimited file containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    numeric_columns = ('Age', 'Fare')
    unused_columns = ['Name', 'Cabin', 'Ticket', 'Parch', 'SibSp']

    dataframe = pd.read_csv(filename, delimiter=",").drop(unused_columns, axis=1)

    for column in dataframe:
        # Blank or whitespace-only strings are treated as missing.
        dataframe[column] = dataframe[column].replace(r'^\s*$', np.nan, regex=True)
        if not dataframe[column].isnull().sum():
            continue  # nothing to impute in this column

        if column in numeric_columns:
            # Numeric column: replace gaps with the column mean.
            imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        else:
            # Categorical column: replace gaps with the most frequent value.
            imp = SimpleImputer(strategy="most_frequent")

        # The imputer expects a 2-D array; flatten back to 1-D afterwards.
        column_values = dataframe[column].values.reshape(-1, 1)
        dataframe[column] = imp.fit_transform(column_values).reshape(-1)
        print (column,' has missing values')

    return dataframe


Load and encode the training data

In [0]:
# Load and clean the training data. `attributes` lists the feature columns
# and `label` names the target column.
trainData,attributes,label = loadTrainData('/content/train.csv')
# describe() has to be called; printing the bare attribute shows only the
# bound method's repr, not the summary statistics.
print(trainData.describe())

# Encode the categorical columns as integers. map(...).astype(int) raises on
# an unexpected category instead of silently leaving a string in the column.
trainData['Sex'] = trainData['Sex'].map({'male': 0, 'female': 1}).astype(int)
trainData['Embarked'] = trainData['Embarked'].map( {'S': 0, 'Q': 1, 'C': 2} ).astype(int)

# Discretise Age into five left-closed bands, coded 0..4:
#   <15, [15, 30), [30, 45), [45, 60), >=60
# pd.cut assigns every band in one pass, so the result does not depend on the
# order in which bands are written.
trainData['Age'] = pd.cut(
    trainData['Age'],
    bins=[-np.inf, 15, 30, 45, 60, np.inf],
    right=False,
    labels=False,
)

# Discretise Fare the same way, coded 0..4:
#   <10, [10, 25), [25, 50), [50, 100), >=100
trainData['Fare'] = pd.cut(
    trainData['Fare'],
    bins=[-np.inf, 10, 25, 50, 100, np.inf],
    right=False,
    labels=False,
)

print(trainData.head())

Separate Features and Label (Train/Test Split and Scaling Are Currently Disabled)

In [0]:
from sklearn.model_selection import train_test_split

# Split the label off only while it is still a column of trainData, so that
# re-running this cell does not raise KeyError on the second pass.
if label in trainData.columns:
    trainLabel = trainData[label]
    trainData = trainData.drop([label], axis=1)

print(trainData.head())
print(trainLabel.head())

# Optional hold-out split and min-max scaling (currently disabled; the model
# cells below fit and score on the full training data):
# X_train, X_test, y_train, y_test = train_test_split(trainData, trainLabel, random_state=0)
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
# print(X_train.shape)
# print(X_test.shape)


Logistic Regression


In [0]:
from sklearn.linear_model import LogisticRegression

# Fit on the full training data; the score below is measured on that same
# data, so it is a training-set accuracy rather than a hold-out estimate.
logreg = LogisticRegression().fit(trainData, trainLabel)
print(
    'Accuracy of Logistic regression classifier on training set: {:.2f}'.format(
        logreg.score(trainData, trainLabel)
    )
)
# Test-set scoring stays disabled: it needs the X_test / y_test hold-out split.
# print('Accuracy of Logistic regression classifier on test set: {:.2f}'
#      .format(logreg.score(X_test, y_test)))

Decision tree

In [0]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so that Restart & Run All reproduces the same tree; without
# it, ties between equally good splits are broken randomly on every run.
clf = DecisionTreeClassifier(random_state=0).fit(trainData, trainLabel)

# Scored on the data the tree was fitted on, so this is an optimistic
# training-set accuracy, not a hold-out estimate.
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(trainData, trainLabel)))
# Test-set scoring stays disabled: it needs the X_test / y_test hold-out split.
# print('Accuracy of Decision Tree classifier on test set: {:.2f}'
#      .format(clf.score(X_test, y_test)))

K-nearest neighbour

In [0]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with the library's default settings, fitted on the
# full training data and scored on that same data.
knn = KNeighborsClassifier().fit(trainData, trainLabel)
print(
    'Accuracy of K-NN classifier on training set: {:.2f}'.format(
        knn.score(trainData, trainLabel)
    )
)
# Test-set scoring stays disabled: it needs the X_test / y_test hold-out split.
# print('Accuracy of K-NN classifier on test set: {:.2f}'
#      .format(knn.score(X_test, y_test)))

Linear Discriminant Analysis

In [0]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis with default settings, fitted on the full
# training data and scored on that same data.
lda = LinearDiscriminantAnalysis().fit(trainData, trainLabel)
print(
    'Accuracy of LDA classifier on training set: {:.2f}'.format(
        lda.score(trainData, trainLabel)
    )
)
# Test-set scoring stays disabled: it needs the X_test / y_test hold-out split.
# print('Accuracy of LDA classifier on test set: {:.2f}'
#      .format(lda.score(X_test, y_test)))

Gaussian Naive Bayes


In [0]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes, fitted on the full training data and scored on that
# same data. This `gnb` model is the one used later to predict the submission.
gnb = GaussianNB().fit(trainData, trainLabel)
print(
    'Accuracy of GNB classifier on training set: {:.2f}'.format(
        gnb.score(trainData, trainLabel)
    )
)
# Test-set scoring stays disabled: it needs the X_test / y_test hold-out split.
# print('Accuracy of GNB classifier on test set: {:.2f}'
#      .format(gnb.score(X_test, y_test)))

In [0]:

df = loadTestData('/content/test.csv')
# describe() has to be called; printing the bare attribute shows only the
# bound method's repr, not the summary statistics.
print(df.describe())

# Defensive fill-ups. loadTestData already imputes missing values, so these
# are normally no-ops. Age falls back to the column's own mean, which keeps
# the cell runnable on a fresh kernel without any extra global.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna('S')
df['Fare'] = df['Fare'].fillna(0.0)

# Encode the categorical columns with the same integer codes as the training
# data. map(...).astype(int) raises on an unexpected category.
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
df['Embarked'] = df['Embarked'].map( {'S': 0, 'Q': 1, 'C': 2} ).astype(int)

# Discretise Age into five left-closed bands, coded 0..4:
#   <15, [15, 30), [30, 45), [45, 60), >=60
# The band edges are whole numbers, so no integer cast is needed beforehand.
df['Age'] = pd.cut(
    df['Age'],
    bins=[-np.inf, 15, 30, 45, 60, np.inf],
    right=False,
    labels=False,
)

# Discretise Fare the same way, coded 0..4:
#   <10, [10, 25), [25, 50), [50, 100), >=100
df['Fare'] = pd.cut(
    df['Fare'],
    bins=[-np.inf, 10, 25, 50, 100, np.inf],
    right=False,
    labels=False,
)

print(df.head())

# Keep the passenger ids for the submission, then remove them from the
# feature frame.
my_submission = pd.DataFrame({'PassengerId': df['PassengerId']})
df = df.drop(['PassengerId'], axis=1)

# Select the feature columns by name (`attributes` comes from loadTrainData)
# so their set and order match what the model was fitted on.
pred = gnb.predict(df[attributes])

my_submission['Survived'] = pred
my_submission['Survived'] = my_submission['Survived'].astype('int64')
print(my_submission.head())
my_submission.to_csv('submission.csv', index=False)