# Titanic survival prediction



---
We will predict the survival of a person based on given set of attributes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from IPython.display import HTML
import warnings


In [2]:
def numericValues(pdSeries, fillNullValues = True):
    itemNo = 0
    for item in pdSeries.unique():
        if pd.notna(item):
            pdSeries = pdSeries.replace(item,itemNo)
        elif fillNullValues:
            pdSeries = pdSeries.fillna(itemNo)
        itemNo+=1
    return pdSeries

In [3]:
csvTrainData = pd.read_csv('./train.csv')
trainData = csvTrainData
csvTestData = pd.read_csv('./test.csv')
testData = csvTestData
csvSurvivedData = pd.read_csv('./gender_submission.csv')
survivedData = csvSurvivedData



In [4]:
shapeTrain = trainData.shape
print("Shape of data : "+str((shapeTrain))+"\n")

cols = list(trainData.columns)
print("columns : "+str(cols)+"\n")

# print("\n")
(trainData.info())


Shape of data : (891, 12)

columns : ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


---

### Null Values
Now I will check null values present in the training dataset


In [5]:
nullPercent = ((trainData.isnull().sum()/891).sort_values(ascending=False)*100).round(2)
nullCount = trainData.isnull().sum().sort_values(ascending=False)
nullValues =pd.concat([nullCount,nullPercent], axis=1, keys=["count","percentage"])

nullValues.head(5)

Unnamed: 0,count,percentage
Cabin,687,77.1
Age,177,19.87
Embarked,2,0.22
Fare,0,0.0
Ticket,0,0.0


We can see that the Cabin column is having highest number of null values, i.e., 687 which is 77% of total. This column will not be useful for us because of that reason.

In [6]:
print(trainData.Sex.unique())

['male' 'female']


In [7]:
# trainData.Sex = numericValues(trainData.Sex)
trainData['Sex'] = np.where(trainData['Sex'] == 'male',1,0)
print(trainData.Sex.unique())

[1 0]


In [8]:
print(trainData.Parch.unique())

[0 1 2 5 3 4 6]


In [9]:
trainData.Parch = pd.to_numeric(trainData['Parch'])
print(trainData.Parch.unique())

[0 1 2 5 3 4 6]


In [10]:
trainData.Cabin.isnull().sum()

687

Since most of the values are null in Cabin column, we will remove this column altogether.

In [11]:
print(cols)
trainDataCleaned = trainData.drop(columns='Cabin',axis=1)
cols = list(trainDataCleaned.columns)
print("\nUpdated columns:\n"+str(cols))

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Updated columns:
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked']


In [12]:
# Embarked: The port in which a passenger has embarked. 
#     C - Cherbourg, 
#     S - Southampton, 
#     Q = Queenstown
print(trainDataCleaned.Embarked.unique())

['S' 'C' 'Q' nan]


In [13]:
trainDataCleaned.Embarked = numericValues(trainDataCleaned.Embarked)
print(trainDataCleaned.Embarked.unique())

[0. 1. 2. 3.]


In [14]:
trainDataCleaned.info()
trainDataCleaned.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       891 non-null float64
dtypes: float64(3), int64(6), object(2)
memory usage: 76.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1.0


In [15]:
x=pd.pivot_table(trainDataCleaned,index = 'Embarked')
print(x)

                Age       Fare     Parch  PassengerId    Pclass       Sex  \
Embarked                                                                    
0.0       29.445397  27.079812  0.413043   449.527950  2.350932  0.684783   
1.0       30.814769  59.954144  0.363095   445.357143  1.886905  0.565476   
2.0       28.089286  13.276030  0.168831   417.896104  2.909091  0.532468   
3.0       50.000000  80.000000  0.000000   446.000000  1.000000  0.000000   

             SibSp  Survived  
Embarked                      
0.0       0.571429  0.336957  
1.0       0.386905  0.553571  
2.0       0.428571  0.389610  
3.0       0.000000  1.000000  


In [16]:
display(trainDataCleaned[trainDataCleaned["Embarked"]==3.0])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",0,38.0,0,0,113572,80.0,3.0
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",0,62.0,0,0,113572,80.0,3.0


In [17]:
survivedData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int64
dtypes: int64(2)
memory usage: 6.6 KB


In [18]:
print("Survived :")
print(list(survivedData["PassengerId"]))
print("Train data : ")
(trainData.head())

Survived :
[892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [19]:
trainDataCleaned = trainDataCleaned.drop(['Name','Ticket'],axis=1) 

In [20]:
trainDataCleaned['Age'] = trainDataCleaned.Age.fillna(trainDataCleaned[trainDataCleaned.Age.notnull()].Age.mean())

In [21]:
trainDataCleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       891 non-null float64
dtypes: float64(3), int64(6)
memory usage: 62.7 KB


In [22]:
x_train = trainDataCleaned.iloc[:,2:]
y_train = trainDataCleaned.iloc[:,1]
display(x_train.head())
display(y_train.head())


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,0.0
1,1,0,38.0,1,0,71.2833,1.0
2,3,0,26.0,0,0,7.925,0.0
3,1,0,35.0,1,0,53.1,0.0
4,3,1,35.0,0,0,8.05,0.0


0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [23]:
logRegr = LogisticRegression()
logRegr.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
x_test = testData.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'],axis=1)


In [25]:
x_test['Age'] = x_test.Age.fillna(x_test[x_test.Age.notnull()].Age.mean())
x_test['Fare'] = x_test.Fare.fillna(x_test[x_test.Fare.notnull()].Fare.mean())
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


In [26]:
x_test.Embarked = numericValues(x_test.Embarked)

In [27]:
x_test.Sex = np.where(x_test.Sex == 'male',1,0)
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null int64
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Embarked    418 non-null int64
dtypes: float64(2), int64(5)
memory usage: 22.9 KB


In [28]:
y_predict = logRegr.predict(x_test)

In [29]:
confidencePercent = round(logRegr.score(x_train,y_train)*100,2)
print(confidencePercent)

80.25


In [30]:
y_test_Actual = survivedData['Survived'] 

In [38]:
print(logRegr.score(x_test,y_test_Actual))
submission =  pd.DataFrame({"PassengerId" : survivedData['PassengerId'], "Survived" : y_predict})
# submission.head()
submission.to_csv("./titanicPredictionLogReg.csv",index = False)

0.937799043062201


---
Lets print out for some plots

In [32]:
# print(list(cabinRemoved.Age))
# sns.violinplot(cabinRemoved.Age)
# plt.rcParams['figure.figsize'] = [20, 15]
# fig, ax = plt.subplots()

# bins1 = [0,10,20,30,40,50,60,70,80,90];

# sns.distplot(cabinRemoved[(cabinRemoved.Sex==0) & cabinRemoved.Age.notnull()].Age, \
#              bins = bins1, hist_kws={"alpha": 0.7, "edgecolor" : "black"}, color = 'pink', kde=False);
# sns.distplot(cabinRemoved[(cabinRemoved.Sex==0) & cabinRemoved.Age.notnull() & cabinRemoved.Survived==1].Age, \
#              bins =bins1, hist_kws={"alpha": 0.5}, kde=False);

# for i in range(len(bins1)):
#     ax.text(bins1[i]-0.05,cabinRemoved[(cabinRemoved.Sex==0) & cabinRemoved.Age.notnull()].Age.values[i]+10, \
#             str(cabinRemoved[(cabinRemoved.Sex==0) & cabinRemoved.Age.notnull()].Age.values[i]),fontsize='large',\
#             horizontalalignment='center')



# plt.show()

In [33]:
# cabinRemoved[(cabinRemoved.Sex==0) & cabinRemoved.Age.notnull()].Age.idxmax()
# cabinRemoved.iloc[630]

In [34]:
# sns.distplot(cabinRemoved[(cabinRemoved.Sex==0) & cabinRemoved.Age.notnull()].Age, kde=False);
# plt.rcParams['figure.figsize'] = [20, 15]
# fig, ax = plt.subplots()

# sns.distplot(cabinRemoved[(cabinRemoved.Sex==1) & cabinRemoved.Age.notnull()].Age, color = 'pink', \
#              bins =[0,10,20,30,40,50,60,70], hist_kws={"alpha": 0.7, "edgecolor":"red"}, kde = False);
# sns.distplot(cabinRemoved[(cabinRemoved.Sex==1) & cabinRemoved.Age.notnull() & cabinRemoved.Survived==1].Age, \
#              bins =[0,10,20,30,40,50,60,70], hist_kws={"alpha": 0.5, "edgecolor":"red"}, kde=False);
# plt.show()

In [35]:
# cabinRemoved.Age.notnull
# cabinRemoved.info()


In [36]:
# missingno.matrix(cabinRemoved,figsize = (15,10))