In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the passenger records from 'passengers.csv' into the DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='passengers.csv')

# Preview the first five rows as the cell's displayed result.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Turn 'Sex' into an ordered categorical whose category order is ['male', 'female'].
df['Sex'] = pd.Categorical(
    values=df['Sex'],
    categories=['male', 'female'],
    ordered=True,
)

# Store the integer category codes in a new 'SexValue' column,
# so that male -> 0 and female -> 1 (the position of each category in the list above).
df['SexValue'] = df['Sex'].cat.codes

# Show the first five rows to check that 'SexValue' lines up with 'Sex'.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexValue
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [4]:
# Impute missing 'Age' entries with the mean of the 'Age' column (computed over all rows of df).
df = df.assign(Age=df['Age'].fillna(df['Age'].mean()))

In [8]:
# Earlier check, now disabled: print(df['Age'].values)
# -> it showed too many NaN values in the Age column.
# Count how many passengers fall into each passenger class (Pclass).
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [9]:
# Add a 'FirstClass' indicator column: 1 for passengers whose Pclass is 1, 0 for everyone else.
df['FirstClass'] = (df['Pclass'] == 1).astype('int64')

# Sanity check: the number of 1s should equal the count reported for class 1 by
# df.Pclass.value_counts() (216 in the earlier output).
print(df['FirstClass'].value_counts())


0    675
1    216
Name: FirstClass, dtype: int64


In [12]:
# Add a 'SecondClass' indicator column: 1 for passengers whose Pclass is 2, 0 for everyone else.
df['SecondClass'] = (df['Pclass'] == 2).astype('int64')

# The value counts could be compared against df.Pclass.value_counts() as was done for
# 'FirstClass'; here the first five rows are inspected instead.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexValue,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,0


In [13]:
# Independent variables (model inputs) for the regression, kept in the DataFrame `features`.
features = df.loc[:, ['SexValue', 'Age', 'FirstClass', 'SecondClass']]

# Dependent variable (the label to predict), kept in `survival`.
survival = df.loc[:, 'Survived']

# Display the first five rows of `features`.
features.head()

Unnamed: 0,SexValue,Age,FirstClass,SecondClass
0,0,22.0,0,0
1,1,38.0,1,0
2,1,26.0,0,0
3,1,35.0,1,0
4,0,35.0,0,0


In [14]:
# Split features and labels into an 80% training set and a 20% test set.
# A fixed random_state makes the shuffle reproducible, so the split (and every score
# computed from it) is identical on each Restart & Run All instead of changing per run.
features_training, features_test, survival_training, survival_test = train_test_split(
    features,
    survival,
    train_size=.8,
    test_size=.2,
    random_state=42,
)


In [15]:
# Standardize the features before running the logistic regression.
# The scaler is fitted on the training set only; the test set is then transformed
# with that same fitted scaler.
scaler = StandardScaler().fit(features_training)
features_training = scaler.transform(features_training)
features_test = scaler.transform(features_test)


In [16]:
# Instantiate a logistic-regression classifier with its default settings.
model = LogisticRegression()

# Train it on the training features and their survival labels.
# fit() is the last expression, so its return value is what the cell displays.
model.fit(X=features_training, y=survival_training)

LogisticRegression()

In [19]:
# Accuracy on the training set. For a classifier, score() is the mean accuracy
# (fraction of correctly predicted labels), not R**2.
# Rule of thumb used in this notebook: a score of about 0.70 is considered good.
training_score = model.score(X=features_training, y=survival_training)
print(training_score)

0.7935393258426966


In [20]:
# Accuracy of the model on the held-out test set; the bare name on the last
# line makes the value the cell's displayed output.
test_score = model.score(X=features_test, y=survival_test)
test_score

0.7486033519553073

In [21]:
# Coefficients of the fitted model (read from model.coef_), printed for inspection.
# The printed row has one value per column of ['SexValue','Age','FirstClass','SecondClass'].
features_coef = model.coef_
print(features_coef)

[[ 1.30491237 -0.48465719  1.07007222  0.48768817]]


In [22]:
# Features with larger positive coefficients increase the predicted probability of survival.
# Comparing the four coefficients of the independent variables ['SexValue','Age','FirstClass','SecondClass'],
# 'SexValue' (0 = male, 1 = female) has the highest coefficient value, so it is the feature that most strongly determines the survival chances in this model.

In [24]:
# Feature vectors for three sample passengers, stored as floats.
# Each vector follows the column order ['SexValue', 'Age', 'FirstClass', 'SecondClass'].
Jack = np.array([0.0, 20.0, 0.0, 0.0], dtype=float)
Rose = np.array([1.0, 17.0, 1.0, 0.0], dtype=float)
You = np.array([0.0, 25.0, 0.0, 1.0], dtype=float)

In [25]:
# Stack the three passenger vectors row-wise into one 2-D array (one row per passenger),
# then display it.
sample_passengers = np.vstack((Jack, Rose, You))
sample_passengers

array([[ 0., 20.,  0.,  0.],
       [ 1., 17.,  1.,  0.],
       [ 0., 25.,  0.,  1.]])

In [26]:
# Scale the sample passengers with the scaler that was fitted on the training data.
# The scaler was fitted on a DataFrame, so the raw array is first labelled with the same
# column names (features.columns); passing an unlabelled array makes recent scikit-learn
# versions warn that X does not have valid feature names.
# NOTE: this overwrites sample_passengers with its scaled version, so re-running the cell
# without rebuilding the raw array first would scale the data a second time.
sample_passengers = scaler.transform(
    pd.DataFrame(sample_passengers, columns=features.columns)
)
print(sample_passengers)

[[-0.74698519 -0.76557783 -0.55571893 -0.53621134]
 [ 1.33871463 -0.99932711  1.79947082 -0.53621134]
 [-0.74698519 -0.37599571 -0.55571893  1.8649363 ]]


In [90]:
# Predict who will survive among the sample passengers.
# predict() takes the feature matrix and returns a vector with one label (1 or 0) per sample.
survival_predict = model.predict(X=sample_passengers)
print(survival_predict)

[0 1 0]


In [27]:
# From our survival_predict output we can see that 'Rose' will survive, since [0 1 0] => [Jack, Rose, You], where the value 0 refers to not surviving and 1 refers to surviving

In [28]:
# Survival probabilities for each sample passenger.
# Column 1 is the probability of the passenger perishing on the Titanic,
# column 2 is the probability of surviving.
survival_prob = model.predict_proba(X=sample_passengers)

print(survival_prob)

[[0.88514035 0.11485965]
 [0.03512344 0.96487656]
 [0.74265954 0.25734046]]
