In [118]:
import warnings
warnings.simplefilter(action = 'ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
def ignore_warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing.

    Always returns None.

    NOTE(review): not referenced in the visible cells -- confirm whether it
    is still needed.
    """
    return None
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Load the data

In [119]:
# Read the passenger records from 'passengers.csv' (path is relative to the
# working directory), then preview the first five rows.
passengers = pd.read_csv(filepath_or_buffer='passengers.csv')
passengers.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Data Cleaning

In [120]:
# Per column: True when at least one value is missing.
passengers.isnull().any(axis=0)

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [121]:
# Encode sex numerically: male -> 0, female -> 1. Values missing from the
# lookup become NaN, so re-running this cell on already-encoded data would
# blank the column.
sex_codes = {'male': 0, 'female': 1}
passengers['Sex'] = passengers['Sex'].map(sex_codes)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [122]:
# Fill missing ages with the mean age rounded to the nearest whole number.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the `passengers['Age']` selection: under pandas Copy-on-Write that
# selection is a temporary object, so an in-place fill on it would never
# reach `passengers`.
mean_age = round(passengers['Age'].mean())
passengers['Age'] = passengers['Age'].fillna(mean_age)

In [123]:
# Re-check which columns still contain missing values.
passengers.isnull().any(axis=0)

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [124]:
# Pairwise correlation between the numeric columns.
# The numeric columns are selected explicitly because, from pandas 2.0 on,
# DataFrame.corr() no longer drops text columns (Name, Ticket, Cabin,
# Embarked) by itself and raises on them instead.
passengers.select_dtypes(include='number').corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.033019,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,0.543351,-0.070657,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.329727,0.083081,0.018443,-0.5495
Sex,-0.042939,0.543351,-0.1319,1.0,-0.08466,0.114631,0.245489,0.182333
Age,0.033019,-0.070657,-0.329727,-0.08466,1.0,-0.23244,-0.18033,0.090632
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.23244,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,0.245489,-0.18033,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.182333,0.090632,0.159651,0.216225,1.0


In [125]:
# Indicator column: 1 when Pclass is 1, otherwise 0.
passengers['FirstClass'] = passengers['Pclass'].eq(1).astype('int64')

In [126]:
# Indicator column: 1 when Pclass is 2, otherwise 0.
passengers['SecondClass'] = passengers['Pclass'].eq(2).astype('int64')

In [127]:
# Indicator column: 1 when Pclass is 3, otherwise 0.
passengers['ThirdClass'] = passengers['Pclass'].eq(3).astype('int64')

In [128]:
# Preview the first five rows, including the new class indicator columns.
passengers.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass,ThirdClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0,1


Select and Split the Data

In [129]:
# Predictors: sex, age and the first/second-class indicators.
# NOTE(review): ThirdClass is left out -- presumably because it is implied
# when both other class indicators are 0; confirm.
predictor_columns = ['Sex', 'Age', 'FirstClass', 'SecondClass']
features = passengers[predictor_columns]

# Target: the Survived column.
survival = passengers.Survived

In [130]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# repeatable, so the scores and coefficients computed from this split are the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    survival,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

Normalize the Data

In [131]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [132]:
# Fit a logistic-regression classifier (default settings) on the scaled
# training data
model = LogisticRegression().fit(X_train, y_train)

# Score the fitted model on the training split and on the held-out test split
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

# Report the training score first, then the test score
print(train_accuracy)
print(test_accuracy)


0.797752808988764
0.7877094972067039


In [133]:
# Show the raw coefficient array, then pair each coefficient with the name of
# the feature it belongs to (same order as the training columns).
print(model.coef_)
feature_names = ['Sex','Age','FirstClass','SecondClass']
print([(name, weight) for name, weight in zip(feature_names, model.coef_[0])])

[[ 1.21843142 -0.41779978  1.02340094  0.5547322 ]]
[('Sex', 1.218431423705817), ('Age', -0.4177997784036993), ('FirstClass', 1.0234009438698604), ('SecondClass', 0.5547322012915189)]


In [134]:
# Sample passenger features, in the same order as the training columns:
# [Sex, Age, FirstClass, SecondClass]
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
Isaac = np.array([0.0,20.0,0.0,1.0])

# Combine the passenger arrays into a frame that carries the training column
# names. The scaler was fitted on a DataFrame, so giving it a bare array
# triggers scikit-learn's "X does not have valid feature names" warning and
# relies purely on positional order.
sample_frame = pd.DataFrame(
    np.vstack([Jack, Rose, Isaac]),
    columns=list(features.columns),
    index=['Jack', 'Rose', 'Isaac'],
)

# Scale the sample passenger features with the already-fitted scaler
sample_passengers = scaler.transform(sample_frame)
print(sample_passengers)

# Make survival predictions: class probabilities first, then predicted labels
print(model.predict_proba(sample_passengers))
print(model.predict(sample_passengers))

[[-0.74015275 -0.73682034 -0.57735027 -0.5078883 ]
 [ 1.35107247 -0.96358591  1.73205081 -0.5078883 ]
 [-0.74015275 -0.73682034 -0.57735027  1.96893685]]
[[0.89539257 0.10460743]
 [0.05421013 0.94578987]
 [0.68418528 0.31581472]]
[0 1 0]
