### Import the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Load the file into a dataframe

In [2]:
passengers = pd.read_csv('passengers.csv')
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


__PassengerId__     =  
__Survived__        = Did the passenger survive? _(1:Yes, 0:No)_    
__Pclass__          = The class the passenger was in _(1:1st, 2:2nd, 3:3rd)_  
__Name__            = Name of the passenger  
__Sex__             = Gender of the passenger  
__Age__             = Age  
__SipSp__           = Number of passenger's spouse or sibling on board  
__Parch__           = Number of passenger's parents or children on board  
__Ticket__          = Ticket Number  
__Fare__            = How much the passenger paid (was charged in British pound)  
__Cabin__           = The cabin number  
__Embarked__        = The port the passenger boarded the ship from _(S:Southampton, C:Cherbourg, Q:Queenstown)_  

### Clean the data 
#### update the sex column so the Male maps to 0 and Female maps to 1

In [3]:
print("Before mapping")
print(passengers['Sex'].head())

passengers["Sex"] = passengers["Sex"].map({'male':0, 'female':1})

print("\n")

print("After mapping")
print(passengers['Sex'].head())

Before mapping
0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object


After mapping
0    0
1    1
2    1
3    1
4    0
Name: Sex, dtype: int64


#### for rows that do not have values in the age column, fill with the mean of the age column

In [4]:
print("Before update")
print(passengers["Age"].head(10))

passengers["Age"].fillna(inplace=True, value=round(passengers["Age"].mean()))
print("\n")

print("After update")
passengers["Age"].head(10)

Before update
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64


After update


0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5    30.0
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

#### create a column FirstClass that stores 1 when the Pclass = 1

In [5]:
passengers["FirstClass"] = passengers['Pclass'].apply(lambda x: 1 if x==1 else 0)
print(passengers[['Pclass','FirstClass']].head())

   Pclass  FirstClass
0       3           0
1       1           1
2       3           0
3       1           1
4       3           0


#### create a column SecondClass that stores 1 when Pclass = 2

In [6]:
passengers['SecondClass'] = passengers['Pclass'].apply(lambda x: 1 if x==2 else 0)
print(passengers[['Pclass','SecondClass']].head())

   Pclass  SecondClass
0       3            0
1       1            0
2       3            0
3       1            0
4       3            0


#### create a column ThirdClass that stores 1 when Pclass = 3

In [7]:
passengers['ThirdClass'] = passengers['Pclass'].apply(lambda x: 1 if x==3 else 0)
print(passengers[['Pclass','ThirdClass']].head())

   Pclass  ThirdClass
0       3           1
1       1           0
2       3           1
3       1           0
4       3           1


### we believe the following columns are the best to determine whether a passenger survived or not
#### Sex,Age,FirstClass,SecondClass

In [8]:
# store the columns in a variable
# separate the result to another variable

features = passengers[['Sex','Age','FirstClass','SecondClass']]
survival = passengers['Survived']

### split the features dataframe into 2 parts:
#### one to train the model and the other to test the model 

In [9]:
train_features, test_features, train_labels, test_labels = train_test_split(features, survival)

### normalize the data so the huge values in some columns do not affect the model

In [10]:
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
print(train_features[:10])

[[-0.71506099  0.17511828 -0.54736724  1.9597397 ]
 [ 1.39848211 -0.44171322 -0.54736724  1.9597397 ]
 [ 1.39848211 -0.90433685  1.82692702 -0.51027185]
 [-0.71506099 -0.51881716 -0.54736724  1.9597397 ]
 [-0.71506099  0.0209104  -0.54736724 -0.51027185]
 [-0.71506099  0.09801434 -0.54736724  1.9597397 ]
 [-0.71506099  1.10036553 -0.54736724 -0.51027185]
 [-0.71506099 -2.21510379 -0.54736724 -0.51027185]
 [-0.71506099  0.0209104   1.82692702 -0.51027185]
 [-0.71506099  0.0209104  -0.54736724 -0.51027185]]


In [11]:
test_features = scaler.fit_transform(test_features)
print(test_features[:10])

[[-0.80737343  1.92407438  1.61145096 -0.50979114]
 [-0.80737343  0.01155061 -0.62055875 -0.50979114]
 [-0.80737343 -0.82995984 -0.62055875 -0.50979114]
 [-0.80737343 -1.97747411 -0.62055875 -0.50979114]
 [-0.80737343  0.47055632 -0.62055875  1.96158764]
 [ 1.23858424  0.20280299 -0.62055875  1.96158764]
 [ 1.23858424 -0.9064608   1.61145096 -0.50979114]
 [ 1.23858424 -1.2124646  -0.62055875 -0.50979114]
 [ 1.23858424  0.01155061 -0.62055875 -0.50979114]
 [ 1.23858424  0.39405537 -0.62055875  1.96158764]]


### create a Logistic Regression model and fit it with training data

In [12]:
model = LogisticRegression()
model.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### show the score of the model that is the percentage of correct classifications using the training data

In [13]:
print(model.score(train_features, train_labels))

0.8068862275449101


#### the model was able to predict 79.04% of the time whether a passenger would survive or not

### let us also check the accuracy of the model on the test data

In [14]:
print(model.score(test_features, test_labels))

0.7757847533632287


#### almost the same accuracy 79.37% as that of the training data

### print the coefficients of the model
#### we want to know how each column affects a passenger's survival or not
#### In order of importance: Sex, FirstClass, SecondClass and Age

In [15]:
print(model.coef_)

[[ 1.23422433 -0.45956007  0.97708017  0.4367949 ]]


### Test the model with sample data

In [16]:
# Jack and Rose are data from the main actor and actress from the Titanic movie
# You is imaginary data 
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
You = np.array([1.0,35,0.0,1.0])

# combine the data into a single numpy array
sample_passengers = np.array([Jack, Rose, You])

# scale the data since we did same for the data above
sample_passengers = scaler.transform(sample_passengers)

# use the model to predict survival
print(model.predict(sample_passengers))

[0 1 1]


#### turns out our model correctly predicted Jack and Rose's fate 
#### in the movie Jack died (0) and Rose survived(1)