In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [9]:
# Read the passenger records from the CSV file into a DataFrame
passengers = pd.read_csv("passengers.csv")

# Preview the first five rows as a quick sanity check of the load
print(passengers.head(5))

   PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
0            1         0       3  ...   7.2500   NaN         S
1            2         1       1  ...  71.2833   C85         C
2            3         1       3  ...   7.9250   NaN         S
3            4         1       1  ...  53.1000  C123         S
4            5         0       3  ...   8.0500   NaN         S

[5 rows x 12 columns]


In [10]:
# Encode the sex column numerically (male -> 0, female -> 1).
# NOTE: this overwrites the column, so re-running this cell without first
# reloading the CSV maps the already-encoded 0/1 values to NaN.
passengers['Sex'] = passengers['Sex'].map({'male': 0, 'female': 1})

# Replace missing ages with the mean age. The result is assigned back to the
# frame instead of calling fillna(inplace=True) on the selected column: the
# chained in-place form operates on an intermediate object, raises a
# FutureWarning on pandas 2.x, and leaves `passengers` unchanged when
# Copy-on-Write is active.
# NOTE(review): the mean is taken over every row, before the train/test split
# performed later in the notebook -- confirm this is acceptable.
passengers['Age'] = passengers['Age'].fillna(passengers['Age'].mean())

# Indicator column: 1 for first-class passengers, 0 otherwise
passengers["FirstClass"] = (passengers["Pclass"] == 1).astype("int64")

# Indicator column: 1 for second-class passengers, 0 otherwise
# (third class is the implicit baseline: both indicators are 0)
passengers["SecondClass"] = (passengers["Pclass"] == 2).astype("int64")

print(passengers[["Sex","Age","FirstClass","SecondClass"]])

     Sex        Age  FirstClass  SecondClass
0      0  22.000000           0            0
1      1  38.000000           1            0
2      1  26.000000           0            0
3      1  35.000000           1            0
4      0  35.000000           0            0
..   ...        ...         ...          ...
886    0  27.000000           0            1
887    1  19.000000           1            0
888    1  29.699118           0            0
889    0  26.000000           1            0
890    0  32.000000           0            0

[891 rows x 4 columns]


In [12]:
# Select the model inputs (features) and the target label (survival)
features = passengers[["Sex","Age","FirstClass","SecondClass"]]
survival = passengers["Survived"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(features, survival, test_size=0.2, random_state=1234)

# Show all four pieces of the split. The last print previously repeated
# X_test, so the test labels were never displayed.
print(X_train)
print(X_test)
print(y_train)
print(y_test)

     Sex        Age  FirstClass  SecondClass
125    0  12.000000           0            0
305    0   0.920000           1            0
631    0  51.000000           0            0
643    0  29.699118           0            0
808    0  39.000000           0            1
..   ...        ...         ...          ...
204    0  18.000000           0            0
53     1  29.000000           0            1
294    0  24.000000           0            0
723    0  50.000000           0            1
815    0  29.699118           1            0

[712 rows x 4 columns]
     Sex        Age  FirstClass  SecondClass
523    1  44.000000           1            0
778    0  29.699118           0            0
760    0  29.699118           0            0
496    1  54.000000           1            0
583    0  36.000000           1            0
..   ...        ...         ...          ...
100    1  28.000000           0            0
773    0  29.699118           0            0
222    0  51.000000           0

In [13]:
# Standardize the features. The scaler learns its statistics from the
# training rows only, and those same statistics are then applied to both
# the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

In [15]:
# Build the logistic regression classifier and fit it on the scaled training data
model = LogisticRegression().fit(X_train_normalized, y_train)

# Mean accuracy on the data the model was trained on
score = model.score(X_train_normalized, y_train)

# Mean accuracy on the held-out test data
score2 = model.score(X_test_normalized, y_test)

# Report train accuracy first, then test accuracy
print(score)
print(score2)

0.7893258426966292
0.8268156424581006


In [16]:
# Inspect the weights the fitted model assigned to the scaled input features
coefficients = model.coef_
print(coefficients)

[[ 1.18536811 -0.42041547  0.97001083  0.47131741]]


In [17]:
# Sample passenger features for testing, listed in the same column order as
# the training features: Sex, Age, FirstClass, SecondClass
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
You = np.array([1.0,30.0,1.0,0.0])

# Combine passenger arrays (one row per passenger)
sample_passengers = np.array([Jack, Rose, You])

# Scale the sample passenger features with the scaler fitted on the training
# data. The scaler was fitted on a DataFrame, so the samples are wrapped in a
# frame carrying the training column names; passing a bare array triggers a
# "does not have valid feature names" warning on recent scikit-learn and
# gives no check that the columns line up.
sample_passengers_df = pd.DataFrame(sample_passengers, columns=X_train.columns)
sample_passengers_norm = scaler.transform(sample_passengers_df)
print(sample_passengers_norm)

# Make survival predictions: the predicted class for each passenger, and the
# probability the model assigns to each class
result = model.predict(sample_passengers_norm)
probabilities = model.predict_proba(sample_passengers_norm)
print(result)
print(probabilities)

[[-0.74926865 -0.71924494 -0.57951264 -0.49472744]
 [ 1.33463478 -0.94457879  1.72558791 -0.49472744]
 [ 1.33463478  0.0318679   1.72558791 -0.49472744]]
[0 1 1]
[[0.88457319 0.11542681]
 [0.05927683 0.94072317]
 [0.08675492 0.91324508]]
