In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
survival_test = pd.read_csv('gender_submission.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Update sex column to numerical
train['Sex'].replace(to_replace=['male','female'], value=[0,1], inplace=True)
test['Sex'].replace(to_replace=['male','female'], value=[0,1], inplace=True)

In [7]:
# Fill the nan values in the age column
train['Age'].fillna(round(train['Age'].mean()), inplace=True)
test['Age'].fillna(round(test['Age'].mean()), inplace=True)

In [8]:
train['FirstClass'] = train['Pclass'].apply(lambda x: 1 if x == 1 else 0)
train['SecondClass'] = train['Pclass'].apply(lambda x: 1 if x == 2 else 0)
test['FirstClass'] = test['Pclass'].apply(lambda x: 1 if x == 1 else 0)
test['SecondClass'] = test['Pclass'].apply(lambda x: 1 if x == 2 else 0)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  FirstClass   891 non-null    int64  
 13  SecondClass  891 non-null    int64  
dtypes: float64(2), int64(8), object(4)
memory usage: 97.6+ KB


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    int64  
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  FirstClass   418 non-null    int64  
 12  SecondClass  418 non-null    int64  
dtypes: float64(2), int64(7), object(4)
memory usage: 42.6+ KB


In [11]:
features = train[['Sex', 'Age', 'FirstClass', 'SecondClass']]
survival = train['Survived']
features_test = test[['Sex', 'Age', 'FirstClass', 'SecondClass']]

In [12]:
print(features.head)
print(survival.head)

<bound method NDFrame.head of      Sex   Age  FirstClass  SecondClass
0      0  22.0           0            0
1      1  38.0           1            0
2      1  26.0           0            0
3      1  35.0           1            0
4      0  35.0           0            0
..   ...   ...         ...          ...
886    0  27.0           0            1
887    1  19.0           1            0
888    1  30.0           0            0
889    0  26.0           1            0
890    0  32.0           0            0

[891 rows x 4 columns]>
<bound method NDFrame.head of 0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64>


In [13]:
X_train, X_test, y_train, y_test = train_test_split(features, survival, test_size=0.2)

In [14]:
# Scale the feature data so it has mean = 0 and standard deviation = 1
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
X_train

array([[ 1.42019335e+00, -7.15524455e-05, -5.60047441e-01,
        -5.12261015e-01],
       [ 1.42019335e+00, -4.56298489e-01,  1.78556302e+00,
        -5.12261015e-01],
       [-7.04129476e-01, -6.08374134e-01, -5.60047441e-01,
        -5.12261015e-01],
       ...,
       [ 1.42019335e+00, -1.06460107e+00,  1.78556302e+00,
        -5.12261015e-01],
       [ 1.42019335e+00, -4.56298489e-01,  1.78556302e+00,
        -5.12261015e-01],
       [-7.04129476e-01,  1.59672272e+00, -5.60047441e-01,
         1.95212981e+00]])

In [16]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [17]:
model.score(X_train, y_train)

0.8047752808988764

In [18]:
model.score(X_test, y_test)

0.776536312849162

In [19]:
print(model.coef_)
print(list(zip(['Sex','Age','FirstClass','SecondClass'],model.coef_[0])))

[[ 1.23288339 -0.45174037  1.01489929  0.51635627]]
[('Sex', 1.2328833882790264), ('Age', -0.45174036718279403), ('FirstClass', 1.0148992863274864), ('SecondClass', 0.5163562744491694)]


In [20]:
features_test =scaler.transform(features_test)

In [21]:
survival_pre = model.predict(features_test)

In [22]:
model.predict_proba(features_test[:5])

array([[0.93132395, 0.06867605],
       [0.60289235, 0.39710765],
       [0.90714896, 0.09285104],
       [0.91290136, 0.08709864],
       [0.39145445, 0.60854555]])

In [23]:
survival_pre

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [27]:
survival_test = survival_test['Survived']

In [28]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
print("Accuracy: ", accuracy_score(survival_pre, survival_test))
print("Recall: ", recall_score(survival_pre, survival_test))
print("Precision: ", precision_score(survival_pre, survival_test))
print("F1: ", f1_score(survival_pre, survival_test))

Accuracy:  0.9569377990430622
Recall:  0.9407894736842105
Precision:  0.9407894736842105
F1:  0.9407894736842105
