In [1]:
# import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
# 1) https://www.analyticsvidhya.com/blog/2021/07/titanic-survival-prediction-using-machine-learning/
# 2) https://medium.com/analytics-vidhya/your-guide-for-logistic-regression-with-titanic-dataset-784943523994

In [2]:
# Load the Titanic CSVs into DataFrames: the train file and the test file.
# Paths are relative to the notebook's working directory.
train_df, test_df = (
    pd.read_csv("titanic/train.csv"),
    pd.read_csv("titanic/test.csv"),
)

In [3]:
# Count missing values per column in the raw training data
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Missing values for Age, Cabin & Embarked - need to handle each one.
# NOTE(review): the Age mean / Embarked mode are computed on the full training
# frame before the train/test split further down, so the held-out rows
# contribute to the imputed values.

# Cabin is missing for most rows (687 in the null count above), so drop it
# instead of imputing. errors='ignore' keeps the cell re-runnable once Cabin
# has already been removed.
train_df = train_df.drop(columns='Cabin', errors='ignore')

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not update the parent frame under Copy-on-Write.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Cannot process strings, turn into integer category codes.
# NOTE(review): Embarked is coded 0/1/2, which a linear model treats as ordered.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

# Values that are already encoded (cell re-run) pass through unchanged;
# astype(int) then fails loudly if an unmapped label is left behind.
train_df['Sex'] = train_df['Sex'].map(lambda label: sex_codes.get(label, label)).astype(int)
train_df['Embarked'] = train_df['Embarked'].map(lambda label: embarked_codes.get(label, label)).astype(int)

In [5]:
# Re-count missing values per column; every count should now be zero
train_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [6]:
# Separate the target (Survived) from the features, leaving out the
# PassengerId, Name and Ticket columns along with the target itself.
non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Survived']
X = train_df.drop(columns=non_feature_columns)
Y = train_df['Survived']

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [7]:
# Fit a logistic regression classifier (default settings) on the training split
model = LogisticRegression().fit(X_train, Y_train)

# Predict on both the training rows and the held-out rows
X_train_prediction = model.predict(X_train)
X_test_prediction = model.predict(X_test)

# Compare predictions with the true labels for each split
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)

print('Accuracy score of training data : ', training_data_accuracy)
print('Accuracy score of test data : ', test_data_accuracy)

Accuracy score of training data :  0.8146067415730337
Accuracy score of test data :  0.7821229050279329


In [11]:
# Rows are the true labels, columns the predicted labels, so the off-diagonal
# entries are the errors: 9 + 30 = 39 wrongly predicted in test data
confusion_matrix(y_true=Y_test, y_pred=X_test_prediction)

array([[91,  9],
       [30, 49]])