### Import the relevant libraries

In [252]:
import numpy as np
import pandas as pd

### Load the dataset

In [253]:
titanic_preprocessed = pd.read_csv('titanic_preprocessed.csv')
df_preprocessed = titanic_preprocessed.copy()
df_preprocessed

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Survived
0,1,0,22.0,1,0,7.25,0,1,0
1,0,1,38.0,1,0,71.2833,0,0,1
2,1,1,26.0,0,0,7.925,0,1,1
3,0,1,35.0,1,0,53.1,0,1,1
4,1,0,35.0,0,0,8.05,0,1,0
5,0,0,54.0,0,0,51.8625,0,1,0
6,1,0,2.0,3,1,21.075,0,1,0
7,1,1,27.0,0,2,11.1333,0,1,1
8,1,1,14.0,1,0,30.0708,0,0,1
9,1,1,4.0,1,1,16.7,0,1,1


### Assess our Priors

In [254]:
# We take a look at our priors(targets) to ensure they are balanced. This to avoid our model being biased/trained on
# unbalanced priors

priors = np.sum(df_preprocessed['Survived'])/df_preprocessed.shape[0]
priors

0.4044943820224719

### Balance the Dataset

In [255]:
# We see that our priors are not well balanced as we have more people who did not survive compare to those who did
# To balance the dataset, we shuffle first then undersample

Shuffling the data

In [256]:
# Sampling randomizes the dataset, so we sample to randomise this data
shuffled_df = df_preprocessed.sample(frac=1).reset_index(drop=True)

In [257]:
shuffled_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Survived
0,1,0,42.0,0,0,7.55,0,1,0
1,1,1,33.0,0,2,26.0,0,1,1
2,1,1,28.0,1,0,26.0,0,1,1
3,1,0,44.0,0,0,8.05,0,1,0
4,1,0,19.0,0,0,7.8958,0,1,0
5,1,0,20.0,0,0,9.5,0,1,0
6,1,0,39.0,0,0,26.0,0,1,0
7,1,1,4.0,0,1,13.4167,0,0,1
8,1,1,2.0,0,1,10.4625,0,1,0
9,1,0,10.0,3,2,27.9,0,1,0


In [258]:
survival_count = np.sum(shuffled_df['Survived'])
survival_count

288

In [259]:
targets = shuffled_df.iloc[:,-1]

In [260]:
targets

0      0
1      1
2      1
3      0
4      0
5      0
6      0
7      1
8      0
9      0
10     0
11     1
12     0
13     0
14     1
15     1
16     0
17     1
18     0
19     0
20     1
21     0
22     1
23     0
24     1
25     0
26     0
27     1
28     1
29     0
30     0
31     1
32     0
33     0
34     1
35     0
36     1
37     1
38     1
39     0
40     0
41     1
42     0
43     0
44     0
45     0
46     1
47     0
48     1
49     1
50     0
51     1
52     0
53     0
54     0
55     0
56     0
57     1
58     1
59     0
60     0
61     1
62     0
63     1
64     0
65     0
66     1
67     1
68     0
69     1
70     1
71     1
72     1
73     0
74     1
75     0
76     1
77     0
78     1
79     0
80     1
81     0
82     0
83     1
84     0
85     0
86     1
87     0
88     0
89     0
90     1
91     0
92     1
93     1
94     1
95     0
96     1
97     0
98     0
99     0
100    0
101    0
102    0
103    0
104    0
105    0
106    0
107    1
108    1
109    0
110    0
1

In [261]:
shuffled_df.shape

(712, 9)

In [262]:
# Now we balance the dataset

indices_to_remove = []
zero_count = shuffled_df.shape[0] - survival_count

for i in range(shuffled_df.shape[0]):
    if targets[i] == 0:
        if zero_count > survival_count:
            indices_to_remove.append(i)
            zero_count -= 1
        else:
            break

indices_to_remove

[0,
 3,
 4,
 5,
 6,
 8,
 9,
 10,
 12,
 13,
 16,
 18,
 19,
 21,
 23,
 25,
 26,
 29,
 30,
 32,
 33,
 35,
 39,
 40,
 42,
 43,
 44,
 45,
 47,
 50,
 52,
 53,
 54,
 55,
 56,
 59,
 60,
 62,
 64,
 65,
 68,
 73,
 75,
 77,
 79,
 81,
 82,
 84,
 85,
 87,
 88,
 89,
 91,
 95,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 109,
 110,
 111,
 114,
 115,
 116,
 118,
 119,
 120,
 124,
 127,
 130,
 131,
 133,
 136,
 137,
 138,
 139,
 140,
 142,
 144,
 148,
 149,
 151,
 153,
 157,
 158,
 159,
 163,
 164,
 165,
 167,
 168,
 169,
 170,
 174,
 175,
 179,
 181,
 183,
 185,
 187,
 188,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 200,
 203,
 204,
 205,
 209,
 210,
 211,
 212,
 214,
 217,
 218,
 221,
 222,
 223,
 224,
 226,
 227,
 228,
 229,
 231,
 232,
 235]

In [263]:
# To undersample, we drop the rows with the indices above 
shuffled_df_eq_priors = shuffled_df.drop(index=indices_to_remove, axis=0)

In [264]:
shuffled_df_eq_priors

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Survived
1,1,1,33.0,0,2,26.0,0,1,1
2,1,1,28.0,1,0,26.0,0,1,1
7,1,1,4.0,0,1,13.4167,0,0,1
11,1,1,24.0,1,0,26.0,0,1,1
14,0,1,48.0,1,0,39.6,0,0,1
15,0,1,31.0,1,0,113.275,0,0,1
17,1,1,48.0,1,2,65.0,0,1,1
20,1,0,1.0,1,2,20.575,0,1,1
22,1,1,4.0,1,1,23.0,0,1,1
24,0,1,19.0,0,0,30.0,0,1,1


In [265]:
priors_dist = np.sum(shuffled_df_eq_priors['Survived'])/shuffled_df_eq_priors.shape[0]
priors_dist

0.5

In [266]:
# Now we have an equally distributed set of priors. We reshuffle the dataset again before we proceed to scaling

balanced_df = shuffled_df_eq_priors.sample(frac=1).reset_index(drop=True)

In [267]:
balanced_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Survived
0,1,0,19.0,0,0,7.65,0,1,0
1,0,1,54.0,1,0,78.2667,0,0,1
2,1,0,3.0,1,1,15.9,0,1,1
3,0,1,30.0,0,0,31.0,0,0,1
4,1,0,26.0,0,0,56.4958,0,1,1
5,1,1,20.0,0,0,8.6625,0,1,0
6,1,1,4.0,2,1,39.0,0,1,1
7,1,0,31.0,1,1,26.25,0,1,0
8,0,0,71.0,0,0,34.6542,0,0,0
9,1,1,35.0,1,1,20.25,0,1,1


In [268]:
pd.unique(balanced_df['Parch'])

array([0, 1, 2, 6, 5, 3, 4])

### Define the Inputs and Targets

In [269]:
unscaled_inputs = balanced_df.iloc[:,:-1]
targets = balanced_df.iloc[:,-1]

### Scaling the Inputs

In [270]:
from sklearn.preprocessing import StandardScaler

In [271]:
scaler = StandardScaler() # We create a scaler object here in order to scale some parts of the inputs as we do not
# want to scale the dummies, rather the Age, Fare, Parents and Children, Siblings and spouses

scaler.fit(unscaled_inputs[['Age','SibSp','Parch','Fare']])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [272]:
unscaled_inputs[['Age','SibSp','Parch','Fare']] = scaler.transform(unscaled_inputs[['Age','SibSp','Parch','Fare']])
unscaled_inputs

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,1,0,-0.701231,-0.571667,-0.527985,-0.527346,0,1
1,0,1,1.74409,0.493964,-0.527985,0.704966,0,0
2,1,0,-1.819092,0.493964,0.606788,-0.383377,0,1
3,0,1,0.067298,-0.571667,-0.527985,-0.119871,0,0
4,1,0,-0.212167,-0.571667,-0.527985,0.325048,0,1
5,1,1,-0.631365,-0.571667,-0.527985,-0.509677,0,1
6,1,1,-1.749225,1.559596,0.606788,0.019734,0,1
7,1,0,0.137165,0.493964,0.606788,-0.202762,0,1
8,0,0,2.931817,-0.571667,-0.527985,-0.056103,0,0
9,1,1,0.41663,0.493964,0.606788,-0.307467,0,1


In [273]:
# Since sklearn uses arrays which we would be using for our Logistic Regression model, we convert the inputs into an
# array
scaled_inputs = np.array(unscaled_inputs)

In [274]:
x = scaled_inputs.copy()
targets = np.array(targets)
y = targets.copy()

### Model Training

In [275]:
from sklearn.linear_model import LogisticRegression

In [276]:
log_model = LogisticRegression()
log_model.fit(x,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [277]:
log_model.score(x,y)

0.7708333333333334

In [278]:
# The score above shows us that the Model will accurately predict the survival rate of people on the Titanic 79 
# percent of the time. We fetch the coefficients and intercept of this model
log_model.coef_

array([[-1.61986849,  2.49374183, -0.49633595, -0.35850986, -0.10605152,
         0.18693006, -0.94595301, -0.268034  ]])

In [279]:
log_model.intercept_

array([0.43402252])

### Create a Summary table

The summary table will show us the Features of the dataset and their respective coefficients/weights

In [280]:
features = unscaled_inputs.columns.values
summary_table = pd.DataFrame(columns=['Features'], data=features)
summary_table['coefficients'] = np.transpose(log_model.coef_) # nd arrays are rows by default so we transpose the data

summary_table

Unnamed: 0,Features,coefficients
0,Pclass,-1.619868
1,Sex,2.493742
2,Age,-0.496336
3,SibSp,-0.35851
4,Parch,-0.106052
5,Fare,0.18693
6,Embarked_Q,-0.945953
7,Embarked_S,-0.268034


In [281]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', log_model.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Features,coefficients
0,Intercept,0.434023
1,Pclass,-1.619868
2,Sex,2.493742
3,Age,-0.496336
4,SibSp,-0.35851
5,Parch,-0.106052
6,Fare,0.18693
7,Embarked_Q,-0.945953
8,Embarked_S,-0.268034


In [282]:
# Since logistic regression deals with the log(odds), we can calculate the odds ratio of the coefficients to 
# determine which coefficients have the highest weights

summary_table['odds_ratio'] = np.exp(summary_table.coefficients)
summary_table

Unnamed: 0,Features,coefficients,odds_ratio
0,Intercept,0.434023,1.543454
1,Pclass,-1.619868,0.197925
2,Sex,2.493742,12.106492
3,Age,-0.496336,0.608757
4,SibSp,-0.35851,0.698717
5,Parch,-0.106052,0.899378
6,Fare,0.18693,1.205543
7,Embarked_Q,-0.945953,0.388309
8,Embarked_S,-0.268034,0.764882


In [283]:
summary_table = summary_table.sort_values(['odds_ratio'], ascending=False)
summary_table

Unnamed: 0,Features,coefficients,odds_ratio
2,Sex,2.493742,12.106492
0,Intercept,0.434023,1.543454
6,Fare,0.18693,1.205543
5,Parch,-0.106052,0.899378
8,Embarked_S,-0.268034,0.764882
4,SibSp,-0.35851,0.698717
3,Age,-0.496336,0.608757
7,Embarked_Q,-0.945953,0.388309
1,Pclass,-1.619868,0.197925


We can tell from the summary table above that the Sex of the passenger has the highest impact on survival

### Testing the Model

In [284]:
test_data = pd.read_csv('titanic_preprocessed_test.csv')
raw_test_data = test_data.copy()
raw_test_data

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,892,1,0,34.5,0,0,7.8292,1,0
1,893,1,1,47.0,1,0,7.0,0,1
2,894,1,0,62.0,0,0,9.6875,1,0
3,895,1,0,27.0,0,0,8.6625,0,1
4,896,1,1,22.0,1,1,12.2875,0,1
5,897,1,0,14.0,0,0,9.225,0,1
6,898,1,1,30.0,0,0,7.6292,1,0
7,899,1,0,26.0,1,1,29.0,0,1
8,900,1,1,18.0,0,0,7.2292,0,0
9,901,1,0,21.0,2,0,24.15,0,1


In [285]:
# We need to prepare the test data for testing as the PassengerId column exists here and the data is unscaled. Ideally, 
# we should create a class that scales the data for testing and export it as well as the model. 

data = raw_test_data.drop(['PassengerId'], axis=1)
data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,1,0,34.5,0,0,7.8292,1,0
1,1,1,47.0,1,0,7.0,0,1
2,1,0,62.0,0,0,9.6875,1,0
3,1,0,27.0,0,0,8.6625,0,1
4,1,1,22.0,1,1,12.2875,0,1
5,1,0,14.0,0,0,9.225,0,1
6,1,1,30.0,0,0,7.6292,1,0
7,1,0,26.0,1,1,29.0,0,1
8,1,1,18.0,0,0,7.2292,0,0
9,1,0,21.0,2,0,24.15,0,1


In [286]:
data[['Age','SibSp','Parch','Fare']] = scaler.transform(data[['Age','SibSp','Parch','Fare']])
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
data

# The test data contains some empty values in the Age and Fare column. While these could be dropped, we will 
# assign them the median age and fare instead

import statistics
import math
age_list = [x for x in data['Age'] if math.isnan(x) is False]
median_age = statistics.median(age_list)
median_age

fare_list = [x for x in data['Fare'] if math.isnan(x) is False]
median_fare = statistics.median(fare_list)

In [287]:
data['Age'].fillna(median_age, inplace=True)
data['Fare'].fillna(median_fare, inplace=True)

In [288]:
data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,1,0,0.381697,-0.571667,-0.527985,-0.524218,1,0
1,1,1,1.255026,0.493964,-0.527985,-0.538689,0,1
2,1,0,2.30302,-0.571667,-0.527985,-0.49179,1,0
3,1,0,-0.1423,-0.571667,-0.527985,-0.509677,0,1
4,1,1,-0.491632,0.493964,0.606788,-0.446418,0,1
5,1,0,-1.050562,-0.571667,-0.527985,-0.499861,0,1
6,1,1,0.067298,-0.571667,-0.527985,-0.527709,1,0
7,1,0,-0.212167,0.493964,0.606788,-0.154773,0,1
8,1,1,-0.771097,-0.571667,-0.527985,-0.534689,0,0
9,1,0,-0.561498,1.559596,-0.527985,-0.239409,0,1


In [289]:
scaled_data = np.array(data)
scaled_data

array([[ 1.        ,  0.        ,  0.38169685, ..., -0.52421838,
         1.        ,  0.        ],
       [ 1.        ,  1.        ,  1.25502567, ..., -0.53868851,
         0.        ,  1.        ],
       [ 1.        ,  0.        ,  2.30302025, ..., -0.49178972,
         1.        ,  0.        ],
       ...,
       [ 1.        ,  0.        ,  0.66116207, ..., -0.53432583,
         0.        ,  1.        ],
       [ 1.        ,  0.        , -0.14230044, ..., -0.52036526,
         0.        ,  1.        ],
       [ 1.        ,  0.        , -0.14230044, ..., -0.27067523,
         0.        ,  0.        ]])

In [290]:
# Having scaled the data, now we test it by running our prediction on the test data

prediction = log_model.predict(scaled_data)
prediction

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,

### Create a Table of predictions containing Passenger ID and Survival

In [291]:
passengers = raw_test_data['PassengerId']
summary = pd.DataFrame(columns=['PassengerId'], data=passengers)
summary

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
5,897
6,898
7,899
8,900
9,901


In [292]:
summary['Survived'] = prediction
summary

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [293]:
summary.to_csv('titanic_prediction.csv', index=False)