### Import the relevant libraries

In [84]:
import numpy as np
import pandas as pd

### Load the dataset

In [85]:
# Read the preprocessed Titanic data and continue on a copy, so the frame as loaded
# from disk stays available unchanged.
titanic_preprocessed = pd.read_csv(filepath_or_buffer='titanic_preprocessed.csv')
df_preprocessed = titanic_preprocessed.copy(deep=True)
df_preprocessed

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Survived
0,1,0,22.0,1,0,7.2500,0,1,0
1,0,1,38.0,1,0,71.2833,0,0,1
2,1,1,26.0,0,0,7.9250,0,1,1
3,0,1,35.0,1,0,53.1000,0,1,1
4,1,0,35.0,0,0,8.0500,0,1,0
...,...,...,...,...,...,...,...,...,...
707,1,1,39.0,0,5,29.1250,1,0,0
708,1,0,27.0,0,0,13.0000,0,1,0
709,0,1,19.0,0,0,30.0000,0,1,1
710,0,0,26.0,0,0,30.0000,0,0,1


### Assess our Priors

In [86]:
# Check how balanced the target is: the mean of the 0/1 'Survived' column is the share
# of survivors. A value far from 0.5 means the classes are unbalanced, which would bias
# a model trained on this data towards the majority class.

priors = df_preprocessed['Survived'].mean()
priors

0.4044943820224719

### Balance the Dataset

In [87]:
# The priors are not balanced: there are more passengers who did not survive than passengers who did.
# To balance the dataset, we first shuffle it and then undersample the majority class.

Shuffling the data

In [88]:
# Shuffle the rows by sampling 100% of the frame without replacement, then rebuild a
# clean 0..n-1 index. A fixed random_state makes the shuffle (and everything derived
# from it: which rows are undersampled, the fitted coefficients, the predictions)
# reproducible across kernel restarts.
shuffled_df = df_preprocessed.sample(frac=1, random_state=42).reset_index(drop=True)

In [89]:
# Display the shuffled frame to check the new row order and the rebuilt index.
shuffled_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Survived
0,0,0,34.0,0,0,26.5500,0,1,1
1,1,1,27.0,1,0,21.0000,0,1,0
2,0,0,54.0,0,0,51.8625,0,1,0
3,0,0,64.0,1,4,263.0000,0,1,0
4,1,0,51.0,0,0,8.0500,0,1,0
...,...,...,...,...,...,...,...,...,...
707,0,1,40.0,1,1,134.5000,0,0,1
708,1,0,33.0,0,0,7.8958,0,0,0
709,1,0,30.0,0,0,7.8958,0,1,0
710,1,0,25.0,0,0,13.0000,0,1,0


In [90]:
# Number of survivors: the sum of the 0/1 'Survived' column.
survival_count = shuffled_df['Survived'].sum()
survival_count

288

In [91]:
# The target is the last column of the shuffled frame.
targets = shuffled_df[shuffled_df.columns[-1]]

In [92]:
# Display the target Series taken from the shuffled frame.
targets

0      1
1      0
2      0
3      0
4      0
      ..
707    1
708    0
709    0
710    0
711    1
Name: Survived, Length: 712, dtype: int64

In [93]:
# (rows, columns) of the shuffled frame; the row count is used to derive the number of non-survivors.
shuffled_df.shape

(712, 9)

In [94]:
# Balance the dataset by undersampling the majority class: collect the index labels of
# the first surplus non-survivor rows (in shuffled order), so that removing them leaves
# as many non-survivors as survivors.

zero_count = shuffled_df.shape[0] - survival_count
surplus = max(zero_count - survival_count, 0)  # nothing to remove if zeros are not the majority

non_survivor_labels = targets.index[targets == 0]
indices_to_remove = non_survivor_labels[:surplus].tolist()
zero_count -= len(indices_to_remove)  # non-survivors remaining after the removal

indices_to_remove

[1,
 2,
 3,
 4,
 5,
 6,
 9,
 12,
 13,
 15,
 16,
 17,
 18,
 19,
 23,
 24,
 25,
 29,
 30,
 31,
 32,
 33,
 36,
 37,
 38,
 39,
 40,
 42,
 43,
 44,
 45,
 47,
 49,
 51,
 52,
 53,
 60,
 62,
 64,
 66,
 67,
 69,
 72,
 73,
 74,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 85,
 86,
 87,
 89,
 91,
 92,
 93,
 94,
 95,
 96,
 100,
 102,
 103,
 104,
 106,
 107,
 108,
 112,
 115,
 118,
 119,
 120,
 121,
 122,
 124,
 125,
 126,
 131,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 149,
 150,
 154,
 157,
 158,
 160,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 172,
 173,
 175,
 176,
 179,
 182,
 183,
 185,
 187,
 189,
 190,
 191,
 192,
 195,
 196,
 197,
 198,
 199,
 201,
 202,
 203,
 207,
 208,
 209,
 212]

In [95]:
# Undersample by dropping the rows whose index labels were collected above.
shuffled_df_eq_priors = shuffled_df.drop(index=indices_to_remove)

In [96]:
# Display the undersampled frame; the index keeps the original labels, so gaps mark the dropped rows.
shuffled_df_eq_priors

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Survived
0,0,0,34.0,0,0,26.5500,0,1,1
7,0,1,36.0,1,2,120.0000,0,1,1
8,1,1,28.0,0,0,13.0000,0,1,1
10,1,0,3.0,4,2,31.3875,0,1,1
11,1,1,40.0,0,0,15.7500,0,1,1
...,...,...,...,...,...,...,...,...,...
707,0,1,40.0,1,1,134.5000,0,0,1
708,1,0,33.0,0,0,7.8958,0,0,0
709,1,0,30.0,0,0,7.8958,0,1,0
710,1,0,25.0,0,0,13.0000,0,1,0


In [97]:
# Share of survivors after undersampling.
priors_dist = shuffled_df_eq_priors['Survived'].mean()
priors_dist

0.5

In [98]:
# The priors are now equally distributed. Shuffle the balanced rows once more before
# scaling and rebuild a clean 0..n-1 index. A fixed random_state keeps this shuffle
# reproducible across kernel restarts.

balanced_df = shuffled_df_eq_priors.sample(frac=1, random_state=42).reset_index(drop=True)

In [99]:
# Display the reshuffled, balanced frame.
balanced_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Survived
0,0,0,50.0,2,0,133.6500,0,1,1
1,0,1,42.0,0,0,227.5250,0,0,1
2,0,0,38.0,1,0,90.0000,0,1,1
3,1,0,8.0,4,1,29.1250,1,0,0
4,1,1,27.0,0,0,10.5000,0,1,1
...,...,...,...,...,...,...,...,...,...
571,1,1,41.0,0,1,19.5000,0,1,1
572,1,1,25.0,1,1,30.0000,0,1,1
573,1,1,5.0,0,0,12.4750,0,1,1
574,0,1,38.0,1,0,71.2833,0,0,1


In [100]:
# Distinct values of 'Parch' in the balanced data, in order of first appearance.
balanced_df['Parch'].unique()

array([0, 1, 2, 5, 3, 4])

### Define the Inputs and Targets

In [101]:
# Inputs are every column except the last; the last column is the target.
# Take an explicit copy of the inputs: their numeric columns are overwritten with scaled
# values further down, and assigning into a plain slice of balanced_df triggers pandas'
# SettingWithCopyWarning (the write may or may not reach the original frame).
unscaled_inputs = balanced_df.iloc[:,:-1].copy()
targets = balanced_df.iloc[:,-1]

### Scaling the Inputs

In [102]:
from sklearn.preprocessing import StandardScaler

In [103]:
# Create the scaler and fit it on the numeric columns only (Age, SibSp, Parch, Fare);
# the dummy columns are deliberately left unscaled.
scaler = StandardScaler()

scaler.fit(unscaled_inputs.loc[:, ['Age','SibSp','Parch','Fare']])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [104]:
# Replace the four numeric columns with their standardized values, using the scaler fitted above.
# NOTE: this overwrites columns of `unscaled_inputs` in place, so from here on the variable holds
# scaled values despite its name, and re-running this cell would transform already-scaled values.
unscaled_inputs[['Age','SibSp','Parch','Fare']] = scaler.transform(unscaled_inputs[['Age','SibSp','Parch','Fare']])
unscaled_inputs

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,0,1.453858,1.779487,-0.527812,1.717530,0,1
1,0,1,0.889858,-0.552673,-0.527812,3.377214,0,0
2,0,0,0.607858,0.613407,-0.527812,0.945810,0,1
3,1,0,-1.507141,4.111648,0.688266,-0.130443,1,0
4,1,1,-0.167642,-0.552673,-0.527812,-0.459728,0,1
...,...,...,...,...,...,...,...,...
571,1,1,0.819358,-0.552673,0.688266,-0.300610,0,1
572,1,1,-0.308642,0.613407,0.688266,-0.114973,0,1
573,1,1,-1.718641,-0.552673,-0.527812,-0.424811,0,1
574,0,1,0.607858,0.613407,-0.527812,0.614904,0,0


In [105]:
# The logistic regression below is trained on NumPy arrays, so convert the (now scaled)
# input frame into a new array.
scaled_inputs = unscaled_inputs.to_numpy(copy=True)

In [106]:
# Training arrays: x holds the scaled features, y the survival labels. Both are
# independent copies, so the arrays they were built from stay untouched.
x = np.copy(scaled_inputs, order='C')
targets = np.array(targets)
y = np.copy(targets, order='C')

### Model Training

In [107]:
from sklearn.linear_model import LogisticRegression

In [108]:
# Fit a logistic-regression classifier with scikit-learn's default settings on the
# scaled features (x) and the survival labels (y).
log_model = LogisticRegression()
log_model.fit(X=x, y=y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [109]:
# Mean accuracy on x and y, the same data the model was fitted on (i.e. training accuracy).
log_model.score(x,y)

0.7760416666666666

In [111]:
# The score above is the accuracy on the training data: the model classifies about 78 percent of the
# passengers it was trained on correctly. It is not a measure of performance on unseen data.
# We fetch the coefficients of this model here; the intercept follows in the next cell.
log_model.coef_

array([[-1.68028362,  2.43220029, -0.48840093, -0.22762381, -0.06926937,
         0.23682212, -0.42873663, -0.07016941]])

In [112]:
# Intercept (bias term) of the fitted model, returned as a one-element array.
log_model.intercept_

array([0.37823212])

### Create a Summary table

The summary table will show us the Features of the dataset and their respective coefficients/weights

In [113]:
# One row per input feature, paired with the coefficient the model learned for it.
features = unscaled_inputs.columns.values
summary_table = pd.DataFrame({'Features': features})
# coef_ has one row holding all coefficients; transpose it into a single column.
summary_table['coefficients'] = log_model.coef_.T

summary_table

Unnamed: 0,Features,coefficients
0,Pclass,-1.680284
1,Sex,2.4322
2,Age,-0.488401
3,SibSp,-0.227624
4,Parch,-0.069269
5,Fare,0.236822
6,Embarked_Q,-0.428737
7,Embarked_S,-0.070169


In [114]:
# Put the intercept in front of the feature coefficients as row 0; the features follow as rows 1..n.
intercept_row = pd.DataFrame({'Features': ['Intercept'], 'coefficients': [log_model.intercept_[0]]})
summary_table = pd.concat([intercept_row, summary_table], ignore_index=True)
summary_table

Unnamed: 0,Features,coefficients
0,Intercept,0.378232
1,Pclass,-1.680284
2,Sex,2.4322
3,Age,-0.488401
4,SibSp,-0.227624
5,Parch,-0.069269
6,Fare,0.236822
7,Embarked_Q,-0.428737
8,Embarked_S,-0.070169


In [115]:
# Logistic-regression coefficients are on the log-odds scale, so exponentiating them gives
# odds ratios. Values further from 1 (in either direction) indicate a stronger effect.

summary_table = summary_table.assign(odds_ratio=np.exp(summary_table['coefficients']))
summary_table

Unnamed: 0,Features,coefficients,odds_ratio
0,Intercept,0.378232,1.459702
1,Pclass,-1.680284,0.186321
2,Sex,2.4322,11.383902
3,Age,-0.488401,0.613607
4,SibSp,-0.227624,0.796424
5,Parch,-0.069269,0.933075
6,Fare,0.236822,1.267216
7,Embarked_Q,-0.428737,0.651331
8,Embarked_S,-0.070169,0.932236


In [116]:
# Order the rows from the largest odds ratio to the smallest.
summary_table = summary_table.sort_values(by='odds_ratio', ascending=False)
summary_table

Unnamed: 0,Features,coefficients,odds_ratio
2,Sex,2.4322,11.383902
0,Intercept,0.378232,1.459702
6,Fare,0.236822,1.267216
5,Parch,-0.069269,0.933075
8,Embarked_S,-0.070169,0.932236
4,SibSp,-0.227624,0.796424
7,Embarked_Q,-0.428737,0.651331
3,Age,-0.488401,0.613607
1,Pclass,-1.680284,0.186321


We can tell from the summary table above that the Sex of the passenger has the highest impact on survival

### Testing the Model

In [118]:
# Read the preprocessed test set and continue on a copy of it.
test_data = pd.read_csv(filepath_or_buffer='titanic_preprocessed_test.csv')
raw_test_data = test_data.copy(deep=True)
raw_test_data

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,892,1,0,34.5,0,0,7.8292,1,0
1,893,1,1,47.0,1,0,7.0000,0,1
2,894,1,0,62.0,0,0,9.6875,1,0
3,895,1,0,27.0,0,0,8.6625,0,1
4,896,1,1,22.0,1,1,12.2875,0,1
...,...,...,...,...,...,...,...,...,...
326,1301,1,1,3.0,1,1,13.7750,0,1
327,1303,0,1,37.0,1,0,90.0000,1,0
328,1304,1,1,28.0,0,0,7.7750,0,1
329,1306,0,1,39.0,0,0,108.9000,0,0


In [119]:
# The test set still carries the PassengerId column and is unscaled, so it has to be prepared
# before it can be fed to the model. Ideally this preparation would live in a reusable class
# that is exported together with the model.

data = raw_test_data.drop(columns=['PassengerId'])
data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,1,0,34.5,0,0,7.8292,1,0
1,1,1,47.0,1,0,7.0000,0,1
2,1,0,62.0,0,0,9.6875,1,0
3,1,0,27.0,0,0,8.6625,0,1
4,1,1,22.0,1,1,12.2875,0,1
...,...,...,...,...,...,...,...,...
326,1,1,3.0,1,1,13.7750,0,1
327,0,1,37.0,1,0,90.0000,1,0
328,1,1,28.0,0,0,7.7750,0,1
329,0,1,39.0,0,0,108.9000,0,0


In [120]:
# Standardize the numeric test columns with the scaler fitted on the training inputs (transform
# only, no re-fit), so that the test features are on the same scale the model was trained on.
# NOTE: this overwrites the columns of `data` in place; re-running this cell would transform
# already-scaled values.
data[['Age','SibSp','Parch','Fare']] = scaler.transform(data[['Age','SibSp','Parch','Fare']])
data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,1,0,0.361108,-0.552673,-0.527812,-0.506947,1,0
1,1,1,1.242358,0.613407,-0.527812,-0.521607,0,1
2,1,0,2.299857,-0.552673,-0.527812,-0.474093,1,0
3,1,0,-0.167642,-0.552673,-0.527812,-0.492214,0,1
4,1,1,-0.520142,0.613407,0.688266,-0.428125,0,1
...,...,...,...,...,...,...,...,...
326,1,1,-1.859641,0.613407,0.688266,-0.401827,0,1
327,0,1,0.537358,0.613407,-0.527812,0.945810,1,0
328,1,1,-0.097142,-0.552673,-0.527812,-0.507905,0,1
329,0,1,0.678358,-0.552673,-0.527812,1.279957,0,0


In [121]:
# Convert the scaled test frame into a new NumPy array for prediction.
scaled_data = data.to_numpy(copy=True)
scaled_data

array([[ 1.        ,  0.        ,  0.361108  , ..., -0.50694699,
         1.        ,  0.        ],
       [ 1.        ,  1.        ,  1.24235775, ..., -0.52160702,
         0.        ,  1.        ],
       [ 1.        ,  0.        ,  2.29985746, ..., -0.47409276,
         1.        ,  0.        ],
       ...,
       [ 1.        ,  1.        , -0.09714187, ..., -0.50790523,
         0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.67835791, ...,  1.279957  ,
         0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.64310792, ..., -0.51718709,
         0.        ,  1.        ]])

In [122]:
# With the test features scaled, run the fitted model on them to obtain a predicted
# survival label (0 or 1) for every test passenger.

prediction = log_model.predict(X=scaled_data)
prediction

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,

### Create a Table of predictions containing Passenger ID and Survival

In [123]:
# Start the results table from the passenger ids of the test set.
passengers = raw_test_data['PassengerId']
summary = pd.DataFrame({'PassengerId': passengers})
summary

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
...,...
326,1301
327,1303
328,1304
329,1306


In [124]:
# Attach the predicted labels, row by row, next to the passenger ids.
summary = summary.assign(Survived=prediction)
summary

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
326,1301,1
327,1303,1
328,1304,1
329,1306,1


In [125]:
# Write the predictions to disk without the DataFrame index column.
summary.to_csv(path_or_buf='titanic_prediction.csv', index=False)