In [1]:
import numpy as np
import pandas as pd

### (I) Read data and pick relevant columns ###

In [2]:
# load the Titanic training CSV into a dataframe and preview 7 randomly drawn rows
train_csv_path = 'data/kaggleTitanic/train.csv'
df = pd.read_csv(train_csv_path)
df.sample(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
830,831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15.0,1,0,2659,14.4542,,C
367,368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C
326,327,0,3,"Nysveen, Mr. Johan Hansen",male,61.0,0,0,345364,6.2375,,S
833,834,0,3,"Augustsson, Mr. Albert",male,23.0,0,0,347468,7.8542,,S
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
355,356,0,3,"Vanden Steen, Mr. Leo Peter",male,28.0,0,0,345783,9.5,,S
128,129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C


In [3]:
# derive a 'Deck' column from 'Cabin' by keeping only the first character of the
# cabin string (e.g. 'C85' -> 'C'); rows with a missing cabin get a missing deck
def deck_from_cabin(cabin):
    """Return the first character of `cabin`, or NaN when `cabin` is missing."""
    if pd.isna(cabin):
        return np.nan
    return cabin[0]


df['Deck'] = df['Cabin'].apply(deck_from_cabin)
df.sample(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
185,186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50.0,A32,S,A
104,105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37.0,2,0,3101276,7.925,,S,
695,696,0,2,"Chapman, Mr. Charles Henry",male,52.0,0,0,248731,13.5,,S,
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C,
307,308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9,C65,C,C
613,614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q,
349,350,0,3,"Dimic, Mr. Jovan",male,42.0,0,0,315088,8.6625,,S,


### (II) Split data into Train and Test ###

In [4]:
# separate the target ('Survived') from the remaining columns
y = df['Survived']
X = df.drop(columns=['Survived'])

from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)

# Take explicit copies of every piece so that later modifications work on independent
# objects rather than on views of the original data (avoids SettingWithCopyWarning).
Xtrain, Xtest, ytrain, ytest = (part.copy() for part in split_parts)

### (III) Fit/Transform on Training Data (Xtrain, ytrain): ###

In [5]:
# 1) numeric columns: replace missing values with the median learned from the training data
from sklearn.impute import SimpleImputer

numeric_features = ['Age']
sinum = SimpleImputer(missing_values=np.nan, strategy='median')

# fit_transform hands back a plain numpy array. A dataframe built from it would get a
# default 0..n-1 index, but Xtrain's index was shuffled by the train-test split, so the
# new frame is given Xtrain.index explicitly to keep the rows aligned in the concat below.
train_numeric_values = sinum.fit_transform(Xtrain[numeric_features])
train_numeric_names = ['imp' + name for name in numeric_features]
Xnum = pd.DataFrame(train_numeric_values,
                    columns=train_numeric_names,
                    index=Xtrain.index)
Xtrain = pd.concat([Xtrain, Xnum], axis=1)

Xtrain.sample(7)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,impAge
683,684,3,"Goodwin, Mr. Charles Edward",male,14.0,5,2,CA 2144,46.9,,S,,14.0
145,146,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,C.A. 33112,36.75,,S,,19.0
453,454,1,"Goldenberg, Mr. Samuel L",male,49.0,1,0,17453,89.1042,C92,C,C,49.0
115,116,3,"Pekoniemi, Mr. Edvard",male,21.0,0,0,STON/O 2. 3101294,7.925,,S,,21.0
237,238,2,"Collyer, Miss. Marjorie ""Lottie""",female,8.0,0,2,C.A. 31921,26.25,,S,,8.0
806,807,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,A,39.0
171,172,3,"Rice, Master. Arthur",male,4.0,4,1,382652,29.125,,Q,,4.0


In [6]:
# 2) categorical columns: replace missing values with the constant 'X'
from sklearn.impute import SimpleImputer

categorical_features = ['Pclass', 'Sex', 'Deck']
sicat = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')

# fit_transform hands back a plain numpy array. A dataframe built from it would get a
# default 0..n-1 index, but Xtrain's index was shuffled by the train-test split, so the
# new frame is given Xtrain.index explicitly to keep the rows aligned in the concat below.
train_categorical_values = sicat.fit_transform(Xtrain[categorical_features])
train_categorical_names = ['imp' + name for name in categorical_features]
Xcat = pd.DataFrame(train_categorical_values,
                    columns=train_categorical_names,
                    index=Xtrain.index)
Xtrain = pd.concat([Xtrain, Xcat], axis=1)

Xtrain.sample(7)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,impAge,impPclass,impSex,impDeck
482,483,3,"Rouse, Mr. Richard Henry",male,50.0,0,0,A/5 3594,8.05,,S,,50.0,3,male,X
151,152,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22.0,1,0,113776,66.6,C2,S,C,22.0,1,female,C
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C,38.0,1,female,C
398,399,2,"Pain, Dr. Alfred",male,23.0,0,0,244278,10.5,,S,,23.0,2,male,X
276,277,3,"Lindblom, Miss. Augusta Charlotta",female,45.0,0,0,347073,7.75,,S,,45.0,3,female,X
348,349,3,"Coutts, Master. William Loch ""William""",male,3.0,1,1,C.A. 37671,15.9,,S,,3.0,3,male,X
604,605,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35.0,0,0,111426,26.55,,C,,35.0,1,male,X


In [7]:
# 3) imputed categorical columns: one-hot-encode the values
from sklearn.preprocessing import OneHotEncoder

imputed_categorical_features = ['impPclass', 'impSex', 'impDeck']

# Ask for a dense integer array. scikit-learn 1.2 renamed the `sparse` argument to
# `sparse_output` and 1.4 removed the old spelling, so try the new name first and
# fall back to the old one on releases that do not know it (they raise TypeError).
try:
    ohe = OneHotEncoder(sparse_output=False, dtype=int, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')

# fit_transform returns a numpy array, which carries no row labels.
train_onehot_values = ohe.fit_transform(Xtrain[imputed_categorical_features])

# Build the 'x<feature position>_<category>' column labels from the fitted categories.
# This reproduces the labels of the old ohe.get_feature_names() (removed in
# scikit-learn 1.2) and gives the same names on every scikit-learn version.
train_onehot_names = ['x{}_{}'.format(position, category)
                      for position, categories in enumerate(ohe.categories_)
                      for category in categories]

# Xtrain's index was shuffled by the train-test split, so pass it explicitly;
# otherwise the new frame would get a 0..n-1 index and misalign in the concat.
Xcat = pd.DataFrame(train_onehot_values,
                    columns=train_onehot_names,
                    index=Xtrain.index)
Xtrain = pd.concat([Xtrain, Xcat], axis=1)

Xtrain.sample(7)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,x1_male,x2_A,x2_B,x2_C,x2_D,x2_E,x2_F,x2_G,x2_T,x2_X
354,355,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,...,1,0,0,0,0,0,0,0,0,1
759,760,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.5,B77,...,0,0,1,0,0,0,0,0,0,0
76,77,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,...,1,0,0,0,0,0,0,0,0,1
556,557,1,"Duff Gordon, Lady. (Lucille Christiana Sutherl...",female,48.0,1,0,11755,39.6,A16,...,0,1,0,0,0,0,0,0,0,0
516,517,2,"Lemore, Mrs. (Amelia Milley)",female,34.0,0,0,C.A. 34260,10.5,F33,...,0,0,0,0,0,0,1,0,0,0
801,802,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.25,,...,0,0,0,0,0,0,0,0,0,1
570,571,2,"Harris, Mr. George",male,62.0,0,0,S.W./PP 752,10.5,,...,1,0,0,0,0,0,0,0,0,1


In [8]:
# 4) keep only the imputed numeric and one-hot-encoded categorical features by
#    dropping the raw columns and the intermediate imputed categorical columns
train_columns_to_discard = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
    'Fare', 'Cabin', 'Embarked', 'Deck', 'impPclass', 'impSex', 'impDeck',
]
Xtrain = Xtrain.drop(columns=train_columns_to_discard)
Xtrain.sample(7)

Unnamed: 0,impAge,x0_1,x0_2,x0_3,x1_female,x1_male,x2_A,x2_B,x2_C,x2_D,x2_E,x2_F,x2_G,x2_T,x2_X
776,29.0,0,0,1,0,1,0,0,0,0,0,1,0,0,0
29,29.0,0,0,1,0,1,0,0,0,0,0,0,0,0,1
129,45.0,0,0,1,0,1,0,0,0,0,0,0,0,0,1
12,20.0,0,0,1,0,1,0,0,0,0,0,0,0,0,1
145,19.0,0,1,0,0,1,0,0,0,0,0,0,0,0,1
301,29.0,0,0,1,0,1,0,0,0,0,0,0,0,0,1
771,48.0,0,0,1,0,1,0,0,0,0,0,0,0,0,1


In [9]:
# 5) train a Logistic Regression model on the prepared training features
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and fitting can be chained
lr = LogisticRegression(solver='liblinear').fit(Xtrain, ytrain)
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

### (IV) Transform/Predict on Test Data (Xtest, ytest): ###

In [10]:
# 1) numeric columns: fill missing values using the imputer fitted on the training data
# note: this is test data, so no new imputer is created and nothing is re-fitted;
#  the already-fitted `sinum` is only asked to transform.
test_numeric_values = sinum.transform(Xtest[numeric_features])
test_numeric_names = ['imp' + name for name in numeric_features]

# the transform output is a numpy array, so Xtest's index is attached explicitly
Xnum = pd.DataFrame(test_numeric_values,
                    columns=test_numeric_names,
                    index=Xtest.index)
Xtest = pd.concat([Xtest, Xnum], axis=1)

Xtest.sample(7)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,impAge
90,91,3,"Christmann, Mr. Emil",male,29.0,0,0,343276,8.05,,S,,29.0
363,364,3,"Asim, Mr. Adola",male,35.0,0,0,SOTON/O.Q. 3101310,7.05,,S,,35.0
34,35,1,"Meyer, Mr. Edgar Joseph",male,28.0,1,0,PC 17604,82.1708,,C,,28.0
521,522,3,"Vovk, Mr. Janko",male,22.0,0,0,349252,7.8958,,S,,22.0
708,709,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.55,,S,,22.0
862,863,1,"Swift, Mrs. Frederick Joel (Margaret Welles Ba...",female,48.0,0,0,17466,25.9292,D17,S,D,48.0
625,626,1,"Sutton, Mr. Frederick",male,61.0,0,0,36963,32.3208,D50,S,D,61.0


In [11]:
# 2) categorical columns: fill missing values with the constant 'X'
# note: this is test data, so no new imputer is created and nothing is re-fitted;
#  the already-fitted `sicat` is only asked to transform.
test_categorical_values = sicat.transform(Xtest[categorical_features])
test_categorical_names = ['imp' + name for name in categorical_features]

# the transform output is a numpy array, so Xtest's index is attached explicitly
Xcat = pd.DataFrame(test_categorical_values,
                    columns=test_categorical_names,
                    index=Xtest.index)
Xtest = pd.concat([Xtest, Xcat], axis=1)

Xtest.sample(7)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,impAge,impPclass,impSex,impDeck
49,50,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S,,18.0,3,female,X
439,440,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S,,31.0,2,male,X
242,243,2,"Coleridge, Mr. Reginald Charles",male,29.0,0,0,W./C. 14263,10.5,,S,,29.0,2,male,X
859,860,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C,,29.0,3,male,X
525,526,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q,,40.5,3,male,X
355,356,3,"Vanden Steen, Mr. Leo Peter",male,28.0,0,0,345783,9.5,,S,,28.0,3,male,X
878,879,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S,,29.0,3,male,X


In [12]:
# 3) imputed categorical columns: one-hot-encode the values
# note: this is test data, so no new encoder is created and nothing is re-fitted;
#  the already-fitted `ohe` is only asked to transform.
test_onehot_values = ohe.transform(Xtest[imputed_categorical_features])

# Build the 'x<feature position>_<category>' column labels from the fitted categories.
# This reproduces the labels of the old ohe.get_feature_names() (removed in
# scikit-learn 1.2) and gives the same names on every scikit-learn version.
test_onehot_names = ['x{}_{}'.format(position, category)
                     for position, categories in enumerate(ohe.categories_)
                     for category in categories]

# the transform output is a numpy array, so Xtest's index is attached explicitly
Xcat = pd.DataFrame(test_onehot_values,
                    columns=test_onehot_names,
                    index=Xtest.index)
Xtest = pd.concat([Xtest, Xcat], axis=1)

Xtest.sample(7)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,x1_male,x2_A,x2_B,x2_C,x2_D,x2_E,x2_F,x2_G,x2_T,x2_X
142,143,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda ...",female,24.0,1,0,STON/O2. 3101279,15.85,,...,0,0,0,0,0,0,0,0,0,1
828,829,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,...,1,0,0,0,0,0,0,0,0,1
789,790,1,"Guggenheim, Mr. Benjamin",male,46.0,0,0,PC 17593,79.2,B82 B84,...,1,0,1,0,0,0,0,0,0,0
589,590,3,"Murdlin, Mr. Joseph",male,,0,0,A./5. 3235,8.05,,...,1,0,0,0,0,0,0,0,0,1
625,626,1,"Sutton, Mr. Frederick",male,61.0,0,0,36963,32.3208,D50,...,1,0,0,0,1,0,0,0,0,0
841,842,2,"Mudd, Mr. Thomas Charles",male,16.0,0,0,S.O./P.P. 3,10.5,,...,1,0,0,0,0,0,0,0,0,1
848,849,2,"Harper, Rev. John",male,28.0,0,1,248727,33.0,,...,1,0,0,0,0,0,0,0,0,1


In [13]:
# 4) keep only the imputed numeric and one-hot-encoded categorical features by
#    dropping the raw columns and the intermediate imputed categorical columns
test_columns_to_discard = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
    'Fare', 'Cabin', 'Embarked', 'Deck', 'impPclass', 'impSex', 'impDeck',
]
Xtest = Xtest.drop(columns=test_columns_to_discard)
Xtest.sample(7)

Unnamed: 0,impAge,x0_1,x0_2,x0_3,x1_female,x1_male,x2_A,x2_B,x2_C,x2_D,x2_E,x2_F,x2_G,x2_T,x2_X
871,47.0,1,0,0,1,0,0,0,0,1,0,0,0,0,0
724,27.0,1,0,0,0,1,0,0,0,0,1,0,0,0,0
329,16.0,1,0,0,1,0,0,1,0,0,0,0,0,0,0
216,27.0,0,0,1,1,0,0,0,0,0,0,0,0,0,1
49,18.0,0,0,1,1,0,0,0,0,0,0,0,0,0,1
742,21.0,1,0,0,1,0,0,1,0,0,0,0,0,0,0
201,29.0,0,0,1,0,1,0,0,0,0,0,0,0,0,1


In [14]:
# 5) predict on the test data with the fitted Logistic Regression model
# note: this is test data, so no new model is created and nothing is re-fitted;
#  the already-fitted `lr` is only asked to predict.
ypred = lr.predict(Xtest)

# evaluate the predictions against the held-out labels: accuracy first, then the
# confusion matrix, then the per-class precision/recall/f1 report
from sklearn import metrics

for evaluate in (metrics.accuracy_score,
                 metrics.confusion_matrix,
                 metrics.classification_report):
    print(evaluate(ytest, ypred))

0.7821229050279329
[[87 19]
 [20 53]]
              precision    recall  f1-score   support

           0       0.81      0.82      0.82       106
           1       0.74      0.73      0.73        73

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179

