# Feature Engineering for Titanic Dataset

This notebook reproduces the feature engineering from Trevor Stephens' Kaggle Titanic tutorial, which you can find [here](http://trevorstephens.com/kaggle-titanic-tutorial/r-part-4-feature-engineering/). The main difference is the tooling: the tutorial uses R, whereas this notebook uses Python with the pandas library.

## Part 1: Label Encoding

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as prep

In [2]:
# List the files available in the input directory (IPython `%ls` magic).
%ls ../input

test.csv   train.csv


In [3]:
# Load the raw train and test splits from the input directory.
train_df, test_df = (
    pd.read_csv('../input/train.csv'),
    pd.read_csv('../input/test.csv'),
)

In [4]:
# Preview the first five rows of the test set exactly as loaded.
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Add a placeholder `Survived` column (all zeros) to the test set so that it
# has the same columns as the training set before the two are concatenated.
# These zeros are filler values, not real labels.

test_df['Survived'] = 0

In [6]:
# Confirm that the placeholder `Survived` column is now part of the test set.
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0


In [7]:
# Stack the train and test sets (train rows first) so that the label encoders
# below are fitted on the values of both splits.
#
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so use
# `pd.concat` instead. The two frames list `Survived` at different positions;
# `sort=True` orders the columns alphabetically, which is the column order
# shown in the recorded outputs of this notebook.
concat = pd.concat([train_df, test_df], ignore_index=True, sort=True)
concat.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0,373450


In [8]:
# Row/column counts of train, test and the combined frame, one per line.
print(train_df.shape, test_df.shape, concat.shape, sep='\n')

(891, 12)
(418, 12)
(1309, 12)


In [9]:
# Distinct raw values of the Sex column before encoding.
concat['Sex'].unique()

array(['male', 'female'], dtype=object)

In [10]:
# Label encoder used to turn categorical string columns into integer codes.
# The same instance is re-fitted for each column further down, so
# `le.classes_` only ever describes the most recently fitted column.
le = prep.LabelEncoder()

In [11]:
# Fit the encoder on Sex (fit returns the encoder itself) and show the
# learned classes; a label's position in this array is its integer code.
le.fit(concat['Sex']).classes_

array(['female', 'male'], dtype=object)

In [12]:
# Integer-encoded Sex values for every row of the combined frame.
Sex_le = le.transform(concat['Sex'])

# Show the first ten codes.
Sex_le[:10]

array([1, 0, 0, 0, 1, 1, 1, 1, 0, 0])

In [13]:
# Work on an independent copy so the raw combined frame stays untouched.
concat_le = concat.copy(deep=True)

concat_le.head(n=5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0,373450


In [14]:
# Replace the string Sex column with its integer codes.
concat_le['Sex'] = Sex_le

concat_le.head(n=5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450


In [15]:
# Check column dtypes: Sex should now be an integer column after encoding.
concat_le.dtypes

Age            float64
Cabin           object
Embarked        object
Fare           float64
Name            object
Parch            int64
PassengerId      int64
Pclass           int64
Sex              int64
SibSp            int64
Survived         int64
Ticket          object
dtype: object

In [16]:
# Print the distinct values of each low-cardinality column, one per line.
for column in ('Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'):
    print(concat[column].unique())

[0 1]
[3 1 2]
['male' 'female']
[1 0 3 4 2 5 8]
[0 1 2 5 3 4 6 9]
['S' 'C' 'Q' nan]


In [17]:
# Replace missing ports of embarkation with the string '0', so that
# "missing" becomes a category of its own for the label encoder.
embarked = concat.Embarked.fillna(value='0')
embarked.unique()

array(['S', 'C', 'Q', '0'], dtype=object)

In [18]:
# Re-fit the shared encoder on Embarked and convert the column to codes.
embarked = le.fit(embarked).transform(embarked)
# First ten encoded values.
embarked[0:10]

array([3, 1, 3, 3, 3, 2, 3, 3, 3, 1])

In [19]:
# Store the encoded Embarked values in the working copy.
concat_le['Embarked'] = embarked

In [20]:
# Sanity check: Sex and Embarked should both hold integer codes now.
concat_le.head(n=10)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736


In [21]:
# Split the combined frame back into train and test sets.
# The training rows were placed first when the frames were concatenated, so
# split at the training-set length rather than at a hard-coded 891; this keeps
# the split correct if the input files change.
train_le = concat_le.iloc[:len(train_df)].copy()
test_le = concat_le.iloc[len(train_df):].copy()

In [22]:
# Save the label-encoded splits.
# `%mkdir -p data` creates the output directory via the IPython magic; `-p`
# makes it a no-op when the directory already exists.
%mkdir -p data
# index=False: the row index is not written to the CSV files.
train_le.to_csv('./data/train_le.csv', index=False)
test_le.to_csv('./data/test_le.csv', index=False)

In [23]:
# Verify that the label-encoded CSVs were written.
# NOTE(review): the recorded listing also shows train_final.csv and
# test_final.csv, which this notebook only writes in a later cell -- the
# output appears to come from a directory left over from an earlier run.
%ls data

test_final.csv   test_le.csv      train_final.csv  train_le.csv


## Part 2: Further Feature Engineering

In [24]:
# Reload the label-encoded splits saved in Part 1.
train, test = (
    pd.read_csv('./data/train_le.csv'),
    pd.read_csv('./data/test_le.csv'),
)

In [25]:
# Concatenate the train and test frames again (train rows first).
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so use
# `pd.concat`. `ignore_index=True` gives the combined frame a fresh 0..n-1
# index; without it the test rows would repeat index labels already used by
# the training rows.
concat = pd.concat([train, test], ignore_index=True)

In [26]:
# Check the size of the combined frame; the row count should equal the sum of
# the train and test row counts computed in the next cell.
concat.shape

(1309, 12)

In [27]:
# Expected number of rows in the combined frame.
len(train) + len(test)

1309

### Feature engineer names

In [28]:
# Split each name on commas and periods: piece 0 is the surname and piece 1
# the title (e.g. "Braund, Mr. Owen Harris" -> ["Braund", " Mr", " Owen Harris"]).
NameSplit = concat['Name'].str.split('[,.]')

In [29]:
# Inspect the first few split names.
NameSplit.head(n=5)

0                          [Braund,  Mr,  Owen Harris]
1    [Cumings,  Mrs,  John Bradley (Florence Briggs...
2                           [Heikkinen,  Miss,  Laina]
3     [Futrelle,  Mrs,  Jacques Heath (Lily May Peel)]
4                         [Allen,  Mr,  William Henry]
Name: Name, dtype: object

In [30]:
# The title is the second piece of each split name; drop surrounding spaces.
titles = [pieces[1].strip() for pieces in NameSplit]
titles[0:10]

['Mr', 'Mrs', 'Miss', 'Mrs', 'Mr', 'Mr', 'Mr', 'Master', 'Mrs', 'Mrs']

In [31]:
# New feature: the title extracted from each passenger's name
# (the list is in the same row order as `concat`).
concat['Title'] = titles

In [32]:
# Distinct titles found in the data.
concat['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [33]:
# Redundancy: combine Mademoiselle and Madame into a single type.
# Assign through `.loc` instead of writing into `Series.values`: the array
# returned by `.values` is not guaranteed to be a writable view of the frame
# (with pandas Copy-on-Write it is read-only and the assignment raises).
# The mask is passed as a plain boolean array, so rows are selected by
# position regardless of the frame's index labels.
concat.loc[concat.Title.isin(['Mme', 'Mlle']).values, 'Title'] = 'Mlle'

In [34]:
# Keep reducing the number of factor levels by folding rare titles into
# 'Sir' and 'Lady'.
# Assign through `.loc` instead of writing into `Series.values`: the array
# returned by `.values` is not guaranteed to be a writable view of the frame
# (with pandas Copy-on-Write it is read-only and the assignment raises).
# Each mask is a plain boolean array, so rows are selected by position.
concat.loc[concat.Title.isin(['Capt', 'Don', 'Major', 'Sir']).values, 'Title'] = 'Sir'
concat.loc[concat.Title.isin(['Dona', 'Lady', 'the Countess', 'Jonkheer']).values, 'Title'] = 'Lady'

In [35]:
# Re-fit the shared encoder on the reduced set of titles and list the
# resulting classes (fit returns the encoder itself).
le.fit(concat['Title']).classes_

array(['Col', 'Dr', 'Lady', 'Master', 'Miss', 'Mlle', 'Mr', 'Mrs', 'Ms',
       'Rev', 'Sir'], dtype=object)

In [36]:
# Overwrite the string titles with their integer codes.
concat['Title'] = le.transform(concat['Title'])

In [37]:
# Inspect the encoded Title column.
concat.head(n=10)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,6
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,7
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,4
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,7
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,6
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,6
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,6
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,3
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,7
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,7


### New features family size and family id

In [38]:
# New feature: family size = siblings/spouses + parents/children + the
# passenger themselves.
concat['FamilySize'] = concat['SibSp'] + concat['Parch'] + 1

In [39]:
# Inspect the new FamilySize column.
concat.head(n=10)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,FamilySize
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,6,2
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,7,2
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,4,1
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,7,2
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,6,1
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,6,1
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,6,1
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,3,5
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,7,3
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,7,2


New feature `FamilyID`: combine each passenger's surname with their family size to identify families. Members of the same family should share both the same surname and the same family size.

In [40]:
# The surname is the first piece of each split name; drop surrounding spaces.
surnames = [pieces[0].strip() for pieces in NameSplit]
surnames[0:10]

['Braund',
 'Cumings',
 'Heikkinen',
 'Futrelle',
 'Allen',
 'Moran',
 'McCarthy',
 'Palsson',
 'Johnson',
 'Nasser']

In [41]:
# Store the surname, then build FamilyID as surname immediately followed by
# family size (e.g. "Palsson" with size 5 becomes "Palsson5").
concat['Surname'] = surnames
family_size_text = concat['FamilySize'].astype(str)
concat['FamilyID'] = concat['Surname'].str.cat(family_size_text, sep='')
concat.head(n=10)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,FamilySize,Surname,FamilyID
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,6,2,Braund,Braund2
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,7,2,Cumings,Cumings2
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,4,1,Heikkinen,Heikkinen1
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,7,2,Futrelle,Futrelle2
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,6,1,Allen,Allen1
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,6,1,Moran,Moran1
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,6,1,McCarthy,McCarthy1
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,3,5,Palsson,Palsson5
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,7,3,Johnson,Johnson3
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,7,2,Nasser,Nasser2


In [42]:
# Mark any family id as 'Small' if the family size is less than or equal to 2.
# Assign through `.loc` instead of writing into `Series.values`: the array
# returned by `.values` is not guaranteed to be a writable view of the frame
# (with pandas Copy-on-Write it is read-only and the assignment raises).
# The mask is a plain boolean array, so rows are selected by position.
concat.loc[(concat.FamilySize <= 2).values, 'FamilyID'] = 'Small'

In [43]:
# Families of one or two people should now show 'Small' as their FamilyID.
concat.head(n=10)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,FamilySize,Surname,FamilyID
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,6,2,Braund,Small
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,7,2,Cumings,Small
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,4,1,Heikkinen,Small
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,7,2,Futrelle,Small
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,6,1,Allen,Small
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,6,1,Moran,Small
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,6,1,McCarthy,Small
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,3,5,Palsson,Palsson5
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,7,3,Johnson,Johnson3
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,7,2,Nasser,Small


In [44]:
# How many passengers carry each family id?
concat['FamilyID'].value_counts()

Small             1025
Sage11              11
Andersson7           9
Goodwin8             8
Asplund7             7
Skoog6               6
Fortune6             6
Rice6                6
Panula6              6
Ryerson5             5
Davies3              5
Ford5                5
Lefebre5             5
Palsson5             5
Johnston4            4
Dean4                4
Allison4             4
Becker4              4
West4                4
Brown3               4
Carter4              4
Laroche4             4
Baclini4             4
Herman4              4
Quick3               3
Caldwell3            3
Wells3               3
Drew3                3
Goldsmith3           3
Boulos3              3
                  ... 
Hocking4             2
Hays3                2
Richards3            2
Kink3                2
Hamalainen3          2
Vander Planke3       2
Christy3             2
Thomas3              1
Richards6            1
Minahan3             1
Appleton3            1
Kink-Heilmann5       1
Backstrom4 

There are too many family IDs with only a few members; perhaps some families had different last names, or something else is going on. Let's clean this up too.

In [45]:
# Build a list of (family id, passenger count) pairs from the value counts.
family_counts = concat['FamilyID'].value_counts()
freq = list(zip(family_counts.index.tolist(), family_counts.values))
type(freq)

list

In [46]:
# Convert the (family id, count) pairs into a 2-column NumPy array.
# Mixing strings and integers in one array coerces the counts to strings
# (see the string dtype in the output), which is why later cells cast
# column 1 back to int before comparing.
freq = np.array(freq)
freq[:10]

array([['Small', '1025'],
       ['Sage11', '11'],
       ['Andersson7', '9'],
       ['Goodwin8', '8'],
       ['Asplund7', '7'],
       ['Skoog6', '6'],
       ['Fortune6', '6'],
       ['Rice6', '6'],
       ['Panula6', '6'],
       ['Ryerson5', '5']], dtype='<U17')

In [47]:
# Number of distinct family ids (rows) and the two columns (id, count).
freq.shape

(97, 2)

In [48]:
# Count the family ids that occur at most twice; column 1 holds the counts
# as strings, so cast to int before comparing.
is_rare = freq[:, 1].astype(int) <= 2
freq[is_rare].shape

(36, 2)

In [49]:
# Keep only the rows for family ids seen at most twice.
counts_as_int = freq[:, 1].astype(int)
freq = freq[counts_as_int <= 2]

In [50]:
# Assign 'Small' to the rare family ids collected in `freq` (column 0 holds
# the ids).
# Assign through `.loc` instead of writing into `Series.values`: the array
# returned by `.values` is not guaranteed to be a writable view of the frame
# (with pandas Copy-on-Write it is read-only and the assignment raises).
# The mask is a plain boolean array, so rows are selected by position.
concat.loc[concat.FamilyID.isin(freq[:, 0]).values, 'FamilyID'] = 'Small'

In [51]:
# Re-check the family id frequencies after folding rare ids into 'Small'.
concat['FamilyID'].value_counts()

Small            1074
Sage11             11
Andersson7          9
Goodwin8            8
Asplund7            7
Fortune6            6
Rice6               6
Skoog6              6
Panula6             6
Ryerson5            5
Davies3             5
Palsson5            5
Lefebre5            5
Ford5               5
Laroche4            4
Herman4             4
West4               4
Johnston4           4
Carter4             4
Brown3              4
Allison4            4
Becker4             4
Baclini4            4
Dean4               4
Quick3              3
Peter3              3
Dodge3              3
Navratil3           3
Wick3               3
Nakid3              3
                 ... 
Taussig3            3
Klasen3             3
Crosby3             3
Moubarek3           3
Peacock3            3
Rosblom3            3
Sandstrom3          3
Drew3               3
Spedden3            3
Johnson3            3
McCoy3              3
Collyer3            3
Elias3              3
Coutts3             3
van Billia

In [52]:
# Label-encode FamilyID: re-fit the shared encoder on the column, replace the
# strings with their integer codes, then list the distinct codes.
concat['FamilyID'] = le.fit(concat['FamilyID']).transform(concat['FamilyID'])
concat['FamilyID'].unique()

array([50, 38, 28, 48,  2, 43,  3, 22, 31, 47, 39, 57, 24, 49, 35,  9, 21,
       17, 27, 41,  6, 37, 60, 46, 23, 30, 32,  5,  7, 11, 44, 54, 52,  0,
        1, 34, 45, 25, 58, 51, 13, 20, 59, 36, 10, 19, 55, 15, 18,  4, 42,
       14, 16, 53, 26,  8, 56, 29, 33, 12, 40])

In [53]:
# Choose the useful features: keep the numeric/encoded columns and drop the
# free-text ones (Name, Ticket, Cabin, Surname).
useful_columns = [
    'PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Title', 'Embarked', 'FamilySize',
    'FamilyID', 'Survived',
]
concat_reduce = concat[useful_columns]
concat_reduce.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Title,Embarked,FamilySize,FamilyID,Survived
0,1,3,1,22.0,1,0,7.25,6,3,2,50,0
1,2,1,0,38.0,1,0,71.2833,7,1,2,50,1
2,3,3,0,26.0,0,0,7.925,4,3,1,50,1
3,4,1,0,35.0,1,0,53.1,7,3,2,50,1
4,5,3,1,35.0,0,0,8.05,6,3,1,50,0


In [54]:
# Split the reduced frame back into train and test sets.
# The training rows come first in the combined frame, so split at the
# training-set length rather than at a hard-coded 891; this keeps the split
# correct if the input files change.
train_final = concat_reduce.iloc[:len(train)].copy()
test_final = concat_reduce.iloc[len(train):].copy()

In [55]:
# Write the final feature sets to disk without the row index.
for frame, path in (
    (train_final, './data/train_final.csv'),
    (test_final, './data/test_final.csv'),
):
    frame.to_csv(path, index=False)

In [56]:
# Confirm the final feature CSVs sit next to the label-encoded ones.
%ls data

test_final.csv   test_le.csv      train_final.csv  train_le.csv
