### Feature engineering the dataset

This notebook requires the `train_le.csv` and `test_le.csv` datasets (run [data-01-label-encoding](./data-01-label-encoding.ipynb) first to generate them).

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as prep

In [2]:
# Load the label-encoded train and test splits (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_le.csv', './data/test_le.csv')
)

In [3]:
# Peek at the first five training rows.
train.iloc[:5]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450


In [4]:
# Peek at the first five test rows.
test.iloc[:5]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,34.5,,2,7.8292,"Kelly, Mr. James",0,892,3,1,0,0,330911
1,47.0,,3,7.0,"Wilkes, Mrs. James (Ellen Needs)",0,893,3,0,1,0,363272
2,62.0,,2,9.6875,"Myles, Mr. Thomas Francis",0,894,2,1,0,0,240276
3,27.0,,3,8.6625,"Wirz, Mr. Albert",0,895,3,1,0,0,315154
4,22.0,,3,12.2875,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,896,3,0,1,0,3101298


In [5]:
# Stack train on top of test so every feature is engineered on both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. As with append, the original row labels are
# kept (not reset), so labels repeat across the train and test halves.
concat = pd.concat([train, test])

In [6]:
# (rows, columns) of the stacked frame.
concat.shape

(1309, 12)

In [7]:
# Sanity check: this total should equal the row count of `concat`.
len(train) + len(test)

1309

Handle names first: split each `Name` into its parts and derive a `Title` feature from it.

In [8]:
# Break every name on commas and periods; each entry becomes a list of parts
# (see the sample output below: surname, title, remaining names).
name_column = concat['Name']
NameSplit = name_column.str.split('[,.]')

In [9]:
# First five split names.
NameSplit.iloc[:5]

0                          [Braund,  Mr,  Owen Harris]
1    [Cumings,  Mrs,  John Bradley (Florence Briggs...
2                           [Heikkinen,  Miss,  Laina]
3     [Futrelle,  Mrs,  Jacques Heath (Lily May Peel)]
4                         [Allen,  Mr,  William Henry]
Name: Name, dtype: object

In [10]:
# The parts list for the very first passenger.
NameSplit.iloc[0]

['Braund', ' Mr', ' Owen Harris']

In [11]:
# Same first five entries, viewed as a raw NumPy object array.
NameSplit.iloc[:5].values

array([['Braund', ' Mr', ' Owen Harris'],
       ['Cumings', ' Mrs', ' John Bradley (Florence Briggs Thayer)'],
       ['Heikkinen', ' Miss', ' Laina'],
       ['Futrelle', ' Mrs', ' Jacques Heath (Lily May Peel)'],
       ['Allen', ' Mr', ' William Henry']], dtype=object)

In [12]:
# The title is the second part of each split name; drop surrounding spaces.
titles = [parts[1].strip() for parts in NameSplit.tolist()]

In [13]:
# Preview the first ten extracted titles.
titles[:10]

['Mr', 'Mrs', 'Miss', 'Mrs', 'Mr', 'Mr', 'Mr', 'Master', 'Mrs', 'Mrs']

In [14]:
# Attach the extracted titles as a new column (one entry per row, in row order).
concat['Title'] = titles

In [15]:
# Check that the new Title column lines up with the names.
concat.iloc[:10]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,Mr
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,Mrs
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,Miss
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,Mrs
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,Mr
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,Mr
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,Mr
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,Master
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,Mrs
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,Mrs


In [16]:
# List every distinct title found across train and test.
concat['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [17]:
# redundancy: combine Mademoiselle and Madame into a single type
# Assign through .loc instead of writing into `concat.Title.values`: the latter
# only works when `.values` happens to be a writable view of the column, which
# pandas does not guarantee (the array is read-only under Copy-on-Write).
# The mask is passed as a plain boolean array so it is applied positionally.
is_mme_or_mlle = concat['Title'].isin(['Mme', 'Mlle']).values
concat.loc[is_mme_or_mlle, 'Title'] = 'Mlle'

In [18]:
# keep reducing the number of factor levels
# Each key is the merged title; the list holds the raw titles folded into it.
# NOTE(review): 'Jonkheer' is grouped under 'Lady' exactly as in the original
# cell - confirm that grouping is intended.
title_groups = {
    'Sir': ['Capt', 'Don', 'Major', 'Sir'],
    'Lady': ['Dona', 'Lady', 'the Countess', 'Jonkheer'],
}
for merged_title, raw_titles in title_groups.items():
    # Use .loc with a positional boolean mask rather than writing into
    # `.values`, which is not guaranteed to be a writable view of the column.
    in_group = concat['Title'].isin(raw_titles).values
    concat.loc[in_group, 'Title'] = merged_title

In [19]:
# Encoder used to turn string categories into integer codes.
# Note: this same `le` object is refit on FamilyID further down the notebook.
le = prep.LabelEncoder()

In [20]:
# Learn the set of title classes from the reduced Title column.
le.fit(concat['Title'])

LabelEncoder()

In [21]:
# Classes learned by the encoder; a title's integer code is its position here.
le.classes_

array(['Col', 'Dr', 'Lady', 'Master', 'Miss', 'Mlle', 'Mr', 'Mrs', 'Ms',
       'Rev', 'Sir'], dtype=object)

In [22]:
# Replace the string titles with their integer codes.
title_codes = le.transform(concat['Title'])
concat['Title'] = title_codes

In [23]:
# Title should now hold integer codes instead of strings.
concat.iloc[:10]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,6
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,7
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,4
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,7
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,6
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,6
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,6
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,3
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,7
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,7


New feature: family size, computed as `SibSp + Parch + 1`.

In [24]:
# FamilySize = SibSp + Parch + 1, computed on the raw arrays.
sibsp_counts = concat['SibSp'].values
parch_counts = concat['Parch'].values
concat['FamilySize'] = sibsp_counts + parch_counts + 1

In [25]:
# Check the new FamilySize column against SibSp and Parch.
concat.iloc[:10]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,FamilySize
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,6,2
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,7,2
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,4,1
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,7,2
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,6,1
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,6,1
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,6,1
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,3,5
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,7,3
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,7,2


New feature `FamilyID`: combine the surname with the family size to identify families. Members of the same family should share both the same surname and the same family size.

In [26]:
# The surname is the first part of each split name; drop surrounding spaces.
surnames = [parts[0].strip() for parts in NameSplit.tolist()]

In [27]:
# Preview the first ten extracted surnames.
surnames[:10]

['Braund',
 'Cumings',
 'Heikkinen',
 'Futrelle',
 'Allen',
 'Moran',
 'McCarthy',
 'Palsson',
 'Johnson',
 'Nasser']

In [28]:
# Attach the extracted surnames as a new column (one entry per row, in row order).
concat['Surname'] = surnames

In [29]:
# FamilyID is the surname immediately followed by the family size, e.g. 'Braund2'.
family_size_text = concat['FamilySize'].astype(str)
concat['FamilyID'] = concat['Surname'].str.cat(family_size_text, sep='')

In [30]:
# Check the new Surname and FamilyID columns.
concat.iloc[:10]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,FamilySize,Surname,FamilyID
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,6,2,Braund,Braund2
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,7,2,Cumings,Cumings2
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,4,1,Heikkinen,Heikkinen1
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,7,2,Futrelle,Futrelle2
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,6,1,Allen,Allen1
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,6,1,Moran,Moran1
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,6,1,McCarthy,McCarthy1
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,3,5,Palsson,Palsson5
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,7,3,Johnson,Johnson3
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,7,2,Nasser,Nasser2


In [31]:
# mark any family id as small if family size is less than or equal to 2
# Assign through .loc with a positional boolean mask instead of writing into
# `concat.FamilyID.values`, which is not guaranteed to be a writable view of
# the column (the array is read-only under pandas Copy-on-Write).
is_small_family = concat['FamilySize'].values <= 2
concat.loc[is_small_family, 'FamilyID'] = 'Small'

In [32]:
# Rows with FamilySize <= 2 should now show FamilyID 'Small'.
concat.iloc[:10]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,FamilySize,Surname,FamilyID
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,6,2,Braund,Small
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,7,2,Cumings,Small
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,4,1,Heikkinen,Small
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,7,2,Futrelle,Small
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,6,1,Allen,Small
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,6,1,Moran,Small
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,6,1,McCarthy,Small
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,3,5,Palsson,Palsson5
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,7,3,Johnson,Johnson3
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,7,2,Nasser,Small


In [33]:
# How many passengers carry each FamilyID.
concat['FamilyID'].value_counts()

Small                1025
Sage11                 11
Andersson7              9
Goodwin8                8
Asplund7                7
Fortune6                6
Rice6                   6
Skoog6                  6
Panula6                 6
Davies3                 5
Lefebre5                5
Ford5                   5
Ryerson5                5
Palsson5                5
Carter4                 4
Johnston4               4
Brown3                  4
Allison4                4
Baclini4                4
West4                   4
Herman4                 4
Dean4                   4
Laroche4                4
Becker4                 4
Widener3                3
McCoy3                  3
Peacock3                3
Caldwell3               3
Peter3                  3
Compton3                3
                     ... 
Hays3                   2
Kink-Heilmann3          2
Christy3                2
Frolicher-Stehli3       2
Lahtinen3               2
Jefferys3               2
Gustafsson3             2
Renouf4     

There are too many family IDs with only a few members; perhaps some relatives travelled under different last names, or something else is going on. Let's clean this up too by folding those rare IDs into `Small`.

In [34]:
# Build a list of (FamilyID, passenger count) pairs from a single value_counts call.
family_id_counts = concat.FamilyID.value_counts()
freq = list(zip(family_id_counts.index.tolist(), family_id_counts.values))
type(freq)

list

In [35]:
# Convert the (id, count) pairs to a 2-column array. Mixing strings and ints
# makes NumPy store both columns as strings (note the '<U17' dtype in the
# output), which is why the count column is cast back with .astype(int) below.
freq = np.array(freq)
freq[:10]

array([['Small', '1025'],
       ['Sage11', '11'],
       ['Andersson7', '9'],
       ['Goodwin8', '8'],
       ['Asplund7', '7'],
       ['Fortune6', '6'],
       ['Rice6', '6'],
       ['Skoog6', '6'],
       ['Panula6', '6'],
       ['Davies3', '5']], 
      dtype='<U17')

In [36]:
# (number of distinct FamilyIDs, 2)
freq.shape

(97, 2)

In [37]:
# Count the family ids that occur at most twice.
occurrences = freq[:, 1].astype(int)
freq[occurrences <= 2].shape

(36, 2)

In [38]:
# Keep only the rows for family ids that occur at most twice.
is_rare_id = freq[:, 1].astype(int) <= 2
freq = freq[is_rare_id]

In [39]:
# First column of the filtered array: the rare family ids themselves.
freq[:,0]

array(['Beckwith3', 'Vander Planke3', 'Kink3', 'Hocking4', 'Hamalainen3',
       'Richards3', 'Hays3', 'Kink-Heilmann3', 'Christy3',
       'Frolicher-Stehli3', 'Lahtinen3', 'Jefferys3', 'Gustafsson3',
       'Renouf4', 'Frauenthal3', 'Silven3', 'Jacobsohn4', 'Cornell3',
       'Vander Planke4', 'Nicholls3', 'Thomas3', 'Richards6',
       'Kink-Heilmann5', 'Hiltunen3', 'Newell3', 'Davidson4', 'Minahan3',
       'Hocking5', 'Douglas3', 'Newsom3', 'Hansen3', 'Hirvonen3',
       'Backstrom4', 'Strom3', 'Appleton3', 'Frolicher3'], 
      dtype='<U17')

In [40]:
# Fold the rare family ids (first column of `freq`) into 'Small'.
# Assign through .loc with a positional boolean mask instead of writing into
# `concat.FamilyID.values`, which is not guaranteed to be a writable view of
# the column (the array is read-only under pandas Copy-on-Write).
is_rare_family = concat['FamilyID'].isin(freq[:, 0]).values
concat.loc[is_rare_family, 'FamilyID'] = 'Small'

In [41]:
# Re-count: the rare ids should now be absorbed into 'Small'.
concat['FamilyID'].value_counts()

Small         1074
Sage11          11
Andersson7       9
Goodwin8         8
Asplund7         7
Rice6            6
Fortune6         6
Panula6          6
Skoog6           6
Ford5            5
Davies3          5
Ryerson5         5
Palsson5         5
Lefebre5         5
West4            4
Dean4            4
Johnston4        4
Baclini4         4
Brown3           4
Allison4         4
Laroche4         4
Carter4          4
Becker4          4
Herman4          4
Widener3         3
Navratil3        3
Sandstrom3       3
Klasen3          3
Dodge3           3
Moubarek3        3
              ... 
Touma3           3
Taussig3         3
Rosblom3         3
Thayer3          3
McCoy3           3
Elias3           3
Mallet3          3
Wells3           3
Johnson3         3
Compton3         3
Hickman3         3
Nakid3           3
Boulos3          3
Danbom3          3
Quick3           3
Hart3            3
Bourke3          3
Goldsmith3       3
Collyer3         3
Van Impe3        3
Crosby3          3
Samaan3     

In [42]:
# Inspect the cleaned FamilyID column.
concat.iloc[:10]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,FamilySize,Surname,FamilyID
0,22.0,,3,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0,A/5 21171,6,2,Braund,Small
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1,PC 17599,7,2,Cumings,Small
2,26.0,,3,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1,STON/O2. 3101282,4,1,Heikkinen,Small
3,35.0,C123,3,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1,113803,7,2,Futrelle,Small
4,35.0,,3,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0,373450,6,1,Allen,Small
5,,,2,8.4583,"Moran, Mr. James",0,6,3,1,0,0,330877,6,1,Moran,Small
6,54.0,E46,3,51.8625,"McCarthy, Mr. Timothy J",0,7,1,1,0,0,17463,6,1,McCarthy,Small
7,2.0,,3,21.075,"Palsson, Master. Gosta Leonard",1,8,3,1,3,0,349909,3,5,Palsson,Palsson5
8,27.0,,3,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,0,0,1,347742,7,3,Johnson,Johnson3
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,0,1,1,237736,7,2,Nasser,Small


In [43]:
# Integer-encode FamilyID. This refits the existing `le` encoder, so after
# this cell its classes describe FamilyID rather than Title.
concat['FamilyID'] = le.fit(concat['FamilyID']).transform(concat['FamilyID'])
concat['FamilyID'].unique()

array([50, 38, 28, 48,  2, 43,  3, 22, 31, 47, 39, 57, 24, 49, 35,  9, 21,
       17, 27, 41,  6, 37, 60, 46, 23, 30, 32,  5,  7, 11, 44, 54, 52,  0,
        1, 34, 45, 25, 58, 51, 13, 20, 59, 36, 10, 19, 55, 15, 18,  4, 42,
       14, 16, 53, 26,  8, 56, 29, 33, 12, 40])

In [44]:
# Columns kept for modelling, in output order (identifier, features, target).
final_columns = [
    'PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Title', 'Embarked', 'FamilySize',
    'FamilyID', 'Survived',
]
concat_reduce = concat[final_columns]
concat_reduce.iloc[:5]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Title,Embarked,FamilySize,FamilyID,Survived
0,1,3,1,22.0,1,0,7.25,6,3,2,50,0
1,2,1,0,38.0,1,0,71.2833,7,1,2,50,1
2,3,3,0,26.0,0,0,7.925,4,3,1,50,1
3,4,1,0,35.0,1,0,53.1,7,3,2,50,1
4,5,3,1,35.0,0,0,8.05,6,3,1,50,0


In [45]:
# Undo the earlier stacking: the first len(train) rows are the training set,
# the rest are the test set. Deriving the boundary from `train` instead of
# hard-coding 891 keeps the split right if the input files change size.
n_train = len(train)
train_final = concat_reduce.iloc[:n_train].copy()
test_final = concat_reduce.iloc[n_train:].copy()

In [46]:
# The test half should start at PassengerId 892.
test_final.iloc[:5]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Title,Embarked,FamilySize,FamilyID,Survived
0,892,3,1,34.5,0,0,7.8292,6,2,1,50,0
1,893,3,0,47.0,1,0,7.0,7,3,2,50,0
2,894,2,1,62.0,0,0,9.6875,6,2,1,50,0
3,895,3,1,27.0,0,0,8.6625,6,3,1,50,0
4,896,3,0,22.0,1,1,12.2875,7,3,3,50,0


In [47]:
train_final.to_csv('./data/train_final.csv', index=False)
test_final.to_csv('./data/test_final.csv', index=False)

%ls data

gender_submission.csv  test_le.csv      train_le.csv
test.csv               train.csv        train_split_final.csv
test_final.csv         train_final.csv  valid_split_final.csv
