In [1]:
#########
# Start #
#########

# import library
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [2]:
# load the data
df = pd.read_csv('./data/train.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [3]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
#########
# Title #
#########

# create Title column
df['Title'] = df.Name.map( lambda x: x.split(',')[1].split( '.' )[0].strip())

# inspect the number of people for each title
df['Title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Col               2
Major             2
Lady              1
Jonkheer          1
Don               1
Ms                1
Mme               1
Capt              1
the Countess      1
Sir               1
Name: Title, dtype: int64

In [5]:
# merge similar titles
df['Title'] = df['Title'].replace('Mlle', 'Miss')
df['Title'] = df['Title'].replace(['Mme','Lady','Ms'], 'Mrs')
df.Title.loc[ (df.Title !=  'Master') & (df.Title !=  'Mr') & (df.Title !=  'Miss') 
             & (df.Title !=  'Mrs')] = 'Others'

# inspect the number of people for each Title
df['Title'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Mr        517
Miss      184
Mrs       128
Master     40
Others     22
Name: Title, dtype: int64

In [6]:
# create TitleID column
count = 0
for t in df.Title.unique():
    df.loc[df.Title == t, 'TitleID'] = count
    count += 1

# inspect the number of people for each TitleID
df.TitleID.value_counts()

0.0    517
2.0    184
1.0    128
3.0     40
4.0     22
Name: TitleID, dtype: int64

In [7]:
##########
# Family #
##########

# create Fsize (family size) column
df['Fsize'] = df['SibSp'] + df['Parch'] + 1

# inspect the number of people for each Fsize
df.Fsize.value_counts()

1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: Fsize, dtype: int64

In [8]:
# create Surname column
df['Surname'] = df.Name.map( lambda x: x.split(',')[0].strip())

# create Family (surname) column
df['Family'] = df.Surname.astype(str) + '_' + df.Fsize.astype(str)

# inspect Family feature
df['Family'].describe()

count             891
unique            701
top       Andersson_7
freq                8
Name: Family, dtype: object

In [9]:
df[df.Fsize == 11]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,TitleID,Fsize,Surname,Family
159,160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S,Master,3.0,11,Sage,Sage_11
180,181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S,Miss,2.0,11,Sage,Sage_11
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S,Mr,0.0,11,Sage,Sage_11
324,325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S,Mr,0.0,11,Sage,Sage_11
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S,Miss,2.0,11,Sage,Sage_11
846,847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S,Mr,0.0,11,Sage,Sage_11
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S,Miss,2.0,11,Sage,Sage_11


In [10]:
# create FamilyID column
count = 0
for f in df.Family.unique():
    df.loc[df.Family == f, 'FamilyID'] = count
    count += 1

# inspect the number of unique FamilyID
len(df.FamilyID.unique())

701

In [11]:
############
# Embarked #
############

df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Title,Surname,Family
count,891,891,891,204,889,891,891,891
unique,891,2,681,147,3,5,667,701
top,"Graham, Mr. George Edward",male,CA. 2343,C23 C25 C27,S,Mr,Andersson,Andersson_7
freq,1,577,7,4,644,517,9,8


In [12]:
# fill the NAN with the most common S
df.Embarked.fillna('S' , inplace=True )

# check if there is any NAN
df.Embarked.isnull().sum(axis=0)

0

In [13]:
# create EmbarkedID column
i = 0
for e in df.Embarked.unique():
    df.loc[df.Embarked == e, 'EmbarkedID'] = i
    i += 1

# inspect the number of people for each EmbarkedID
df.EmbarkedID.value_counts()

0.0    646
1.0    168
2.0     77
Name: EmbarkedID, dtype: int64

In [14]:
########
# Fare #
########

# REFERENCE: https://www.kaggle.com/arjoonn/ticket-fare-analysis

# inspect the number of unique Ticket
len(df.Ticket.unique())

681

In [15]:
# create TicketShare column
df['TicketShare'] = 1

In [16]:
# inspect the same ticket with different fare
counts = []
for t in df.Ticket.unique():
    this = df.loc[df.Ticket == t, 'Fare']
    l = len(this.unique())
    counts.append(l)
    if l > 1:
        df.loc[df.Ticket == t, 'TicketShare'] = l
        print(df.loc[df.Ticket == t])
print(set(counts))

     PassengerId  Survived  Pclass                           Name   Sex   Age  \
138          139         0       3            Osen, Mr. Olaf Elon  male  16.0   
876          877         0       3  Gustafsson, Mr. Alfred Ossian  male  20.0   

     SibSp  Parch Ticket    Fare Cabin Embarked Title  TitleID  Fsize  \
138      0      0   7534  9.2167   NaN        S    Mr      0.0      1   
876      0      0   7534  9.8458   NaN        S    Mr      0.0      1   

        Surname        Family  FamilyID  EmbarkedID  TicketShare  
138        Osen        Osen_1     128.0         0.0            2  
876  Gustafsson  Gustafsson_1     330.0         0.0            2  
set([1, 2])


We see that there is just one case where the same ticket has a different Fare for the people associated with it. Who are these people?
According to Encyclopedia-Titanica they were travelling companions.

The fare differs since they had different destinations. What about the other cases?

In [17]:
df.Fare.describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [18]:
# transform fare from per ticket to per person
for t in df.Ticket.unique():
    this = df.loc[df.Ticket == t, 'Fare']
    no_of_people = this.count()
    df.loc[df.Ticket == t, 'TicketShare'] = no_of_people
    df.loc[df.Ticket == t, 'Fare'] = this / no_of_people

df.Fare.describe()

count    891.000000
mean      17.788989
std       21.218157
min        0.000000
25%        7.762500
50%        8.850000
75%       24.288200
max      221.779200
Name: Fare, dtype: float64

In [19]:
# inspect the number of people for shared ticket
df.TicketShare.value_counts()

1    547
2    188
3     63
4     44
7     21
6     18
5     10
Name: TicketShare, dtype: int64

In [20]:
########
# Ages #
########

# REFERENCE: http://www.ultravioletanalytics.com/2014/11/03/kaggle-titanic-competition-part-ii-missing-values/

# Populate missing ages  using RandomForestClassifier
def setMissingAges(df):
    
    # set features for the dataset
    age_df = df[['Age','EmbarkedID','Fare', 'Parch', 'SibSp','Pclass', 'TitleID', 'FamilyID']]
    
    # split into sets with 'with' and 'without' age values
    withAge = age_df.loc[(df.Age.notnull())]
    withoutAge = age_df.loc[(df.Age.isnull())]
    
    # All the other values are stored in the feature array
    x = withAge.values[:, 1::]
    
    # All age values are stored in a target array
    y = withAge.values[:, 0]
    
    # Create and fit a model
    rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
    rtr.fit(x, y)
    
    # Use the fitted model to predict the missing values
    predictedAges = rtr.predict(withoutAge.values[:, 1::])
    
    # Assign those predictions to the full data set
    df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges 
    
    return df

In [21]:
# set missing values of Age
setMissingAges(df)

# check if there is any NAN
df.Age.isnull().sum(axis=0)

0

In [22]:
df.Age.describe()

count    891.000000
mean      29.882683
std       13.866094
min        0.420000
25%       21.000000
50%       29.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [23]:
#######
# End #
#######

# drop unnecessary columns
df = df.drop(['EmbarkedID','TitleID','FamilyID', 'Surname', 'Cabin'], axis = 1)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Fsize,Family,TicketShare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr,2,Braund_2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs,2,Cumings_2,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss,1,Heikkinen_1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,26.55,S,Mrs,2,Futrelle_2,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr,1,Allen_1,1


In [24]:
# save the results
df.to_csv('./data/outcome.csv', index=False)