In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
train = pd.read_csv(open('../Data/Kaggle/Titanic/train.csv'))

In [3]:
test = pd.read_csv(open('../Data/Kaggle/Titanic/test.csv'))

In [4]:
test['Test'] = 1

In [5]:
train['PassengerId'].count()

891

In [6]:
test['PassengerId'].count()

418

In [7]:
combine = pd.concat([test, train], sort=False, ignore_index=True)

In [8]:
combine[(combine['Test'] != 1)].count()

PassengerId    891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
Test             0
Survived       891
dtype: int64

In [9]:
combine[(combine['Test'] == 1)].count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
Test           418
Survived         0
dtype: int64

In [10]:
combine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Test           418 non-null float64
Survived       891 non-null float64
dtypes: float64(4), int64(4), object(5)
memory usage: 133.0+ KB


In [11]:
combine['Is_male']   = combine['Is_female'] = combine['Sex']
combine['Is_male']   = combine['Is_male'].map(lambda x : 1 if x == 'male' else 0)
combine['Is_female'] = combine['Is_female'].map(lambda x : 1 if x == 'female' else 0)

In [12]:
g_embarked = combine.groupby(by='Embarked')

In [13]:
embarked_dict = {}
for v, k in enumerate(g_embarked.groups.items()):
    embarked_dict.setdefault(k[0], v)

In [14]:
combine['Embarked_1'] = combine.loc[combine['Embarked'].notnull(), 'Embarked'].map(lambda x : embarked_dict[x])

In [16]:
combine.loc[combine['Embarked'].isnull(), ['Embarked']] = -1

In [17]:
title = ['Dr.', 'Rev.', 'Master.', 'Mrs.', 'Miss.', 'Mr.', 'Mlle.', 'Ms.', 'Capt']
title_dict = {}
for v, k in enumerate(title):
    title_dict[k] = v

In [18]:
combine['Title_Num'] = combine['Name'].map(lambda x : title_dict.get(x.split(' ')[1]), '')
combine['Title']     = combine['Name'].map(lambda x : x.split(' ')[1] if title_dict.get(x.split(' ')[1]) else '')
g_title = combine.groupby(by=['Title', 'Survived'])

In [19]:
def to_int(p_num):
    try:
        int(p_num)
        return False
    except:
        return True

In [20]:
combine[combine['Ticket'].map(lambda x : to_int(x))].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Test,Survived,Is_male,Is_female,Embarked_1,Title_Num,Title
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,1.0,,1,0,2.0,5.0,Mr.
14,906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",female,47.0,1,0,W.E.P. 5734,61.175,E31,S,1.0,,0,1,2.0,3.0,Mrs.
15,907,2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",female,24.0,1,0,SC/PARIS 2167,27.7208,,C,1.0,,0,1,1.0,,
18,910,3,"Ilmakangas, Miss. Ida Livija",female,27.0,1,0,STON/O2. 3101270,7.925,,S,1.0,,0,1,2.0,4.0,Miss.
20,912,1,"Rothschild, Mr. Martin",male,55.0,1,0,PC 17603,59.4,,C,1.0,,1,0,1.0,5.0,Mr.


In [21]:
combine[(combine['Test'] == 1)].count()['Test']

418

In [22]:
combine[combine['Fare'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Test,Survived,Is_male,Is_female,Embarked_1,Title_Num,Title
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,1.0,,1,0,2.0,5.0,Mr.


In [23]:
combine.loc[combine['Fare'].isnull(), 'Fare'] = combine[(combine['Pclass'] == 3) & (combine['Title'] == 'Mr.') & (combine['Is_male'] == 1)].mean()['Fare']

In [24]:
combine[(combine['Test'] == 1)].count()['Test']

418

In [25]:
combine.loc[combine['Embarked_1'].isnull(), 'Embarked_1'] = combine[(combine['Pclass'] == 1) & (combine['Title'] == 'Miss.') & (combine['Is_female'] == 1)].median()['Embarked_1']

In [26]:
combine[(combine['Test'] == 1)].count()['Test']

418

In [27]:
for i in combine.columns:
    print('{0} = {1}'.format(i, combine[combine[i].isnull()]['PassengerId'].count()))

PassengerId = 0
Pclass = 0
Name = 0
Sex = 0
Age = 263
SibSp = 0
Parch = 0
Ticket = 0
Fare = 0
Cabin = 1014
Embarked = 0
Test = 891
Survived = 418
Is_male = 0
Is_female = 0
Embarked_1 = 0
Title_Num = 47
Title = 0


In [28]:
used_columns = ['Pclass', 'Fare', 'Is_male', 'Is_female', 'Survived']

In [29]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [30]:
X_train = combine[combine['Test'] != 1][used_columns]
X_test  = combine[combine['Test'] == 1][used_columns]

In [31]:
X_train['Pclass'].count()

891

In [32]:
X_test['Pclass'].count()

418

In [33]:
y_train = np.asarray(X_train['Survived'], dtype="|S6")
y_test = np.asarray(X_test['Survived'], dtype="|S6")

In [34]:
del X_train['Survived']
del X_test['Survived']

In [35]:
rfe = model.fit(X_train, y_train)

In [36]:
predict = rfe.predict(X_test)

In [37]:
result = pd.DataFrame(predict)

In [38]:
result.columns = ['Survived']

In [39]:
res = pd.merge(test, result, how='inner', left_index=True, right_index=True)

In [40]:
res = res[['PassengerId', 'Survived']]

In [41]:
res['Survived'] = res['Survived'].astype(float).astype(int)

In [42]:
res.to_csv('submission.csv', sep=',', encoding='utf8', index=False)