In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold, cross_val_score
from sklearn import linear_model, metrics, grid_search, pipeline, preprocessing
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [129]:

# Read the training CSV, indexed by PassengerId, with Age parsed as float64
raw_data = pd.read_csv(
    "data/train.csv",
    index_col='PassengerId',
    dtype={"Age": np.float64},
)

# Show the first rows as a sanity check
raw_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [130]:
# (rows, columns) of the loaded frame
row_count, column_count = raw_data.shape
print((row_count, column_count))

(891, 11)


In [131]:
# True when at least one cell anywhere in the frame is missing
np.any(raw_data.isnull().values)

True

In [132]:
# Recode Sex in place: male -> 0, female -> 1.
# Rows are matched by label, so re-running the cell leaves already-coded rows alone.
for label, code in (('male', 0), ('female', 1)):
    raw_data.loc[raw_data.Sex == label, 'Sex'] = code
raw_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S


In [133]:
# Number of rows in the training frame; used below to spot columns with gaps
total_number_train = len(raw_data)
print("Train data: total_number = ", total_number_train)

Train data: total_number =  891


In [134]:
# List each training column that has missing values, with the number missing
for column in raw_data.columns:
    missing = total_number_train - raw_data[column].count()
    if missing != 0:
        print(column, missing)

Age 177
Cabin 687
Embarked 2


In [136]:
# Impute missing ages with the mean of the observed ages.
# The mean is taken from the "Age" column alone: DataFrame.mean() over the whole
# frame also tries to average the text columns (Name, Ticket, ...), which is
# wasted work and fails on pandas versions that do not silently skip them.
# NOTE(review): the mean is computed before the train/hold-out split below, so
# hold-out rows contribute to the value used to fill training rows.
raw_data["Age"] = raw_data["Age"].fillna(raw_data["Age"].mean())

In [137]:
# Re-check for missing cells now that Age has been filled
np.any(raw_data.isnull().values)

True

In [138]:
# Per-column dtype and non-null count, to see which columns still have gaps
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [139]:
# Preview the frame after Sex recoding and Age imputation
raw_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S


In [140]:
# Shuffle the rows with a fixed seed so the positional split that follows is repeatable.
# Note: raw_data is rebound to the shuffled frame.
np.random.seed(0)
shuffled_positions = np.random.permutation(len(raw_data))
raw_data = raw_data.iloc[shuffled_positions]
raw_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
496,0,3,"Yousseff, Mr. Gerious",0,29.699118,0,0,2627,14.4583,,C
649,0,3,"Willey, Mr. Edward",0,29.699118,0,0,S.O./P.P. 751,7.55,,S
279,0,3,"Rice, Master. Eric",0,7.0,4,1,382652,29.125,,Q
32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",1,29.699118,1,0,PC 17569,146.5208,B78,C
256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",1,29.0,0,2,2650,15.2458,,C


In [141]:
# The first 600 shuffled rows are used for fitting; the remainder is held out
X_train, X_test = raw_data.iloc[:600, :], raw_data.iloc[600:, :]

In [142]:
# Separate the Survived label from the features in both partitions
y_train, y_test = X_train['Survived'], X_test['Survived']
X_train = X_train.drop('Survived', axis=1)
X_test = X_test.drop('Survived', axis=1)


In [143]:
binary_data_columns = ['Sex']
# Boolean mask over X_train's columns: True where the column is a binary feature
binary_data_indices = X_train.columns.isin(binary_data_columns)

In [144]:
# Column names and the mask derived from them, one per line
print(binary_data_columns, binary_data_indices, sep='\n')

['Sex']
[False False  True False False False False False False False]


In [145]:
categorical_data_columns = ['Embarked', 'Name', 'Ticket', 'Cabin']
# Boolean mask over X_train's columns: True where the column is categorical
categorical_data_indices = X_train.columns.isin(categorical_data_columns)

In [146]:
# Column names and the mask derived from them, one per line
print(categorical_data_columns, categorical_data_indices, sep='\n')

['Embarked', 'Name', 'Ticket', 'Cabin']
[False  True False False False False  True False  True  True]


In [147]:
numeric_data_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
# Boolean mask over X_train's columns: True where the column is numeric
numeric_data_indices = X_train.columns.isin(numeric_data_columns)

In [148]:
# Column names and the mask derived from them, one per line
print(numeric_data_columns, numeric_data_indices, sep='\n')

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
[ True False False  True  True  True False  True False False]


In [149]:
# The categorical columns are not used as features: overwrite them with a constant
for frame in (X_train, X_test):
    frame[categorical_data_columns] = 0

In [150]:
# Baseline forest: 50 trees, depth capped at 20, fixed seed for repeatability
rfc = RandomForestClassifier(
    n_estimators=50,
    max_depth=20,
    random_state=0,
)

In [151]:
# Full modelling pipeline: select each feature group by column mask, preprocess it,
# concatenate the groups side by side and feed the result to the random forest.
# The selector lambdas read the module-level *_data_indices masks when they are
# called, so those masks must keep matching the column order of the data passed in.
estimator = pipeline.Pipeline(steps = [
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [
            # binary: selected and passed through unchanged
            ('binary_variables_processing', preprocessing.FunctionTransformer(lambda data: data[:, binary_data_indices])),

            # numeric: standardise to zero mean and unit variance.
            # with_mean / with_std are on/off flags, not target values: the previous
            # with_mean = 0. was falsy and silently switched centring off.
            ('numeric_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, numeric_data_indices])),
                ('scaling', preprocessing.StandardScaler(with_mean = True, with_std = True))
                        ])),

            # categorical: one-hot encode; values not seen during fit are ignored
            ('categorical_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, categorical_data_indices])),
                ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown = 'ignore'))
                        ])),
        ])),
    ('model_fitting', rfc)
    ]
)

In [152]:
%%time
estimator.fit(X_train, y_train)

Wall time: 218 ms


Pipeline(steps=[('feature_processing', FeatureUnion(n_jobs=1,
       transformer_list=[('binary_variables_processing', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x000002116EC48840>, pass_y=False,
          validate=True)), ('numeric_variables_processing', Pipeline(steps=[(...estimators=50, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])

In [153]:
# Accuracy of the fitted pipeline on the held-out rows
holdout_predictions = estimator.predict(X_test)
metrics.accuracy_score(y_test, holdout_predictions)

0.80756013745704469

In [155]:
# Names of every tunable parameter, including those of the nested pipeline steps
estimator.get_params(deep=True).keys()

dict_keys(['feature_processing__categorical_variables_processing__hot_encoding__n_values', 'feature_processing__categorical_variables_processing__selecting__validate', 'feature_processing__categorical_variables_processing__hot_encoding__categorical_features', 'model_fitting__random_state', 'feature_processing__binary_variables_processing', 'feature_processing__categorical_variables_processing__steps', 'steps', 'feature_processing__categorical_variables_processing__selecting', 'feature_processing__numeric_variables_processing__scaling__with_std', 'model_fitting__max_leaf_nodes', 'feature_processing__numeric_variables_processing__steps', 'feature_processing__categorical_variables_processing__hot_encoding__handle_unknown', 'feature_processing__numeric_variables_processing__selecting__accept_sparse', 'feature_processing__binary_variables_processing__accept_sparse', 'model_fitting__criterion', 'feature_processing__transformer_weights', 'model_fitting', 'feature_processing__binary_variables_

In [156]:
# Forest hyper-parameters to search, addressed through the 'model_fitting' step name
parameters_grid = dict(
    model_fitting__max_depth=[10, 20, 30],
    model_fitting__n_estimators=[30, 40, 50, 60, 70],
)

In [157]:
# Search over parameters_grid, scored by accuracy with 4-fold cross-validation
grid_cv = grid_search.GridSearchCV(
    estimator=estimator,
    param_grid=parameters_grid,
    scoring='accuracy',
    cv=4,
)

In [158]:
%%time
grid_cv.fit(X_train, y_train)

Wall time: 5.39 s


GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(steps=[('feature_processing', FeatureUnion(n_jobs=1,
       transformer_list=[('binary_variables_processing', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x000002116EC48840>, pass_y=False,
          validate=True)), ('numeric_variables_processing', Pipeline(steps=[(...estimators=50, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'model_fitting__n_estimators': [30, 40, 50, 60, 70], 'model_fitting__max_depth': [10, 20, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [159]:
# Best cross-validated score and the parameter combination that achieved it
print(grid_cv.best_score_, grid_cv.best_params_, sep='\n')

0.828333333333
{'model_fitting__n_estimators': 60, 'model_fitting__max_depth': 20}


In [160]:
# First ten hold-out predictions from the pipeline fitted before the grid search
baseline_predictions = estimator.predict(X_test)
baseline_predictions[:10]

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1], dtype=int64)

### Working with "real test" data

In [163]:
# Read the test CSV, indexed by PassengerId, with Age parsed as float64
test_data = pd.read_csv(
    "data/test.csv",
    index_col='PassengerId',
    dtype={"Age": np.float64},
)

In [164]:
# Preview the test rows as loaded
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [165]:
# Recode Sex in place with the same mapping used for the training data:
# male -> 0, female -> 1
for label, code in (('male', 0), ('female', 1)):
    test_data.loc[test_data.Sex == label, 'Sex'] = code
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S


In [166]:
binary_data_columns = ['Sex']
# Recompute the mask against test_data's columns. This rebinds the global that
# the pipeline's selector lambda reads at prediction time.
binary_data_indices = test_data.columns.isin(binary_data_columns)

In [167]:
# Column names and the mask derived from them, one per line
print(binary_data_columns, binary_data_indices, sep='\n')

['Sex']
[False False  True False False False False False False False]


In [168]:
categorical_data_columns = ['Embarked', 'Name', 'Ticket', 'Cabin']
# Recompute the mask against test_data's columns. This rebinds the global that
# the pipeline's selector lambda reads at prediction time.
categorical_data_indices = test_data.columns.isin(categorical_data_columns)

In [169]:
# Column names and the mask derived from them, one per line
print(categorical_data_columns, categorical_data_indices, sep='\n')

['Embarked', 'Name', 'Ticket', 'Cabin']
[False  True False False False False  True False  True  True]


In [170]:
numeric_data_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
# Recompute the mask against test_data's columns. This rebinds the global that
# the pipeline's selector lambda reads at prediction time.
numeric_data_indices = test_data.columns.isin(numeric_data_columns)

In [171]:
# Column names and the mask derived from them, one per line
print(numeric_data_columns, numeric_data_indices, sep='\n')

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
[ True False False  True  True  True False  True False False]


In [172]:
# The categorical columns are not used as features: overwrite them with a constant,
# as was done for X_train / X_test. The assignment was previously issued twice in
# a row; once is sufficient.
test_data[categorical_data_columns] = 0

In [174]:
# Row count of the test set.
# NOTE: the name total_number_train is kept because the missing-value check that
# follows reads it; from here on it holds the test-set size, not the training size.
total_number_train = test_data.shape[0]
print("Test data: total_number = ", total_number_train)

Train data: total_number =  418


In [175]:
# List each test-data column that has missing values, with the number missing
for column in test_data.columns:
    missing = total_number_train - test_data[column].count()
    if missing != 0:
        print(column, missing)

Age 86
Fare 1


In [179]:
# Fill the gaps in Age and Fare with each column's own mean.
# Taking the mean of the single column avoids DataFrame.mean() averaging every
# column of the frame (twice) just to read one entry each time.
# NOTE(review): these are test-set means; the training data was filled with its
# own mean, so the two sets are imputed with different values.
for column in ("Age", "Fare"):
    test_data[column] = test_data[column].fillna(test_data[column].mean())

In [180]:
# Confirm no missing cells remain in the test frame
np.any(test_data.isnull().values)

False

In [190]:
# Preview the test frame after recoding, blanking the categoricals and imputing
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,0,0,34.5,0,0,0,7.8292,0,0
893,3,0,1,47.0,1,0,0,7.0,0,0
894,2,0,0,62.0,0,0,0,9.6875,0,0
895,3,0,0,27.0,0,0,0,8.6625,0,0
896,3,0,1,22.0,1,1,0,12.2875,0,0


In [210]:
# Passenger ids of the test rows (the frame's index) as a uint16 array
PassengerId = test_data.index.values.astype(np.uint16)

In [211]:
# Predict survival for the test rows and show the first ten labels.
# NOTE(review): this uses the baseline `estimator`, not the grid-search winner
# (`grid_cv.best_estimator_`) - confirm that is intended.
res = estimator.predict(X=test_data)
res[:10]

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0], dtype=int64)

In [212]:
# First ten ids, to eyeball alignment with the predictions shown above
PassengerId[:10]

array([892, 893, 894, 895, 896, 897, 898, 899, 900, 901], dtype=uint16)

In [213]:
# Two-column table: passenger id next to its predicted label
results = np.column_stack((PassengerId, res))

In [214]:
# Expect one row per test passenger and two columns (id, prediction)
results.shape

(418, 2)

In [216]:
# Write the (id, prediction) rows as comma-separated integers.
# NOTE(review): no header row is written - add one if the consumer of this file
# expects column names.
np.savetxt(
    "titanic_rf_pipeline_results.csv",
    results,
    fmt='%d',
    delimiter=',',
)