In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Categorize the age
def age_grouped(df):
    df.Age = df.Age.fillna(-0.5)
    bins = (-1, 0, 5, 13, 20, 35, 65, 120)
    age_categories = ['Unknown', 'Baby', 'Child', 'Teenager', 'Youth', 'Adult', 'Senior']
    categories = pd.cut(df.Age, bins, labels=age_categories)
    df.Age = categories
    return df

# Categorize the Fare price by quartile bins (quartile information obtained from df_train.Fare.describe()).
def fare_grouped(df):
    df.Fare = df.Fare.fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 515)
    quartile_labels = ['Unknown', '1', '2', '3', '4']
    categories = pd.cut(df.Fare, bins, labels=quartile_labels)
    df.Fare = categories
    return df

# Each Cabin starts with a letter which means there are categorized into a group (which might mean something). I want to slice out the first letter before the numbers.

def cabins_firstletter(df):
    df.Cabin = df.Cabin.fillna('N')
    df.Cabin = df.Cabin.apply(lambda x: x[0])
    return df
# Slice out each passenger "Title (Prefix)" 

def name_title(df):
    df['Name_title'] = df.Name.apply(lambda x: x.split(' ')[1])
    return df 
# To transform everything I did above

def transform_features(df):
    df = age_grouped(df)
    df = fare_grouped(df)
    df = cabins_firstletter(df)
    df = name_title(df)
    return df

df_train = transform_features(df)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_title
0,1,0,3,"Braund, Mr. Owen Harris",male,Youth,1,0,A/5 21171,1,N,S,Mr.
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,Adult,1,0,PC 17599,4,C,C,Mrs.
2,3,1,3,"Heikkinen, Miss. Laina",female,Youth,0,0,STON/O2. 3101282,1,N,S,Miss.
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,Youth,1,0,113803,4,C,S,Mrs.
4,5,0,3,"Allen, Mr. William Henry",male,Youth,0,0,373450,2,N,S,Mr.


In [4]:
def base_features(df):
    return df[['SibSp', 'Parch']]
base_features_tf = FunctionTransformer(base_features, validate = False)

def dummies(df):
    cols = ['Pclass', 'Sex', 'Embarked']
    return pd.get_dummies(df[cols], columns = cols)
dummies_tf = FunctionTransformer(dummies, validate = False)

def cabin(df):
    col = ['Cabin', 'Age']
    return pd.get_dummies(df[col], columns = col)
cabin_tf= FunctionTransformer(cabin, validate = False)


In [6]:
fu = FeatureUnion([
    ('base_features_tf', base_features_tf),
    ('dummies_tf', dummies_tf),
    ('cabin_tf', cabin_tf)
])

model = GradientBoostingClassifier()

pipe = Pipeline([
    ('fu', fu),
    ('model', model)
])

In [7]:
params = {}
gs = GridSearchCV(pipe, param_grid =params)
gs.fit(df, df['Survived'])
print gs.best_score_
gs.best_params_

ValueError: Number of features of the model must match the input. Model n_features is 26 and input n_features is 25 

In [None]:
test = pd.read_csv('test.csv')
pred = gs.predict(test)
test.shape

In [None]:
pred.shape

In [None]:
test['Survived'] = pred
test[['PassengerId', 'Survived']].to_csv('submission3.csv', index = False)