# Introduction
<br>
**Context**
<br>
This notebook was created to participate in the Kaggle competition "[Titanic: Machine Learning fro Disaster](https://www.kaggle.com/c/titanic)".
<br>

**Objective**
<br>
The goal of the competition is to predict which passengers will survive the tragedy.
<br>

**Results**
<br>
On August of 2018, I got an accuracy score of 77%.
<br>

# Feature exploration, engineering and cleaning
<br>
First, let's set up our environment.

In [1]:
# Ignore warning messages
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Perform computations
from math import floor

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.grid_search import ParameterGrid
import xgboost

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold, KFold
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# No warning
pd.options.mode.chained_assignment = None

In [2]:
# Upload datasets
train = pd.read_csv('train.csv')
train_passenger_ids = train.PassengerId.tolist()

test = pd.read_csv('test.csv')
test_passenger_ids = test.PassengerId.tolist()

all_data = pd.concat([train, test], sort=False, ignore_index=True)

## 1. Creating new features
<br>
### A. From the _Cabin_ column ...
<br>... we are going to extract:
- <u>has_cabin_number</u> 1 if the traveller had a cabin number.
- <u>cabin_deck</u>: deck where the cabin was located.
- <u>cabin_room_number</u>: room number of the cabin.
- <u>cabin_type</u>: "Suite" if the traveller had several rooms.
- <u>cabin_weird_number</u>: 1 if the cabin number is like "F H30"

In [3]:
# UNDERSTANDING NAN
print("Share of travellers without a cabin number, broken down by Pclass:")
print(all_data[all_data.Cabin.isna()].groupby(['Pclass']).count().PassengerId/all_data.groupby(['Pclass']).count().PassengerId)
## We can't deduct anything from a NaN in this column. Some 1st class travellers don't have any Cabin number, and some 3rd class do.

all_data["has_cabin_number"] = all_data.Cabin.notnull()*1


# CREATING NEW FEATURES
rows_with_cabin = all_data[all_data.Cabin.notnull()].index

## 1. Extracting the deck
all_data["cabin_deck"] = np.nan
for index in rows_with_cabin:
    deck = all_data.Cabin[index][:1]
    if len(deck) == 0:
        all_data["cabin_deck"][index] = "Unknown"
    else:
        all_data["cabin_deck"][index] = deck

## 2. Extracting the room number
#   NB: If multiple cabins, we take the number of the last room:
all_data["cabin_room_number"] = np.nan
for index in rows_with_cabin:
    # Rooms in the deck (Cabin = T) didn't have any number
    if len(all_data.Cabin[index])==1:
        all_data["cabin_room_number"][index] = np.nan

    # We take the last character of the string if ...    
    elif len(all_data.Cabin[index])==2:
        raw_number = int(all_data["Cabin"][index][-1:])
        lower_boundary = str(50 * floor(raw_number/50))
        upper_boundary = str(50 * (floor(raw_number/50) +1))
        all_data["cabin_room_number"][index] = lower_boundary + " - " + upper_boundary
    
    # We take the last 2 characters of the string if ...
    elif (len(all_data.Cabin[index])==3
        or (len(all_data.Cabin[index]) in (5,7) and all_data.Cabin[index].count(' ')==1)
        or (len(all_data.Cabin[index])==11 and all_data.Cabin[index].count(' ')==2)
        or len(all_data.Cabin[index])>11
       ):
        raw_number = int(all_data["Cabin"][index][-2:])
        lower_boundary = str(50 * floor(raw_number/50))
        upper_boundary = str(50 * (floor(raw_number/50) +1))
        all_data["cabin_room_number"][index] = lower_boundary + " - " + upper_boundary
    # We take the last 3 characters of the string if ...
    elif (len(all_data.Cabin[index])==4
          or (all_data.Cabin[index].count(' ')==1 and len(all_data.Cabin[index])==6)
          or (all_data.Cabin[index].count(' ')==1 and len(all_data.Cabin[index])>7)
          or (all_data.Cabin[index].count(' ')==2 and len(all_data.Cabin[index])>11)
         ):
        raw_number =  int(all_data["Cabin"][index][-3:])
        lower_boundary = str(50 * floor(raw_number/50))
        upper_boundary = str(50 * (floor(raw_number/50) +1))
        all_data["cabin_room_number"][index] = lower_boundary + " - " + upper_boundary
    # Let's check that we're not missing any case scenario
    else: 
        all_data["cabin_room_number"][index] = 'Catch me if you can'


## 3. Extracting the number of cabins
all_data["cabin_type"] = np.nan
for index in rows_with_cabin:
    if (all_data.Cabin[index].count(' ')>1) or (all_data.Cabin[index].count(' ')==1 and len(all_data.Cabin[index])>5):
        all_data["cabin_type"][index] = "Suite"
    else:
        all_data["cabin_type"][index] = "Standard"


## 4. Isolating weird cases
all_data["cabin_weird_number"] = 0
for index in rows_with_cabin:
    if all_data.Cabin[index].count(' ')==1 and len(all_data.Cabin[index])==5:
        all_data["cabin_weird_number"][index] = 1

Share of travellers without a cabin number, broken down by Pclass:
Pclass
1    0.207430
2    0.916968
3    0.977433
Name: PassengerId, dtype: float64


### B. From the _Ticket_ column ...
<br>... we are going to extract:
- <u>ticket_prefix</u>: if exist, the letters at the beginning of the ticket number.
- <u>ticket_number</u>: if the ticket has a prefix, it gives the numbers after the prefix. Otherwise, it equals the existing Ticket feature.
- <u>ticket_n_pax</u>: number of travellers with the same ticket..

In [4]:
rows_with_ticket = all_data[all_data.Ticket.notnull()].index

## 1. Extracting ticket prefix
all_data["ticket_prefix"] = np.nan

for index in rows_with_ticket:
    if all_data.Ticket[index].find(" ") != -1:
        all_data["ticket_prefix"][index] = all_data.Ticket[index][:all_data.Ticket[index].find(" ")]
    else:
        all_data["ticket_prefix"][index] = "No prefix"


## 2. Extracing ticket number
all_data["ticket_number"] = np.nan

for index in rows_with_ticket:
    all_data["ticket_number"][index] = all_data.Ticket[index][-(len(all_data.Ticket[index])-all_data.Ticket[index].find(" ")):]


## 3. Computing the number of travellers with same ticket
n_pax = all_data.groupby("Ticket").count().PassengerId.to_frame()
n_pax.columns = ['ticket_n_pax']
all_data = all_data.merge(n_pax, how = "left", on = "Ticket", suffixes=('', '_y'), copy = False)

### C. From the _Name_ column ...
... we are going to extract:
- <u>name_title</u>: the title of the traveller.
- <u>has_nickname</u>: 1 if a nickname is specified.
- <u>has_maiden_name</u>: 1 if a maiden name is specified.
- <u>has_multiple_last_names</u>: 1 if the traveller has several last names.
- <u>has_multiple_first_names</u>: 1 if the traveller has several first names.
- <u>name_has_particle</u>: 1 if the traveller has a particle in his last name.
- <u>name_has_dash</u>: 1 if the traveller has a "-" in his name.

In [5]:
# 1. Extracting the title
all_data["name_title"] = np.nan
for index in range(0,len(all_data)):
    # The title is after the comma, and before the point
    all_data["name_title"][index] = all_data.Name[index][all_data.Name[index].find(',')+2:all_data.Name[index].find('.')]

# 2. Extracting if there is a nickname (")
all_data["has_nickname"] = 0
for index in range(0,len(all_data)):
    # A nickname is separated by quotation marks
    if '"' in all_data.Name[index]:
        all_data["has_nickname"][index] = 1

# 3. Extracting if there is a maiden name
all_data["has_maiden_name"] = np.nan
for index in range(0,len(all_data)):
    if '(' in all_data.Name[index] and all_data.Sex[index]=="female":
        all_data["has_maiden_name"][index] = 1
    elif all_data.Sex[index] == "female":
        all_data["has_maiden_name"][index] = 0

# 4. Extracting the last name(s)
all_data["last_name"] = np.nan
for index in range(0,len(all_data)):
    all_data["last_name"][index] = all_data.Name[index][:all_data.Name[index].find(",")]

# 5. Extracting the number of last names
all_data["n_last_names"] = np.nan
for index in range(0,len(all_data)):
    all_data["n_last_names"][index] = all_data.Name[index][:all_data.Name[index].find(",")].count(" ")

# 6. Extracting has_multiple_first_names
all_data["has_multiple_last_names"] = np.nan
for index in range(0,len(all_data)):
    if all_data.n_last_names[index] > 1:
        all_data["has_multiple_last_names"][index] = 1
    else:
        all_data["has_multiple_last_names"][index] = 0

# 7. Extracting the first name(s)
all_data["first_name"] = np.nan
for index in range(0,len(all_data)):
    if "(" in all_data.Name[index] and '"' in all_data.Name[index]:
        position = min(all_data.Name[index].find("("), all_data.Name[index].find('"'))
        all_data["first_name"][index] = all_data.Name[index][all_data.Name[index].find(".")+2:position]

    elif "(" in all_data.Name[index]:
        position = all_data.Name[index].find("(")
        all_data["first_name"][index] = all_data.Name[index][all_data.Name[index].find(".")+2:position]

    elif '"' in all_data.Name[index]:
        position = all_data.Name[index].find('"')
        all_data["first_name"][index] = all_data.Name[index][all_data.Name[index].find(".")+2:position]

    else:
        all_data["first_name"][index] = all_data.Name[index][all_data.Name[index].find(".")+2:]

# 8. Extracting the number of first names
all_data["n_first_names"] = np.nan
for index in range(0,len(all_data)):
    if "(" in all_data.Name[index] and '"' in all_data.Name[index]:
        position = min(all_data.Name[index].find("("), all_data.Name[index].find('"'))
        all_data["n_first_names"][index] = all_data.Name[index][all_data.Name[index].find(".")+2:position].count(" ")

    elif "(" in all_data.Name[index]:
        position = all_data.Name[index].find("(")
        all_data["n_first_names"][index] = all_data.Name[index][all_data.Name[index].find(".")+2:position].count(" ")

    elif '"' in all_data.Name[index]:
        position = all_data.Name[index].find('"')
        all_data["n_first_names"][index] = all_data.Name[index][all_data.Name[index].find(".")+2:position].count(" ")

    else:
        all_data["first_name"][index] = all_data.Name[index][all_data.Name[index].find(".")+2:].count(" ")

# 9. Extracting has_multiple_first_names
all_data["has_multiple_first_names"] = np.nan
for index in range(0,len(all_data)):
    if all_data.n_first_names[index] > 1:
        all_data["has_multiple_first_names"][index] = 1
    else:
        all_data["has_multiple_first_names"][index] = 0

# 10. Extracting whether there is a particle in the name
all_data["name_has_particle"] = 0
for index in range(0,len(all_data)):
    if all_data.Name[index][:2] == "de":
        all_data["name_has_particle"][index] = 1

# 11. Extracting whethere there is a dash in the name
all_data["name_has_dash"] = 0
for index in range(0,len(all_data)):
    if "-" in all_data.Name[index]:
        all_data["name_has_dash"][index] = 1

### D. From the _Age_ column ...
<br>Let's simply group ages and replace missing values with "unknown".

In [6]:
all_data["age_group"] = 'unknown'
rows_with_age = all_data[all_data.Age.notnull()].index

for index in rows_with_age:
    lower_boundary = str(10 * floor(all_data.Age[index]/10))
    upper_boundary = str(10 * (floor(all_data.Age[index]/10)+1))
    all_data["age_group"][index] = lower_boundary + " - " + upper_boundary

### E. From _SibSp_ and _Parch_ columns ...
<br>...we are going to extract:
- <u>sibsp_group</u>: a grouped version of SibSp.
- <u>parch_group</u>: a grouped version of Parch.
- <u>relatives_group</u>: the number of relatives (SibSp + Parch).
- <u>is_alone</u>: 1 if the traveller doesn't have any relative.

In [7]:
# 1. Grouping SibSp
all_data["sibsp_group"] = all_data.SibSp
for index in range(0,len(all_data)):
    if all_data.SibSp[index] > 4:
        all_data["sibsp_group"][index] = "5+"

# 2. Grouping Parch
all_data["parch_group"] = all_data.Parch
for index in range(0,len(all_data)):
    if all_data.Parch[index] > 2:
        all_data["parch_group"][index] = "3+"

# 3. Computing total number of related
all_data["n_relatives"] = all_data.SibSp + all_data.Parch

# 4. Grouping relatives
all_data["relatives_group"] = all_data.n_relatives
for index in range(0,len(all_data)):
    if all_data.n_relatives[index] > 4:
        all_data["relatives_group"][index] = "5+"

# 5. Creating is_alone
all_data["is_alone"] = 0
for index in range(0,len(all_data)):
    if all_data.n_relatives[index] == 0:
        all_data["is_alone"][index] = 1

### F. From the _Fare_ column ...
<br>Let's simply group fares.

In [8]:
rows_with_fare = all_data[all_data.Fare.notnull()].index

all_data["fare_group"] = np.nan

for index in rows_with_fare:
    if all_data.Fare[index] == 0:
        all_data["fare_group"][index] = "invited"
        
    elif all_data.Fare[index] >= 100:
        all_data["fare_group"][index] = "100+"
    
    elif all_data.Fare[index] >= 40:
        lower_boundary = str(20 * floor(all_data.Fare[index]/20))
        upper_boundary = str(20 * (floor(all_data.Fare[index]/20)+1))
        all_data["fare_group"][index] = lower_boundary + " - " + upper_boundary
        
    elif all_data.Fare[index] >= 15:
        lower_boundary = str(5 * floor(all_data.Fare[index]/5))
        upper_boundary = str(5 * (floor(all_data.Fare[index]/5)+1))
        all_data["fare_group"][index] = lower_boundary + " - " + upper_boundary

    else:
        lower_boundary = str(2 * floor(all_data.Fare[index]/2))
        upper_boundary = str(2 * (floor(all_data.Fare[index]/2)+1))
        all_data["fare_group"][index] = lower_boundary + " - " + upper_boundary

## 2. Selecting features

In [9]:
cols_to_drop = ["Name", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "ticket_number", "last_name", "n_last_names", "first_name", "n_first_names", "n_relatives"]
all_data = all_data.drop(cols_to_drop, axis = 1)
print("%d features were dropped." %(len(cols_to_drop)))

13 features were dropped.


## 3. Transforming categorical features

In [10]:
cols_not_to_encode = ["PassengerId", "Survived"]
cols_to_encode = list(set(all_data.columns.tolist())-set(cols_not_to_encode))

all_data = pd.get_dummies(all_data, columns=cols_to_encode)
print("We now have %d features." %(len(all_data.columns.tolist())))

We now have 161 features.


## 4. Splitting train and test sets

In [11]:
train = all_data[all_data.PassengerId.isin(train_passenger_ids)]
test = all_data[all_data.PassengerId.isin(test_passenger_ids)].drop("Survived", axis = 1)

# Generating our Base First-Level Models

In [12]:
# Creating X and y
X = train.drop("Survived", axis = 1)
y = train.Survived

## 1. Defining base parameters

In [13]:
# Random Forest
rf_grid = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
           'warm_start': [True], 
           'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
           'min_samples_leaf': [1, 2, 4],
           'min_samples_split': [2, 5, 10],
           'max_features': ['auto', 'sqrt']}

# Extra Trees
et_grid = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
           'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
           'min_samples_leaf': [1, 2, 4],
           'min_samples_split': [2, 5, 10],
           'max_features': ['auto', 'sqrt']}

# AdaBoost
ab_grid = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
           'learning_rate': [x for x in np.arange(0.5, 1.5, 0.1)],
           }

# GradientBoost
gb_grid = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
           'learning_rate': [x for x in np.arange(0.5, 1.5, 0.1)], 
           'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
           'min_samples_leaf': [1, 2, 4],
           'min_samples_split': [2, 5, 10],
           'max_features': ['auto', 'sqrt']}

## 2. Tuning parameters

**Let's get an idea of the parameters we want to use.**
<br>As I don't have any idea of the range I should use to test each parameter, I am first going to test randomly 100 combinations of parameters, and save the best one.
<br>Inspiration: [Hyperparameter Tuning The Radom Forest in Python](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74), by William Koehrsen

In [14]:
# Defining a function that randomly searches best parameters across
# 100 different combinations, using 3 fold cross validation.

def get_param_with_randomsearch(model, param_grid, X, y):
    
    # 1. Fixing parameters
    
    print("RandomizedSearchCV is going to test the following parameters:")
    print(param_grid)
    
    random_model = RandomizedSearchCV( estimator = model, 
                                       param_distributions = param_grid,
                                       n_iter = 100,
                                       cv = 3,
                                       random_state = 42,
                                       n_jobs = -1,
                                       scoring = "accuracy")
    
    
    # 2. Fitting the random search model
    
    random_model.fit(X,y)

    print("")
    print("The best combination of parameters is:")
    print(random_model.best_params_)
    print("")
    print("The best combination of parameters we've found generated an accuracy score of {:0.2f}%." .format(random_model.best_score_*100))
    
    return random_model

**Now that we have a clearer idea of the kind of parameters we need, let's tune them more precisely using GridSearchCV.**

In [15]:
# Defining a function that 
#  1. Creates a grid of parameters to test, based on the best parameter set found with get_param_with_randomsearch.
#  2. Find the optimal combination of parameters in this grid.

def get_param_with_gridsearchcv(model, shortcut, random_param_grid, X, y):
    
    # 1. Building param grid based on the result of the function get_param_with_randomsearch

    param_grid = {}
    
    stage = "go"
    
    model_str = str(model)
    
    if shortcut == "rf":
        param_grid["n_estimators"] = [random_param_grid["n_estimators"]]
        param_grid["warm_start"] = [random_param_grid["warm_start"]]
        param_grid["max_depth"] = list(set([x for x in range(random_param_grid["max_depth"],random_param_grid["max_depth"]+6,3)] + [x for x in range(random_param_grid["max_depth"],random_param_grid["max_depth"]-6,-3)]))
        param_grid["min_samples_leaf"] = list(set([x for x in range(random_param_grid["min_samples_leaf"], random_param_grid["min_samples_leaf"]+3,1)] + [x for x in range(random_param_grid["min_samples_leaf"], max(0,random_param_grid["min_samples_leaf"]-3),-1)]))
        param_grid["min_samples_split"] = list(set([x for x in range(random_param_grid["min_samples_split"], random_param_grid["min_samples_split"]+4, 2)] + [x for x in range(random_param_grid["min_samples_split"], max(1,random_param_grid["min_samples_split"]-4), -2)]))
        param_grid["max_features"] = [random_param_grid["max_features"]]
    

    elif shortcut == "et":
        param_grid["n_estimators"] = [random_param_grid["n_estimators"]]
        param_grid["max_depth"] = list(set([x for x in range(random_param_grid["max_depth"],random_param_grid["max_depth"]+6,3)] + [x for x in range(random_param_grid["max_depth"],random_param_grid["max_depth"]-6,-3)]))
        param_grid["min_samples_leaf"] = list(set([x for x in range(random_param_grid["min_samples_leaf"], random_param_grid["min_samples_leaf"]+3,1)] + [x for x in range(random_param_grid["min_samples_leaf"], max(0,random_param_grid["min_samples_leaf"]-3),-1)]))
        param_grid["max_features"] = [random_param_grid["max_features"]]
        param_grid["min_samples_split"] = list(set([x for x in range(random_param_grid["min_samples_split"], random_param_grid["min_samples_split"]+4, 2)] + [x for x in range(random_param_grid["min_samples_split"], max(1,random_param_grid["min_samples_split"]-4), -2)]))
    
    elif shortcut == "ab":
        param_grid["n_estimators"] = [random_param_grid["n_estimators"]]
        param_grid["learning_rate"] = list(set([x for x in np.arange(random_param_grid["learning_rate"],max(0,random_param_grid["learning_rate"]-0.5),-0.02)]+[x for x in np.arange(random_param_grid["learning_rate"],max(0,random_param_grid["learning_rate"]+0.5),0.02)]))

    elif shortcut == "gb":
        param_grid["n_estimators"] = [random_param_grid["n_estimators"]]
        param_grid["max_depth"] = list(set([x for x in range(random_param_grid["max_depth"],random_param_grid["max_depth"]+4,3)] + [x for x in range(random_param_grid["max_depth"],random_param_grid["max_depth"]-4,-3)]))
        param_grid["min_samples_leaf"] = list(set([x for x in range(random_param_grid["min_samples_leaf"], random_param_grid["min_samples_leaf"]+2,1)] + [x for x in range(random_param_grid["min_samples_leaf"], max(0,random_param_grid["min_samples_leaf"]-2),-1)]))
        param_grid["min_samples_split"] = list(set([x for x in range(random_param_grid["min_samples_split"], random_param_grid["min_samples_split"]+4, 2)] + [x for x in range(random_param_grid["min_samples_split"], max(1,random_param_grid["min_samples_split"]-4), -2)]))
        param_grid["max_features"] = [random_param_grid["max_features"]]
        param_grid["learning_rate"] = list(set([x for x in np.arange(random_param_grid["learning_rate"],max(0,random_param_grid["learning_rate"]-0.5),-0.02)]+[x for x in np.arange(random_param_grid["learning_rate"],max(0,random_param_grid["learning_rate"]+0.5),0.02)]))


    else:
        print("The shortcut is not recognized. Possible values are:")
        print("- rf for RandomForestClassifier()")
        print("- et for ExtraTreesClassifier()")
        print("- ab for AdaBoostClassifier()")
        print("- gb for GradientBoostingClassifier()")
        stage = "stop"
    
    # 2. Instantiating the grid search model
    
    if stage == "go":
        
        print("GridSearchCV is going to test the following parameters:")
        print(param_grid)
        
        grid_search = GridSearchCV(estimator = model, 
                                   param_grid = param_grid, 
                                   cv = 3, 
                                   n_jobs = -1, 
                                   verbose = 0, 
                                   scoring = "accuracy")


    # 3. Fitting the grid search to the data
    
        grid_search.fit(X, y)
    
        print("")
        print("The best combination of parameters is:")
        print(grid_search.best_params_)
        print("")
        print("With these parameters, we reach an accuracy score of {:0.2f}%." .format(grid_search.best_score_*100))
    
        return grid_search

**Last, let's create a function that imbricates both functions above**.

In [16]:
# Defining a function that concatenates the two above

def get_params(model, shortcut, param_grid, X, y):

    # 1. Find best parameters using RandomizedSearchCV
    
    randomized_model = get_param_with_randomsearch(model, param_grid, X, y)
    
    param_grid = randomized_model.best_params_
    
    
    # 2. Tune best parameters using GridSearchCV
    
    grid_model = get_param_with_gridsearchcv(model, shortcut, param_grid, X, y) 
    
    return grid_model

**Let's now apply this function to each model.**

In [None]:
# Random Forest
rf_model = get_params(RandomForestClassifier(), "rf", rf_grid, X, y)

# Extra Trees
et_model = get_params(ExtraTreesClassifier(), "et", et_grid, X, y)

# AdaBoost
ab_model = get_params(AdaBoostClassifier(), "ab", ab_grid, X, y)

# GradientBoost
gb_model = get_params(GradientBoostingClassifier(), "gb", gb_grid, X, y)

In [13]:
# For saving time ...
# ... storing best parameters

rf_params = {'warm_start': True,
             'verbose': 0,
             'n_estimators': 600,
             'min_samples_split': 8,
             'min_samples_leaf': 4,
             'max_features': 'sqrt',
             'max_depth': 53}

et_params = {'verbose': 0,
             'n_jobs': -1,
             'n_estimators': 800,
             'min_samples_split': 8,
             'min_samples_leaf': 3,
             'max_features': 'sqrt',
             'max_depth': 97}

ab_params = {'n_estimators': 200,
             'learning_rate': 0.019999999999999574}

gb_params = {'n_estimators' : 1000,
             'learning_rate': 0.8600000000000002,
             'max_depth': 27,
             'min_samples_leaf':1,
             'min_samples_split': 10,
             'max_features': 'sqrt'}


# ... fitting models
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X,y)

et_model = RandomForestClassifier(**et_params)
et_model.fit(X,y)

ab_model = AdaBoostClassifier(**ab_params,algorithm='SAMME')
ab_model.fit(X,y)

gb_model = GradientBoostingClassifier(**gb_params)
gb_model.fit(X,y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.8600000000000002, loss='deviance',
              max_depth=27, max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

## 3. Predicting

In [14]:
predictions_train = pd.DataFrame(
                        {'PassengerId': train.PassengerId.tolist(),
                         'rf_survived': rf_model.predict(X).tolist(),
                         'et_survived': et_model.predict(X).tolist(),
                         'ab_survived': ab_model.predict(X).tolist(),
                         'gb_survived': gb_model.predict(X).tolist()
                        })

predictions_test = pd.DataFrame(
                        {'PassengerId': test.PassengerId.tolist(),
                         'rf_survived': rf_model.predict(test).tolist(),
                         'et_survived': et_model.predict(test).tolist(),
                         'ab_survived': ab_model.predict(test).tolist(),
                         'gb_survived': gb_model.predict(test).tolist()
                        })

In [15]:
# Adding the new columns to train and test df
train = train.merge(predictions_train, how="left", on="PassengerId")

test = test.merge(predictions_test, how="left", on="PassengerId")

In [16]:
X = train.drop("Survived", axis = 1)
y = train.Survived

# Second-Level Predictions from the First-level Output

In [17]:
xgb_grid = {'nthread':[4],
              'objective':['binary:logistic'],
              'learning_rate': [0.05],
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [1000],
              'seed': [1337]}


xgb_model = GridSearchCV(xgboost.XGBClassifier(), xgb_grid, n_jobs=5, 
                   cv=StratifiedKFold(y, n_folds=5, shuffle=True), 
                   scoring='accuracy',
                   verbose=0, refit=True)

xgb_model.fit(X, y)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0. 1. ... 1. 0.], n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=5,
       param_grid={'nthread': [4], 'objective': ['binary:logistic'], 'learning_rate': [0.05], 'max_depth': [6], 'min_child_weight': [11], 'silent': [1], 'subsample': [0.8], 'colsample_bytree': [0.7], 'n_estimators': [1000], 'seed': [1337]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [50]:
predictions = xgb_model.predict(test).astype(int)

  if diff:


# Formatting before uploading on Kaggle

In [52]:
submission = pd.DataFrame.from_csv('gender_submission.csv')
submission['Survived'] = predictions
print(submission.head())

             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0


In [54]:
submission.to_csv('submission.csv', index=True)

In [45]:
submission_2 = test[["PassengerId", "ab_survived"]].astype(int)
submission_2.columns = ["PassengerId", "Survived"]

0 PassengerId
892    0.0
Name: Survived, dtype: float64


In [None]:
submission_2 = pd.DataFrame.from_csv('gender_submission.csv')
submission_2.Survived= test.ab_survived.astype(int).as_matrix()

In [None]:
submission_2.to_csv('submission2.csv', index=True)