In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier,VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import xgboost as xgb
import lightgbm as lgb

from scipy import stats
from scipy.stats import norm, skew #for some statistics
sns.set(style='white', context='notebook', palette='deep')

In [None]:
# Load data
##### Load train and Test set

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
IDtest = test["PassengerId"]

In [None]:
train.head()

In [None]:
train.shape

In [None]:
test.shape

## Data Explanation

* Data Dictionary

**Variable	Definition	Key**
1. survival	Survival	0 = No, 1 = Yes
2. pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
3. sex	Sex	
4. Age	Age in years	
5. sibsp	# of siblings / spouses aboard the Titanic	
6. parch	# of parents / children aboard the Titanic	
7. ticket	Ticket number	
8. fare	Passenger fare	
9. cabin	Cabin number	
10. embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

**Variable Notes**
* pclass: A proxy for socio-economic status (SES)
  1st = Upper
  2nd = Middle
  3rd = Lower

* age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

* sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

* parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

## Outliers detection 

In [None]:
# Outlier detection 

def detect_outliers(df,n,features):
    """
    Return the index labels of rows that are Tukey outliers in more than n features.

    For every column in `features`, a value counts as an outlier when it lies
    below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.  Quartiles are computed with
    np.nanpercentile, so missing values in a column are ignored instead of
    turning both fences into NaN (which would disable the check for that column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        A row is returned when it is an outlier in strictly more than n columns.
    features : iterable of str
        Names of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of `df` (not positions), in order of first detection.
    """
    outlier_indices = []

    # iterate over features(columns)
    for col in features:
        # NaN-aware 1st and 3rd quartiles
        Q1 = np.nanpercentile(df[col], 25)
        Q3 = np.nanpercentile(df[col], 75)
        # Interquartile range (IQR) and the Tukey fence width
        IQR = Q3 - Q1
        outlier_step = 1.5 * IQR

        # Comparisons against NaN are False, so rows with a missing value in
        # this column are never flagged for it.
        is_outlier = (df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)
        outlier_indices.extend(df[is_outlier].index)

    # keep the rows flagged in more than n columns
    outlier_counts = Counter(outlier_indices)
    multiple_outliers = [label for label, count in outlier_counts.items() if count > n]

    return multiple_outliers

# detect outliers from Age, SibSp , Parch and Fare
Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])

In [None]:
# Outliers_to_drop holds index labels (it is passed to train.drop below),
# so select the rows by label rather than by position.
outliers = train.loc[Outliers_to_drop]
outliers

In [None]:
# remove  outliers

train = train.drop(Outliers_to_drop, axis=0).reset_index(drop=True)

## Check for missing values

In [None]:
## Join train and test datasets in order to obtain the same number of features during categorical conversion
train_len = len(train)
dataset =  pd.concat(objs=[train, test], axis=0).reset_index(drop=True)

In [None]:
# fill missing values with nan
dataset = dataset.fillna(np.nan)

# check number of nan
dataset.isnull().sum()

* Age and Cabin features have an important part of missing values.

* Survived missing values correspond to the join testing dataset (Survived column doesn't exist in test set) 

In [None]:
train.dtypes

In [None]:
### Summarize data
# descriptive statistics
train.describe()

## Feature Analysis

### 1. Numerical features

In [None]:
# Correlations of numerical features with survival
plt.figure(figsize=(10,8))
g = sns.heatmap(train[["Survived","SibSp","Parch","Age","Fare"]].corr(), annot=True, fmt=".2f")

* only fare seems to have a more significant correlation with survival.  
* We need to explore the relationship of other features with survival in more details.

In [None]:
# Explore SibSp feature vs Survived
g = sns.catplot(x="SibSp",y="Survived", data=train, kind="bar", height=6, palette = "muted")
g = g.set_ylabels("Survival Probability")

In [None]:
# Explore Parch vs Survived
g = sns.catplot(x="Parch", y="Survived", data=train, kind="bar", height=6, palette="muted")
g = g.set_ylabels("Survival Probability")

Small families have more chance to survive, more than single (Parch 0), medium (Parch 3,4) and large families (Parch 5,6 ).

Be careful: there is a large standard deviation in the survival of passengers with 3 parents/children.

In [None]:
# Explore Age vs Survived
# FacetGrid creates its own figure, so no plt.figure() call is needed here;
# `height` is the current name of the deprecated `size` argument.
g = sns.FacetGrid(train, col='Survived', height=10)
g = g.map(sns.distplot, "Age")

The Age distribution looks roughly bell-shaped (close to Gaussian), although somewhat skewed.

We notice that age distributions are not the same in the survived and not survived subpopulations. Indeed, there is a peak corresponding to young passengers, that have survived. We also see that passengers between 60-80 have less survived.

So, even if "Age" is not correlated with "Survived", we can see that there are age categories of passengers that have more or less chance to survive.

It seems that very young passengers have more chance to survive.

In [None]:
# superimpose the two distributions of survived and not survived 
# Explore Age distibution 
plt.figure(figsize=(10,8))
g = sns.kdeplot(train["Age"][(train["Survived"] == 0) & (train["Age"].notnull())], color="Red", shade = True)
g = sns.kdeplot(train["Age"][(train["Survived"] == 1) & (train["Age"].notnull())], ax=g, color="Blue", shade= True)
g.set_xlabel("Age")
g.set_ylabel("Frequency")
g = g.legend(["Not Survived","Survived"])

In survived distribution,there is a peak on young children from 0 to 5 years old.

In [None]:
# Explore Fare distribution 
plt.figure(figsize=(10,8))
# The legend reports the skewness of the series that is actually plotted (train).
g = sns.distplot(train["Fare"], color="m", label="Skewness : %.2f"%(train["Fare"].skew()))
g = g.legend(loc="best")

From the picture, we see that Fare is highly skewed, this can lead to overweight on high values of fare in the model.
We transform it by log(1+x) to remove the skewness.

In [None]:
# Apply log(1 + x) to Fare in both frames: `train` feeds the plots below, while
# `dataset` (train + test) is the frame the model features are later cut from.
# NaN fares stay NaN and are imputed further down.
train["Fare"] = np.log1p(train["Fare"])
dataset["Fare"] = np.log1p(dataset["Fare"])

In [None]:
plt.figure(figsize=(10,8))
g = sns.distplot(train["Fare"], color="b", label="Skewness : %.2f"%(train["Fare"].skew()))
g = g.legend(loc="best")

Skewness of Fare is significantly reduced after the transformation.

### 2. Categorical features

In [None]:
# Sex vs survival
g = sns.barplot(x="Sex", y="Survived", data=train)
g = g.set_ylabel("Survival Probability")

In [None]:
train[["Sex", "Survived"]].groupby('Sex').mean()

It is clear that male has less chance of being survived than female. 
Thus, sex is an important feature in predicting survival.

In [None]:
plt.figure(figsize=(10,8))
g = sns.barplot(x="Pclass", y="Survived", data=train)
g = g.set_ylabel("Survival Probability")

Pclass of 1 has the highest chance of survival. Pclass of 3 has the lowest chance. 

In [None]:
plt.figure(figsize=(10,8))
g = sns.barplot(x="Pclass", y="Survived", hue="Sex", data=train)
g = g.set_ylabel("Survival Probability")

Throughout all three classes, female has much high chance to survive than male.

In [None]:
# Embarked vs Survival
plt.figure(figsize=(10,8))
g = sns.barplot(x="Embarked", y="Survived", data=train)
g = g.set_ylabel("Survival Probability")

It seems that passenger coming from Cherbourg (C) have more chance to survive.

My hypothesis is that the proportion of first class passengers is higher for those who came from Cherbourg (C) than Queenstown (Q), Southampton (S).

Let's see the Pclass distribution vs Embarked

In [None]:
# catplot (the current name of factorplot) is figure-level and creates its own
# figure, so no plt.figure() call is needed; the column is passed by keyword.
g = sns.catplot(x="Pclass", col="Embarked", kind="count", data=train, palette="muted")
g.despine(left=True)

The third class is the most frequent for passenger coming from Southampton (S) and Queenstown (Q), 
whereas Cherbourg (C) passengers are mostly in first class which have the highest survival rate.

## Filling missing values

### 1. Age

In [None]:
dataset.isnull().sum()

Age has 256 missing values in the whole dataset. Since age is an important feature for predicting survival in subpopulations, we need to analyze the correlation between age and other features in detail.

In [None]:
# Explore Age vs Sex, Parch , Pclass and SibSP
g = sns.catplot(y="Age",x="Sex",data=dataset,kind="box")
g = sns.catplot(y="Age",x="Sex",hue="Pclass", data=dataset,kind="box")
g = sns.catplot(y="Age",x="Parch", data=dataset,kind="box")
g = sns.catplot(y="Age",x="SibSp", data=dataset,kind="box")

Age distribution seems to be the same in Male and Female subpopulations, so Sex is not informative to predict Age.

However, 1rst class passengers are older than 2nd class passengers who are also older than 3rd class passengers.

Moreover, the more a passenger has parents/children the older he is and the more a passenger has siblings/spouses the younger he is.

In [None]:
# convert Sex to categorical variable with male 0 and female 1
dataset['Sex'] = dataset['Sex'].map({'male':0, "female":1})

In [None]:
dataset.head()

In [None]:
plt.figure(figsize=(8,6))
g = sns.heatmap(dataset[["Age","Sex","SibSp","Parch","Pclass"]].corr(), cmap="RdBu_r", annot=True)

The correlation map confirms the factorplots observations except for Parch. Age is not correlated with Sex, but is negatively correlated with Pclass, Parch and SibSp.

In the plot of Age in function of Parch, Age is growing with the number of parents / children. But the general correlation is negative.

So, i decided to use SibSP, Parch and Pclass in order to impute the missing ages.

The strategy is to fill Age with the median age of similar rows according to Pclass, Parch and SibSp.

In [None]:
# Filling missing value of Age 

## Fill Age with the median age of similar rows according to Pclass, Parch and SibSp
# Index labels of the rows whose Age is NaN
index_NaN_age = list(dataset.index[dataset["Age"].isnull()])

for i in index_NaN_age:
    # Fallback value; recomputed inside the loop, as before, so that values
    # imputed on earlier iterations are included.
    age_med = dataset["Age"].median()
    # Rows sharing SibSp, Parch and Pclass with row i
    similar_rows = ((dataset["SibSp"] == dataset.loc[i, "SibSp"]) &
                    (dataset["Parch"] == dataset.loc[i, "Parch"]) &
                    (dataset["Pclass"] == dataset.loc[i, "Pclass"]))
    age_pred = dataset.loc[similar_rows, "Age"].median()
    # A single .loc assignment writes into `dataset` itself, whereas
    # dataset['Age'].iloc[i] = ... is chained indexing and may write to a copy.
    dataset.loc[i, "Age"] = age_med if np.isnan(age_pred) else age_pred

In [None]:
dataset['Age'].isnull().sum()

In [None]:
g = sns.catplot(x="Survived", y = "Age",data = train, kind="box")
g = sns.catplot(x="Survived", y = "Age",data = train, kind="violin")

### 2. Fare

In [None]:
dataset["Fare"].isnull().sum()

Since there is only one missing value in Fare, a simple fill is enough; the cells below look at how Fare relates to Pclass in order to choose the fill value.

In [None]:
dataset['Fare'].describe()

In [None]:
train[['Fare','Pclass']].groupby('Pclass').mean()

In [None]:
g = sns.catplot(y='Fare', x="Pclass", data=train, kind='box')

From the boxplot, we can see that Fare is highly related to the passenger class. 
So we will impute the missing fare with the mean fare of the class the passenger belongs to.

In [None]:
dataset['Fare'][dataset['Fare'].isnull()].index

In [None]:
# Look up the class of the passenger(s) with a missing Fare without hard-coding
# a row label, which shifts whenever the number of dropped outliers changes.
dataset.loc[dataset['Fare'].isnull(), 'Pclass']

In [None]:
# Mean Fare per class, computed on the training rows of `dataset` so the fill
# value is on the same scale as the column being filled (train["Fare"] was
# log-transformed above). Each missing Fare gets the mean of its own Pclass.
class_mean_fare = dataset.iloc[:train_len].groupby('Pclass')['Fare'].mean()
dataset['Fare'] = dataset['Fare'].fillna(dataset['Pclass'].map(class_mean_fare))

In [None]:
dataset['Fare'].isnull().sum()

### 3. Embarked

In [None]:
# Check the data with missing Embarked

embarked_nan_idx = dataset['Embarked'][dataset['Embarked'].isnull()].index

In [None]:
# embarked_nan_idx contains index labels, so select by label
dataset.loc[embarked_nan_idx]

In [None]:
dataset["Embarked"].value_counts()

Since most people boarded from port Southampton, we impute the missing values of Embarked by S

In [None]:
dataset['Embarked'] = dataset["Embarked"].fillna("S")

### 4. Cabin

In [None]:
dataset['Cabin'].describe()

In [None]:
dataset['Cabin'].isnull().sum()

The Cabin feature column contains 292 values and 1007 missing values.

I supposed that passengers without a cabin have a missing value displayed instead of the cabin number.

In [None]:
# Keep only the first character of each Cabin value; missing cabins become "X".
dataset['Cabin'] = dataset['Cabin'].str[0].fillna('X')

In [None]:
plt.figure(figsize=(8,6))
# Pass the column by keyword: positional data arguments are rejected by newer seaborn.
g = sns.countplot(x='Cabin', data=dataset, order = dataset['Cabin'].value_counts().index)

In [None]:
# catplot is the current name of factorplot (already used earlier in this notebook)
g = sns.catplot(y="Survived",x="Cabin",data=dataset,kind="bar")
g = g.set_ylabels("Survival Probability")

We can see that passengers with a cabin have generally more chance to survive than passengers without (X).

It is particularly true for cabin B, C, D, E and F.

In [None]:
dataset = pd.get_dummies(dataset, columns=["Cabin"], prefix = "Cabin")

In [None]:
dataset.head()

## Feature Engineering

### 1. Name/Title

In [None]:
dataset['Name'].head()

In [None]:
# The title is the text between the first comma and the following period of Name.
title = dataset['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()
dataset['Title'] = title
dataset['Title'].head()

In [None]:
dataset['Title'].value_counts()

In [None]:
plt.figure(figsize=(8,6))
g = sns.countplot(x="Title", data=dataset)
g = plt.setp(g.get_xticklabels(), rotation=45)

In [None]:
# x is passed by keyword and `height` replaces the deprecated `size` argument,
# matching the other catplot calls in this notebook.
g = sns.catplot(x="Title", col="Survived", data=dataset, kind='count', height=10)
g = plt.xticks(rotation=45)

There are 18 titles in the dataset; most of them are very rare, and we can group them into 4 categories.

In [None]:
# Convert to categorical values Title 
dataset["Title"] = dataset["Title"].replace(['Lady', 'the Countess','Capt', 
                                             'Col','Don', 'Dr', 'Major', 
                                             'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
dataset["Title"] = dataset["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
dataset["Title"] = dataset["Title"].astype(int)

In [None]:
# Column passed by keyword (positional data arguments are rejected by newer seaborn)
g = sns.countplot(x='Title', data=dataset)
g = g.set_xticklabels(["Master","Miss-Mrs","Mr","Rare"])

In [None]:
g = sns.catplot(x="Title", y='Survived', data=dataset, kind='bar')
g = g.set_xticklabels(["Master","Miss-Mrs","Mr","Rare"])
g = g.set_ylabels("Survival Probability")

In [None]:
# Drop Name feature
dataset.drop(labels=["Name"], axis=1, inplace=True)


### 2. Family size

We can imagine that large families will have more difficulty evacuating, as they look for their sisters/brothers/parents during the evacuation. So, I chose to create an "Fsize" (family size) feature, which is the sum of SibSp, Parch and 1 (including the passenger).

In [None]:
# Create Family Size feature 
dataset['Fsize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [None]:
# catplot replaces factorplot; kind="point" keeps the point plot that
# factorplot drew by default (catplot would otherwise draw a strip plot).
g = sns.catplot(x="Fsize",y="Survived",data = dataset, kind="point")
g = g.set_ylabels("Survival Probability")

In [None]:
# One 0/1 indicator per family-size band: 1, 2, 3-4, and 5 or more members.
family_size = dataset['Fsize']
dataset['Single'] = (family_size == 1).astype('int64')
dataset['SmallF'] = (family_size == 2).astype('int64')
dataset['MedF'] = ((family_size >= 3) & (family_size <= 4)).astype('int64')
dataset['LargeF'] = (family_size >= 5).astype('int64')

In [None]:
# One survival bar plot per family-size indicator (catplot replaces factorplot).
for family_flag in ["Single", "SmallF", "MedF", "LargeF"]:
    g = sns.catplot(x=family_flag, y="Survived", data=dataset, kind="bar")
    g = g.set_ylabels("Survival Probability")

Factorplots of family size categories show that Small and Medium families have more chance to survive than single passenger and large families.

In [None]:
dataset.head()

In [None]:
# convert Title to dummy variables
dataset = pd.get_dummies(dataset, columns = ["Title"])
dataset = pd.get_dummies(dataset, columns = ["Embarked"])

In [None]:
dataset.head()

### 3. Ticket

In [None]:
dataset['Ticket'].head()

It could mean that tickets sharing the same prefixes could be booked for cabins placed together. It could therefore lead to the actual placement of the cabins within the ship.

Tickets with same prefixes may have a similar class and survival.

So i decided to replace the Ticket feature column by the ticket prefix. Which may be more informative.

In [None]:
dataset['Ticket'][2].replace("/","").replace(".","").strip().split(' ')[0]

In [None]:
# Reduce each ticket to its prefix: dots and slashes are removed and the first
# space-separated token is kept; purely numeric tickets are replaced by "X".
Ticket = [
    "X" if raw.isdigit() else raw.replace(".","").replace("/","").strip().split(' ')[0]
    for raw in list(dataset.Ticket)
]
dataset['Ticket'] = Ticket
dataset['Ticket'].head()

In [None]:
dataset['Ticket'].unique()

In [None]:
dataset = pd.get_dummies(dataset, columns=['Ticket'], prefix='T')

In [None]:
dataset.head()

In [None]:
# Create categorical values for Pclass
dataset['Pclass'] = dataset['Pclass'].astype("category")
dataset = pd.get_dummies(dataset, columns=['Pclass'], prefix='Pc')

In [None]:
# Drop Passenger ID
dataset.drop(labels=["PassengerId"], axis=1, inplace=True)

In [None]:
dataset.head()

## Overfitting prevention

In [None]:
# To prevent overfitting, dummy varaibles with more than 99% 1 or 0 will be removed.

#overfit = []
#for i in dataset.columns:
#    counts = dataset[i].value_counts()
#    zeros = counts.iloc[0]
#    if zeros / len(dataset) * 100 > 99:
#        overfit.append(i)

In [None]:
#overfit

In [None]:
#dataset.drop(overfit, axis=1, inplace=True)
#dataset.head()

## Skewed features

In [None]:
#data = dataset.drop(labels = ['Survived'], axis=1)

#numeric_feats = data.dtypes[data.dtypes != "object"].index

# Check the skew of all numerical features
#skewed_feats = data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
#print("\nSkew in numerical features: \n")
#skewness = pd.DataFrame({'Skew' :skewed_feats})
#skewness.head


In [None]:
#skewness.index

In [None]:
#skewness = skewness[abs(skewness.Skew) > 0.75]
#print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

#from scipy.special import boxcox1p
#skewed_features = skewness.index
#lam = 0.15
#for feat in skewed_features:
#    dataset[feat] = boxcox1p(dataset[feat], lam)

## Modelling 

In [None]:
# separate dataset into train and test sets

# .copy() makes `train` an independent frame, so later column assignments on it
# do not operate on a slice of `dataset` (SettingWithCopyWarning).
train = dataset.iloc[:train_len].copy()
# Survived is all-NaN for the test rows; drop it without mutating a slice in place.
test = dataset.iloc[train_len:].drop(columns=["Survived"])

In [None]:
train["Survived"] = train["Survived"].astype(int)

Y_train = train["Survived"]

X_train = train.drop(labels = ["Survived"],axis = 1)

### 1. Simple models 

Compare 12 popular classifiers and evaluate the mean accuracy of each of them by a stratified kfold cross validation procedure.

1. SVC
2. Decision Tree
3. AdaBoost
4. Random Forest
5. Extra Trees
6. Gradient Boosting
7. Multi-layer perceptron (neural network)
8. KNN
9. Logistic regression
10. Linear Discriminant Analysis
11. XGBoost
12. LightGBM

In [None]:
# Cross validate model with Kfold stratified cross val
kfold = StratifiedKFold(n_splits=10)

In [None]:
random_state = 13
classifiers = []
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(AdaBoostClassifier(random_state=random_state))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state=random_state))
classifiers.append(LinearDiscriminantAnalysis())
classifiers.append(xgb.XGBClassifier(random_state=random_state))
classifiers.append(lgb.LGBMClassifier(random_state=random_state))



In [None]:
# Accuracy of every baseline classifier on each stratified fold
cv_results = [
    cross_val_score(model, X_train, y=Y_train, cv=kfold, scoring='accuracy', n_jobs=4)
    for model in classifiers
]

# Per-classifier mean and standard deviation across folds
cv_mean = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]


In [None]:
# One row per classifier, in the same order as the `classifiers` list
cv_res = pd.DataFrame({"CrossValMeans":cv_mean,"CrossValerrors": cv_std,"Algorithm":["SVC","DecisionTree","AdaBoost",
"RandomForest","ExtraTrees","GradientBoosting","MultipleLayerPerceptron","KNeighboors",
"LogisticRegression","LinearDiscriminantAnalysis", "XGBoost", "LightGBM"]})

plt.figure(figsize=(10,8))
# x/y are passed by keyword (positional data arguments are rejected by newer
# seaborn); xerr draws the fold-to-fold standard deviation as error bars.
g = sns.barplot(x="CrossValMeans", y="Algorithm", data=cv_res, palette="Set3", orient="h", xerr=cv_std)
g.set_xlabel("Mean Accuracy")
g = g.set_title("Cross validation scores")

In [None]:
cv_mean

From the cv scores, we choose Gradient Boosting, Logistic Regression, AdaBoost, LDA, and RandomForest classifiers for ensemble modeling.

### 2. Hyperparameters tuning for best models

Perform a grid search optimization for Gradient Boosting, XGBoost, LightGBM, AdaBoost, Random Forest and SVC classifiers (the AdaBoost and SVC searches are currently commented out).

In [None]:
train.shape

In [None]:
# Gradient boosting tunning

# Seeded like the baseline classifiers so the subsampled fits are repeatable.
GBC = GradientBoostingClassifier(random_state=random_state)
# 'loss' is left at the estimator default: the former ["deviance"] entry only
# named that default, and newer scikit-learn releases no longer accept the string.
gb_param_grid = {
              'n_estimators' : [50], #[900],
              'learning_rate': [0.1],
              'max_depth': [4],
              'min_samples_split': [60], #range(30,120,10), #[60], #range(20,120,20),
              'min_samples_leaf': [10], #range(10, 100, 10), #[50], #[0.1, 0.2, 0.3, 0.4, 0.5],
              'max_features': ['sqrt'], #range(5,20,2) 
              'subsample': [0.8]  
              }

# Every list holds a single value, so this cross-validates one configuration.
gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

gsGBC.fit(X_train,Y_train)

GBC_best = gsGBC.best_estimator_

# Best score
gsGBC.best_score_

In [None]:
gsGBC.best_score_

In [None]:
GBC_best

In [None]:
# XGBoost tunning

XGB = xgb.XGBClassifier()
# Every list holds a single value, so the "search" below cross-validates
# exactly one configuration with the shared `kfold` splitter.
xgb_param_grid = {"colsample_bytree": [0.6], 
                  "gamma": [0],
                  "learning_rate": [0.01], 
                  "max_depth": [3], 
                  "min_child_weight": [1], 
                  "n_estimators": [150],
                  "reg_alpha":[0], 
                  "reg_lambda": [1],
                  "subsample": [0.75]
                 }
                                 
gsXGB = GridSearchCV(XGB, param_grid=xgb_param_grid, cv = kfold, scoring="accuracy", n_jobs=4, verbose = 1)

gsXGB.fit(X_train, Y_train)

# Estimator refitted with the selected parameters
XGB_best = gsXGB.best_estimator_

# Best score
gsXGB.best_score_

In [None]:
gsXGB.best_score_

In [None]:
XGB_best

In [None]:
# LightGBM tunning

LGB = lgb.LGBMClassifier()
# Every list holds a single value, so the "search" below cross-validates
# exactly one configuration with the shared `kfold` splitter.
# NOTE(review): several keys look like native LightGBM parameter names rather
# than the scikit-learn wrapper's own names — confirm they are accepted as aliases.
lgb_param_grid = {"max_depth": [100],
                  "num_iterations": [100], #[100]
                  "num_leaves": [10],
                  "min_data_in_leaf": [20],
                  "feature_fraction": [0.5], 
                  "learning_rate": [0.1],  #[0.1]
                  "bagging_fraction": [0.8],
                  "bagging_freq": [5],
                  }

gsLGB = GridSearchCV(LGB, param_grid=lgb_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsLGB.fit(X_train, Y_train)

# Estimator refitted with the selected parameters
LGB_best = gsLGB.best_estimator_

# Best score
gsLGB.best_score_

In [None]:
gsLGB.best_score_

In [None]:
LGB_best

In [None]:
# Adaboost
#DTC = DecisionTreeClassifier()

#adaDTC = AdaBoostClassifier(DTC, random_state=7)

#ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
#              "base_estimator__splitter" :   ["best", "random"],
#              "algorithm" : ["SAMME","SAMME.R"],
#              "n_estimators" :[1,2,3],
#              "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]
#                 }

#gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

#gsadaDTC.fit(X_train,Y_train)

#ada_best = gsadaDTC.best_estimator_

# Best score
#gsadaDTC.best_score_

In [None]:
#gsadaDTC.best_score_

In [None]:
# ada_best

In [None]:
# RFC Parameters tunning 
# Seeded like the baseline classifiers: RFC_best produces the final submission,
# so the search has to return the same forest on every run.
RFC = RandomForestClassifier(random_state=random_state)


## Search grid for optimal parameters
rf_param_grid = {"max_depth": list(range(9, 14)),
                 "min_samples_split": list(range(4, 11)),
                 "min_samples_leaf": list(range(2, 5)),
                 "n_estimators": list(range(10, 60, 10))}


gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

gsRFC.fit(X_train,Y_train)

# Estimator refitted with the best parameter combination
RFC_best = gsRFC.best_estimator_

# Best score
gsRFC.best_score_

In [None]:
gsRFC.best_score_

In [None]:
#RFC_best

In [None]:
### SVMC classifier
#SVMC = SVC(probability=True)
#svc_param_grid = {'kernel': ['rbf'], 
#                  'gamma': [0.01], #[0.001, 0.01, 0.1, 1],
#                  'C': [50] #[1, 10, 50, 100, 200, 300, 1000]
#                 }

#gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

#gsSVMC.fit(X_train,Y_train)

#SVMC_best = gsSVMC.best_estimator_

# Best score
#gsSVMC.best_score_

In [None]:
#gsSVMC.best_score_

In [None]:
#SVMC_best

### Plot learning curves

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw training and cross-validation score curves for `estimator`.

    A new figure is opened, `learning_curve` is evaluated for the requested
    `train_sizes`, and for both the training and the validation scores the
    mean across folds is plotted as a line with a +/- one standard deviation
    band around it.

    Parameters: `estimator`, `X`, `y`, `cv`, `n_jobs` and `train_sizes` are
    forwarded to `learning_curve`; `title` is the figure title; `ylim`, when
    given, is unpacked into `plt.ylim`.

    Returns the `matplotlib.pyplot` module.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    plt.grid()

    # (scores per fold, colour, legend label) for each curve
    curves = (
        (fit_scores, "r", "Training score"),
        (cv_scores, "g", "Cross-validation score"),
    )

    # Shaded bands first, then the mean lines, so the draw order is unchanged.
    for fold_scores, colour, _ in curves:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for fold_scores, colour, label in curves:
        plt.plot(sizes, np.mean(fold_scores, axis=1), 'o-', color=colour,
                 label=label)

    plt.legend(loc="best")
    return plt

#g = plot_learning_curve(gsGBC.best_estimator_,"GradientBoosting learning curves",X_train,Y_train,cv=kfold)
#g = plot_learning_curve(gsadaDTC.best_estimator_,"AdaBoost learning curves",X_train,Y_train,cv=kfold)
#g = plot_learning_curve(gsRFC.best_estimator_,"RF mearning curves",X_train,Y_train,cv=kfold)
#g = plot_learning_curve(gsSVMC.best_estimator_,"SVC learning curves",X_train,Y_train,cv=kfold)

In [None]:
#from mlxtend.classifier import StackingClassifier
#sclf = StackingClassifier(classifiers=[XGB_best,LGB_best, GBC_best],
#                          meta_classifier=RFC_best)


In [None]:
#sclf_score = cross_val_score(sclf, X_train, y=Y_train, cv=kfold, scoring='accuracy', n_jobs=4)

In [None]:
#sclf_score.mean()

In [None]:
#sclf.fit(X_train, Y_train)

In [None]:
#votingC = VotingClassifier(estimators=[('xgb', XGB_best),('rfc', RFC_best), 
#                                       ('lgb', LGB_best)], voting='soft', n_jobs=4)

#votingC = votingC.fit(X_train, Y_train)

In [None]:
#test_Survived = pd.Series(votingC.predict(test), name="Survived")
# Predictions come from the tuned random forest alone (the voting line above is commented out).
test_Survived = pd.Series(RFC_best.predict(test), name="Survived")
# Column-wise concat pairs each PassengerId with its prediction by index —
# assumes IDtest and the new Series share the same index; TODO confirm.
results = pd.concat([IDtest,test_Survived],axis=1)

# NOTE(review): the file name still says "voting" although a single model produced it.
results.to_csv("ensemble_python_voting.csv",index=False)