# Bagging

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot
%matplotlib inline
import pandas as pd

In [None]:
# Load seaborn's "titanic" example dataset into a DataFrame.
df = sns.load_dataset("titanic")

In [None]:
# Show the (rows, columns) dimensions of the raw data.
df.shape

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Drop only the rows that are missing a value the models actually use.
# A bare dropna() would also discard every passenger with a NaN in an
# unrelated column, shrinking the data set for no modelling benefit.
df.dropna(subset=['pclass', 'sex', 'age', 'survived'], inplace=True)

In [None]:
# Preview the data again after the rows with missing values were dropped.
df.head()


In [None]:
# Histogram of the 'age' column using 50 bins.
df['age'].hist(bins=50)

## Data Pre-processing

In [None]:
# Feature matrix: passenger class, sex and age.
# Take an explicit copy so that the in-place encoding of 'sex' further down
# modifies X only and does not raise pandas' SettingWithCopyWarning.
X = df[['pclass','sex','age']].copy()

In [None]:
from sklearn import preprocessing
# Binarizer used below to turn the 'sex' column into a numeric feature.
lb = preprocessing.LabelBinarizer()

In [None]:
# Overwrite the 'sex' column in X with its LabelBinarizer encoding.
X['sex'] = lb.fit_transform(X['sex'])

In [None]:
# Preview the encoded feature matrix.
X.head()

In [None]:
# Show the (rows, columns) dimensions of the feature matrix.
X.shape

In [None]:
# Summary statistics for each feature column.
X.describe()

In [None]:
# Column dtypes and non-null counts of the feature matrix.
X.info()

In [None]:
# Target vector: the 'survived' column.
y=df['survived']

In [None]:
# Number of rows per target class.
y.value_counts()

## Fit Model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
def print_training_score(clf, X, y):
    """Print fit-quality metrics for a classifier plus its 10-fold CV accuracy.

    Prints accuracy, the classification report and the confusion matrix of
    ``clf.predict(X)`` against ``y``, followed by the mean and standard
    deviation of a 10-fold cross-validated accuracy of ``clf`` on ``(X, y)``.

    Args:
        clf: A fitted estimator exposing ``predict``; it is also passed to
            ``cross_val_score``.
        X: Feature matrix to score on.
        y: True labels aligned with ``X``.
    """
    print("Training Result:\n")

    # Predict once and reuse the result for all three metrics; the large
    # ensembles in this notebook are expensive to query repeatedly.
    y_pred = clf.predict(X)

    print("Accuracy: {0:.4f}\n".format(accuracy_score(y, y_pred)))
    print("Classification Report: \n {} \n".format(classification_report(y, y_pred)))
    print("Confusion Matrix: \n {} \n".format(confusion_matrix(y, y_pred)))

    res = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
    print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

In [None]:
def print_test_score(clf, X, y):
    """Print accuracy, classification report and confusion matrix for ``clf``.

    Args:
        clf: A fitted estimator exposing ``predict``.
        X: Feature matrix to score on.
        y: True labels aligned with ``X``.
    """
    print("Test Result:\n")

    # Predict once and reuse the result for all three metrics.
    y_pred = clf.predict(X)

    print("Accuracy: {0:.4f}\n".format(accuracy_score(y, y_pred)))
    print("Classification Report: \n {} \n".format(classification_report(y, y_pred)))
    print("Confusion Matrix: \n {} \n".format(confusion_matrix(y, y_pred)))

## Decision Tree

In [None]:
# Single decision tree, seeded for reproducibility.
clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the tree on the training split.
clf.fit(X_train, y_train)

In [None]:
# Report training metrics on the training split only, like every other model
# in this notebook; scoring on the full X, y would mix in the held-out rows.
print_training_score(clf, X_train, y_train)

In [None]:
# Evaluate the tree on the held-out test split.
print_test_score(clf, X_test,y_test)

## Bagging 

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
#bag_clf = BaggingClassifier(base_estimator=clf, n_estimators=1000, bootstrap=False,n_jobs=-1,random_state=42)

In [None]:
# Bagging ensemble of 1000 copies of the decision tree, each fitted on a
# bootstrap sample, trained on all cores.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form works with both.
bag_clf = BaggingClassifier(clf, n_estimators=1000, bootstrap=True, n_jobs=-1, random_state=42)

In [None]:
# Fit the bagging ensemble on the training split.
bag_clf.fit(X_train, y_train)

In [None]:
# Training metrics and 10-fold CV accuracy for the bagging ensemble.
print_training_score(bag_clf, X_train, y_train)

In [None]:
# Evaluate the bagging ensemble on the held-out test split.
print_test_score(bag_clf, X_test, y_test)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with library-default hyper-parameters and a fixed seed.
rf_clf = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training split.
rf_clf.fit(X_train, y_train)

In [None]:
# Training metrics and 10-fold CV accuracy for the random forest.
print_training_score(rf_clf, X_train, y_train)

In [None]:
# Evaluate the random forest on the held-out test split.
print_test_score(rf_clf, X_test, y_test)

## Grid Search

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid explored by the grid search over the random forest.
params_grid = dict(
    max_depth=[3, None],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
)

In [None]:
# 5-fold grid search over params_grid for the random forest, scored by
# accuracy and run on all cores (n_jobs=-1).
grid_search = GridSearchCV(rf_clf, params_grid, n_jobs=-1, cv=5, verbose=1, scoring='accuracy')

In [None]:
# Run the search on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated accuracy found by the search.
grid_search.best_score_

In [None]:
# Full parameter set of the best estimator found by the search.
grid_search.best_estimator_.get_params()

In [None]:
# Training metrics for the tuned model.
# NOTE: print_training_score also runs a 10-fold cross_val_score on the object
# it receives, so the whole grid search is repeated inside every fold here —
# expect this cell to be slow.
print_training_score(grid_search, X_train, y_train)

In [None]:
# Evaluate the tuned model on the held-out test split.
print_test_score(grid_search, X_test,y_test)

# Extremely Randomized Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Extra-trees ensemble with library-default hyper-parameters and a fixed seed.
xt_clf = ExtraTreesClassifier(random_state=42)

In [None]:
# Fit the extra-trees ensemble on the training split.
xt_clf.fit(X_train, y_train)

In [None]:
# Training metrics (with 10-fold CV accuracy), then held-out test metrics,
# for the extra-trees ensemble.
print_training_score(xt_clf, X_train, y_train)
print_test_score(xt_clf, X_test,y_test)

# AdaBoost

In [None]:
# Fresh 70/30 split for the boosting models, seeded like the first split so
# the scores below are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with its default base learner; seeded like the other models in
# this notebook so repeated runs give the same scores.
ada_clf = AdaBoostClassifier(random_state=42)

In [None]:
# Fit AdaBoost on the training split.
ada_clf.fit(X_train, y_train)

In [None]:
# Training metrics and 10-fold CV accuracy for AdaBoost.
print_training_score(ada_clf , X_train, y_train)

In [None]:
# Evaluate AdaBoost on the held-out test split.
print_test_score(ada_clf, X_test, y_test)

## AdaBoost with Random Forest

In [None]:
# AdaBoost whose base learner is a small (10-tree), seeded random forest.
# The booster itself is seeded too, so results are reproducible for a given
# train/test split.
ada_clf = AdaBoostClassifier(
    RandomForestClassifier(
        bootstrap=False,
        criterion="entropy",
        min_samples_leaf=1,
        min_samples_split=2,
        n_estimators=10,
        n_jobs=-1,
        oob_score=False,
        random_state=42,
    ),
    random_state=42,
)

In [None]:
# Fit the forest-based AdaBoost model on the training split.
ada_clf.fit(X_train, y_train)

In [None]:
# Training metrics (with 10-fold CV accuracy), then held-out test metrics,
# for the forest-based AdaBoost model.
print_training_score(ada_clf, X_train, y_train)
print_test_score(ada_clf, X_test,y_test)

In [None]:
# Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with library-default hyper-parameters; seeded like the
# other models in this notebook so repeated runs give the same scores.
gb_clf = GradientBoostingClassifier(random_state=42)

In [None]:
# Fit gradient boosting on the training split.
gb_clf.fit(X_train, y_train)

In [None]:
# Training metrics (with 10-fold CV accuracy), then held-out test metrics,
# for gradient boosting.
print_training_score(gb_clf, X_train, y_train)
print_test_score(gb_clf, X_test,y_test)

# Extreme Gradient Boosting

In [None]:
import xgboost as xgb

In [None]:
# XGBoost with depth-1 trees: 5000 boosting rounds at a learning rate of 0.2.
xgb_clf = xgb.XGBClassifier(max_depth=1, n_estimators=5000, learning_rate=0.2)

In [None]:
# Fit XGBoost on the training split.
xgb_clf.fit(X_train, y_train)

In [None]:
# Training metrics (with 10-fold CV accuracy), then held-out test metrics,
# for XGBoost.
print_training_score(xgb_clf, X_train, y_train)
print_test_score(xgb_clf, X_test,y_test)