# Ensemble/Voting Classification in Python with Scikit-Learn
refï¼šhttps://www.kaggle.com/c/titanic/submit

In [1]:
from unittest import result
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier

In [2]:
training_data = pd.read_csv("data/train.csv")
testing_data = pd.read_csv("data/test.csv")

def get_nulls(training, testing):
    print("Training Data:")
    print(pd.isnull(training).sum())
    print("Testing Data:")
    print(pd.isnull(testing).sum())

get_nulls(training_data, testing_data)

Training Data:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Testing Data:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [3]:
# Drop the cabin column, as there are too many missing values
# Drop the ticket numbers too, as there are too many categories
# Drop names as they won't really help predict survivors
testing_passenger_ids = testing_data["PassengerId"].values

training_data.drop(
    ["Name", "Ticket", "Cabin", "PassengerId"], axis=1, inplace=True)
testing_data.drop(["Name", "Ticket", "Cabin", "PassengerId"],
                  axis=1, inplace=True)

# Taking the mean/average value would be impacted by the skew
# so we should use the median value to impute missing values
training_data["Age"].fillna(training_data["Age"].median(), inplace=True)

training_data["Embarked"].fillna(
    training_data["Embarked"].mode()[0], inplace=True)
testing_data["Age"].fillna(testing_data["Age"].median(), inplace=True)
testing_data["Fare"].fillna(testing_data["Fare"].median(), inplace=True)

get_nulls(training_data, testing_data)

Training Data:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
Testing Data:
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [4]:
# Fit the encoder on the data (Feature: Sex)
encoder = LabelEncoder()
encoder.fit(training_data["Sex"])

# Transform and replace training data
training_data["Sex"] = encoder.transform(training_data["Sex"])
testing_data["Sex"] = encoder.transform(testing_data["Sex"])

print(training_data.head())
# Fit the encoder on the data (Feature: Embarked)

encoder = OneHotEncoder()
embarked_data_arr = np.array(training_data["Embarked"]).reshape(-1, 1)

# Encode training data
encoder.fit(embarked_data_arr)
encoded_array = encoder.transform(embarked_data_arr).toarray()
training_data.drop(["Embarked"], axis=1, inplace=True)
training_data = pd.concat([training_data, pd.DataFrame(
    data=encoded_array, columns=["Embarked_1", "Embarked_2", "Embarked_3"])], axis=1)

# Encode testing data
embarked_data_arr = np.array(testing_data["Embarked"]).reshape(-1, 1)
encoded_array = encoder.transform(embarked_data_arr).toarray()
testing_data.drop(["Embarked"], axis=1, inplace=True)
testing_data = pd.concat([testing_data, pd.DataFrame(
    data=encoded_array, columns=["Embarked_1", "Embarked_2", "Embarked_3"])], axis=1)

# Scaling the data (Feature: Age, Fare)
age_scalar = StandardScaler()
age_scalar.fit(training_data["Age"].values.reshape(-1, 1))

training_data["Age"] = age_scalar.transform(
    training_data["Age"].values.reshape(-1, 1))

testing_data["Age"] = age_scalar.transform(
    testing_data["Age"].values.reshape(-1, 1))

fare_scalar = StandardScaler()
fare_scalar.fit(training_data["Fare"].values.reshape(-1, 1))

training_data["Fare"] = fare_scalar.transform(
    training_data["Fare"].values.reshape(-1, 1))
testing_data["Fare"] = fare_scalar.transform(
    testing_data["Fare"].values.reshape(-1, 1))

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    1  22.0      1      0   7.2500        S
1         1       1    0  38.0      1      0  71.2833        C
2         1       3    0  26.0      0      0   7.9250        S
3         1       1    0  35.0      1      0  53.1000        S
4         0       3    1  35.0      0      0   8.0500        S


In [5]:
# Now to select our training/testing data
features = training_data.drop(["Survived"], axis=1)
labels = training_data["Survived"]

print("Features = \n", features.head())
print("Labels = \n", labels.head())

# Make the train/test data from validation
x_train, x_eval, y_train, y_eval = train_test_split(
    features, labels, test_size=0.1, random_state=12)

Features = 
    Pclass  Sex       Age  SibSp  Parch      Fare  Embarked_1  Embarked_2  \
0       3    1 -0.565736      1      0 -0.502445         0.0         0.0   
1       1    0  0.663861      1      0  0.786845         1.0         0.0   
2       3    0 -0.258337      0      0 -0.488854         0.0         0.0   
3       1    0  0.433312      1      0  0.420730         0.0         0.0   
4       3    1  0.433312      0      0 -0.486337         0.0         0.0   

   Embarked_3  
0         1.0  
1         0.0  
2         1.0  
3         1.0  
4         1.0  
Labels = 
 0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


## Simple Averaging Approach

In [6]:
LogReg_clf = LogisticRegression()
DTree_clf = DecisionTreeClassifier()
SVC_clf = SVC(probability=True)

LogReg_clf.fit(x_train, y_train)
DTree_clf.fit(x_train, y_train)
SVC_clf.fit(x_train, y_train)

LogReg_pred = LogReg_clf.predict(x_eval)
DTree_pred = DTree_clf.predict(x_eval)
SVC_pred = SVC_clf.predict(x_eval)

averaged_preds = (LogReg_pred + DTree_pred + SVC_pred)//3
acc = accuracy_score(y_eval, averaged_preds)
print(acc)

0.8


## Bagging Classification Example

In [7]:
logreg_bagging_model = BaggingClassifier(
    base_estimator=LogReg_clf, n_estimators=50, random_state=12)
dtree_bagging_model = BaggingClassifier(
    base_estimator=DTree_clf, n_estimators=50, random_state=12)
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=12)
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=12)


def bagging_ensemble(name, model):
    k_folds = KFold(n_splits=20, random_state=12, shuffle=True)
    # Array of scores of the estimator for each run of the cross validation.(nd-array)
    scores = cross_val_score(model, x_train, y_train, cv=k_folds)
    print(name, scores.mean())


bagging_ensemble("Bagging with Logistic regression", logreg_bagging_model)
bagging_ensemble("Bagging with Decision trees", dtree_bagging_model)
bagging_ensemble("Bagging with Extra trees", extra_trees_model)
bagging_ensemble("Bagging with Random forest", random_forest_model)

Bagging with Logistic regression 0.7964634146341464
Bagging with Decision trees 0.8126219512195121
Bagging with Extra trees 0.7876219512195123
Bagging with Random forest 0.8088719512195122


## Boosting Classification Example

In [8]:
print("AdaBoost with Decision trees:")
k_folds = KFold(n_splits=20, random_state=12, shuffle=True)
num_estimators = [20, 40, 60, 80, 100]

for num_estimator in num_estimators:
    # default is DecisionTreeClassifier
    ada_boost_clf = AdaBoostClassifier(
        n_estimators=num_estimator, random_state=12)
    scores = cross_val_score(ada_boost_clf, x_train, y_train, cv=k_folds)
    print(f"Result of {num_estimator} estimators: {scores.mean()}")


AdaBoost with Decision trees:
Result of 20 estimators: 0.8052134146341464
Result of 40 estimators: 0.8176524390243903
Result of 60 estimators: 0.8176829268292682
Result of 80 estimators: 0.8114024390243901
Result of 100 estimators: 0.8114024390243904


## voting\Stacking Classification Example

In [9]:
voting_clf = VotingClassifier(
    estimators=[("SVC", SVC_clf), ("DTs", DTree_clf), ("Logreg", LogReg_clf)], voting="soft")

scores = cross_val_score(voting_clf, x_train, y_train, cv=k_folds)

print(f"Voting classifier: {scores.mean()}")

# About the accuracy of the model
# pred_eval = voting_clf.predict(x_eval)
# print("Voting classifier:")
# print(f"Accurracy: {accuracy_score(y_eval, pred_eval)}")
# print(f"Log loss : {log_loss(y_eval, pred_eval)}")
# print(f"F1 score : {f1_score(y_eval, pred_eval)}")

Voting classifier: 0.8301524390243902


## AdaBoost with Voting Classification

In [10]:
print("AdaBoost classifier with SVC, DTs and LR")
num_estimators = [1, 5, 10, 15]

for num_estimator in num_estimators:
    ada_boost_clf = AdaBoostClassifier(
        base_estimator=voting_clf, n_estimators=num_estimator, random_state=12)
    ada_boost_clf.fit(x_train, y_train)
    scores = cross_val_score(ada_boost_clf, x_train, y_train, cv=k_folds)
    print(f"Result of {num_estimator} estimators: {scores.mean()}")


AdaBoost classifier with SVC, DTs and LR
Result of 1 estimators: 0.8014024390243903
Result of 5 estimators: 0.8126219512195121
Result of 10 estimators: 0.8001219512195122
Result of 15 estimators: 0.8013719512195122


In [None]:
def export_file_for_submission(filename, passenger_id_list, prediction):
    """
    passenger_id_list: 1D-array for passenfer ID
    prediction: 1D-array for prediction
    """
    df = pd.DataFrame(np.c_[passenger_id_list, prediction],
                      columns=["PassengerId", "Survived"])
    df.to_csv(path_or_buf=filename, index=False)

![](data/screenshot.png)