In this notebook I will work with Titanic data. This time I'm going to build ensemble stacking machinery based on few machine learning algoriths like random forest, SVM, k-nearest neighbours and SGD classifier. No neural network will be used.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
import import_ipynb
import my_own_transformers

importing Jupyter notebook from my_own_transformers.ipynb


In [12]:
from my_own_transformers import load_and_present, filling_values_regression


data = load_and_present("train.csv")
submission_data =  load_and_present("test.csv", head=False, info=False)

random_state=14
LEN = len(data)
COLUMNS_TO_DROP = []


pass_id = submission_data["PassengerId"]#will be used later
data.drop(["PassengerId", "Name","Ticket","Cabin"],axis=1,inplace=True)
submission_data.drop(["PassengerId", "Name","Ticket","Cabin"],axis=1,inplace=True)


#missing values easiest treatment
data["Embarked"].fillna(inplace=True,value="Q")
submission_data["Fare"].fillna(inplace=True,value=submission_data["Fare"].mean())




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                      

I have already done EDA (exploratory data analysis) in previous notebook. The only thing I will add before going into building and tesing models are combination of features. 

In [13]:
#Using pipeline to transform the data
from sklearn.pipeline import Pipeline
from my_own_transformers import getting_dummies_01, filling_values_regression_method,numerical_to_categorical

pipeline_0 = Pipeline(steps=[("dummies", getting_dummies_01(variables_to_dummies=["Embarked", "Pclass","Sex"]))])
data = pipeline_0.fit_transform(data)
submission_data = pipeline_0.fit_transform(submission_data)

data = filling_values_regression(data, "Age")
submission_data = filling_values_regression(submission_data, "Age")

#Stratified sampling

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=14)
for train_index, test_index in split.split(data, data["Survived"]):
    train = data.iloc[train_index]
    test = data.iloc[test_index]


#pipeline v2 

pipeline = Pipeline(steps=[
   # ("dummies", getting_dummies_01(variables_to_dummies=["Embarked", "Pclass","Sex"])),
  #  ("filling_age", filling_values_regression_method(variable_to_fill="Age",model=LinearRegression(),variable_to_drop="Survived")),
    ("age_to_categorical", numerical_to_categorical(interval=[10,20,30,40,50,6000],variable="Age")),
    ("dummies_age", getting_dummies_01(variables_to_dummies=["Age"])),
    ("fare_to_categorical", numerical_to_categorical(interval=[10,30,6000],variable="Fare")),
    ("dummies_fare", getting_dummies_01(variables_to_dummies=["Fare"]))
])
train = pipeline.fit_transform(train)
test = pipeline.transform(test)
submission_data = pipeline.transform(submission_data)

y_train = train["Survived"]
x_train = train.drop(["Survived"],axis=1)
y_test = test["Survived"]
x_test = test.drop(["Survived"],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  "        output = pd.concat([output,encoded],axis=1)     \n",


In [14]:
#Adding combination of features (like XY, X^2) - we will not go higher than X^3 for sure, it would be too much features
from sklearn.preprocessing import PolynomialFeatures

x_train_poly = PolynomialFeatures(degree=2, include_bias=False ).fit_transform(x_train)
x_test_poly  = PolynomialFeatures(degree=2, include_bias=False ).fit_transform(x_test)
submission_data = PolynomialFeatures(degree=2, include_bias=False ).fit_transform(submission_data)


## Voting classifier

The idea o voting classifier is fitting many different classifiers, like logistic regression, SVM, or random forest and then based on combined informations of their votes (predictions) made final prediction. 
We can distinguish two ways of voting - soft and hard. Hard voting means looking only on predictions of classifiers contained in voting classifier, in turn soft voting is based on predicted probabilities (it works only if classifiers can calculate the probability).

In [15]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rnf_clf = RandomForestClassifier(n_estimators=200)
log_clf = LogisticRegression(max_iter=1000)
svc_clf = SVC(probability=True)
voting_clf = VotingClassifier(estimators=[("01", rnf_clf), ("02", log_clf), ("03", svc_clf)],
                              voting="soft")

print("Standard features:")
for clf in (rnf_clf,log_clf,svc_clf,voting_clf):
    predictions = clf.fit(x_train,y_train).predict(x_test)
    print(clf.__class__.__name__, accuracy_score(predictions,y_test))
print("\n"+"Dataset expanded by combination of features:")
for clf in (rnf_clf,log_clf,svc_clf,voting_clf):
    predictions = clf.fit(x_train_poly,y_train).predict(x_test_poly)
    print(clf.__class__.__name__, accuracy_score(predictions,y_test))
    
#all classifiers hadnt been adjusted (I'm talking about hyperparameters)


Standard features:
RandomForestClassifier 0.7988826815642458
LogisticRegression 0.7877094972067039
SVC 0.7988826815642458
VotingClassifier 0.8212290502793296

Dataset expanded by combination of features:
RandomForestClassifier 0.8156424581005587
LogisticRegression 0.8212290502793296
SVC 0.7877094972067039
VotingClassifier 0.8212290502793296


First, we can observe that adding more features generally improves the score. I it also worth mentioning that soft voting seems giving better results - which isn't suprising, we are not losing the information of probalility in this approach. Overall voting classifier did better than any of it's sub-classifiers. I'm alsgo going to check the performance of AdaBoostClassifier.

In [16]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier()
print("Standard features:")
predictions = clf.fit(x_train,y_train).predict(x_test)
print(ada_clf.__class__.__name__, accuracy_score(predictions,y_test))
print("\n"+"Dataset expanded by combination of features:")
predictions = clf.fit(x_train_poly,y_train).predict(x_test_poly)
print(ada_clf.__class__.__name__, accuracy_score(predictions,y_test))



Standard features:
AdaBoostClassifier 0.8100558659217877

Dataset expanded by combination of features:
AdaBoostClassifier 0.8100558659217877


In [17]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(estimator=AdaBoostClassifier(), X=x_train,y=y_train))
print(cross_val_score(estimator=AdaBoostClassifier(), X=x_train_poly,y=y_train))
print(cross_val_score(estimator=voting_clf, X=x_train, y=y_train))
print(cross_val_score(estimator=voting_clf, X=x_train_poly,y=y_train))

[0.76223776 0.78321678 0.85211268 0.81690141 0.80985915]
[0.81118881 0.81118881 0.76760563 0.82394366 0.8028169 ]
[0.78321678 0.81818182 0.84507042 0.84507042 0.83098592]
[0.8041958  0.81818182 0.83098592 0.83802817 0.82394366]


Now it's time to take care of numbers of features - remove noninformative ones.

In [18]:
ada_clf = ada_clf.fit(x_train_poly,y_train)
#pick just the important features
indexes = []
for i in range(x_train_poly.shape[1]):
    if ada_clf.feature_importances_[i]>0:
        indexes.append(i)
        
important_features = x_train_poly[:,indexes]
important_features_test = x_test_poly[:,indexes]

print("Dataset with important features:", important_features.shape)
print(cross_val_score(estimator=voting_clf, X=important_features,y=y_train))

Dataset with important features: (712, 39)
[0.8041958  0.84615385 0.81690141 0.83802817 0.81690141]


Tuning the parametrs - grid search.

In [19]:
from sklearn.model_selection import GridSearchCV

print(cross_val_score(estimator=AdaBoostClassifier(), X=important_features,y=y_train))


param_grid = [{"n_estimators": [10,50,100], "algorithm": ["SAMME", "SAMME.R"], "learning_rate": [1,0.5]}]

grid_search = GridSearchCV(ada_clf, param_grid, cv=5, scoring="accuracy").fit(important_features,y_train)

ada_clf_best = grid_search.best_estimator_
print(ada_clf_best)
print(cross_val_score(estimator=ada_clf_best, X=important_features,y=y_train))

[0.76923077 0.81118881 0.85915493 0.80985915 0.80985915]
AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1,
                   n_estimators=100, random_state=None)
[0.78321678 0.81118881 0.84507042 0.84507042 0.84507042]


In [20]:
voting_clf = VotingClassifier(estimators=[("01", rnf_clf), ("02", log_clf), ("03", svc_clf), ("04",ada_clf_best)],
                              voting="soft")

In [29]:
all_data = np.concatenate((important_features,important_features_test),axis=0)
all_y = pd.concat([y_train,y_test],axis=0)
voting_clf.fit(all_data, all_y)

submission_data = submission_data[:,indexes]
print(submission_data.shape)


final_submission = pd.DataFrame(data={"PassengerId":pass_id,
    "Survived":[1 if (i > 0.5) else 0 for i in voting_clf.predict(submission_data.reshape(418,-1))]
        })
print(final_submission.head())
final_submission.to_csv(path_or_buf="submission_04.csv",index=False)


(418, 39)
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         0
