In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

from xgboost import XGBClassifier

In [28]:
# Read the raw table, discard every row that has at least one missing value
# (in any column), and renumber the remaining rows from 0.
data = (
    pd.read_csv('spaceship_titanic.csv')
    .dropna()
    .reset_index(drop=True)
)
print(data.columns)

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')


In [29]:
# Drop the columns that are not used as features.
data = data.drop(columns=['PassengerId', 'Name', 'Cabin'])

categorical_features = ['HomePlanet', 'CryoSleep',
                        'Destination', 'VIP']
numeric_features = ['Age', 'RoomService', 'FoodCourt',
                    'ShoppingMall', 'Spa', 'VRDeck']

# Encode the target as integer class labels.
data['Transported'] = LabelEncoder().fit_transform(data['Transported'])

# NOTE(review): both the scaler and the target encoding below are fitted on the
# whole table, i.e. before the train/test split performed later in the
# notebook, so held-out rows (and their targets) influence the features.
# Fitting them on the training part only would remove this leakage.

# Numeric features: standardise to zero mean and unit variance.
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

# Categorical features: mean target encoding -- every category is replaced by
# the mean of `Transported` over the rows that share it.
# `groupby(...).transform('mean')` yields a new float column in a single step.
# This avoids a chained in-place `mask` on a column selection (which does not
# modify `data` under pandas Copy-on-Write) and avoids comparing raw category
# values against a column that already contains encoded floats.
for feature in categorical_features:
    data[feature] = data.groupby(feature)['Transported'].transform('mean')

print(data)

      HomePlanet  CryoSleep  Destination       Age       VIP  RoomService  \
0       0.659892   0.332709     0.466783  0.695413  0.506983    -0.345756   
1       0.425687   0.332709     0.466783 -0.336769  0.506983    -0.176748   
2       0.659892   0.332709     0.466783  2.002842  0.370370    -0.279083   
3       0.659892   0.332709     0.466783  0.282540  0.506983    -0.345756   
4       0.425687   0.332709     0.466783 -0.887266  0.506983     0.124056   
...          ...        ...          ...       ...       ...          ...   
6601    0.659892   0.332709     0.622601  0.833037  0.370370    -0.345756   
6602    0.425687   0.816895     0.505618 -0.749641  0.506983    -0.345756   
6603    0.425687   0.332709     0.466783 -0.199145  0.506983    -0.345756   
6604    0.659892   0.332709     0.622601  0.213728  0.506983    -0.345756   
6605    0.659892   0.332709     0.466783  1.039473  0.506983    -0.150389   

      FoodCourt  ShoppingMall       Spa    VRDeck  Transported  
0     -0.2

In [30]:
# Target column vs. everything else.
y = data['Transported']
x = data.drop(columns=['Transported'])

# Fixed-seed hold-out: 20% of the rows go to the test part.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    random_state=1,
    test_size=0.2,
)

In [31]:
# Stand-alone baselines: every model is trained on the full training split and
# scored with F1 on the test split.
knn = KNeighborsClassifier(n_neighbors=3)
# Seeded like the split and the logistic regression, so the printed score is
# the same on every Restart & Run All.
rf = RandomForestClassifier(max_depth=6, n_estimators=50, random_state=1)
logreg = LogisticRegression(random_state=1)

baselines = (
    ('KNN: {}', knn),
    ('Random forest: {}', rf),
    ('Logistic Regression: {}', logreg),
)
for report_template, baseline_model in baselines:
    baseline_model.fit(x_train, y_train)
    print(report_template.format(f1_score(y_test, baseline_model.predict(x_test))))

KNN: 0.7683109118086696
Random forest: 0.7941605839416058
Logistic Regression: 0.8063583815028902


### **Блендинг**

In [32]:
class Blender:
    """Blending ensemble: base models feed their predictions to a meta-model.

    ``fit`` splits the training data once into a "base" part and a hold-out
    "meta" part.  The base models are trained on the base part; their
    ``predict`` outputs on the hold-out part become the training features of
    ``meta_alg``.  ``predict`` repeats the second step on new data.

    Parameters
    ----------
    base_models : list
        Estimators exposing ``fit(X, y)`` / ``predict(X)``.  They are cloned
        inside ``fit`` and never fitted in place, so the same list can safely
        be handed to several ensembles.
    meta_alg : estimator
        Meta-model exposing ``fit`` / ``predict``.  Fitted in place.
    meta_size : float, default=0.3
        Passed to ``train_test_split`` as ``test_size``: the share of the
        training rows held out for the meta-model.
    random_state : int, default=2
        Seed of the base/meta split.

    Attributes
    ----------
    models_ : list
        Fitted clones of ``base_models``; available after ``fit``.
    """

    def __init__(self, base_models, meta_alg, meta_size=0.3, random_state=2):
        self.models = base_models
        self.meta_alg = meta_alg
        self.meta_size = meta_size
        self.random_state = random_state

    @staticmethod
    def _meta_features(fitted_models, x):
        """Collect ``predict(x)`` of every model as one DataFrame column each.

        Columns are named '1', '2', ... in model order.  The result is always
        two-dimensional, including when there is only one base model.
        """
        predictions = [np.asarray(model.predict(x)) for model in fitted_models]
        column_names = [str(i + 1) for i in range(len(predictions))]
        return pd.DataFrame(dict(zip(column_names, predictions)))

    def fit(self, x_train, y_train):
        """Train the base models and the meta-model; return ``self``.

        Raises
        ------
        ValueError
            If the ensemble was created without base models.
        """
        if not self.models:
            raise ValueError('Blender needs at least one base model')

        x_train_base, x_train_meta, y_train_base, y_train_meta = train_test_split(
            x_train, y_train,
            test_size=self.meta_size, random_state=self.random_state)

        # Work on clones so that the caller's estimators (possibly shared with
        # other ensembles) are left untouched.
        self.models_ = []
        for base_model in self.models:
            fitted_model = clone(base_model)
            fitted_model.fit(x_train_base, y_train_base)
            self.models_.append(fitted_model)

        self.meta_alg.fit(self._meta_features(self.models_, x_train_meta),
                          y_train_meta)
        return self

    def predict(self, x_test):
        """Return the meta-model's predictions for ``x_test``.

        ``fit`` must have been called first.
        """
        return self.meta_alg.predict(self._meta_features(self.models_, x_test))

**Базовые алгоритмы и метаалгоритм**

In [33]:
# Base learners for the ensembles; `models` keeps them in a fixed order.
# NOTE(review): the random forest has no random_state, so every fit of
# rf_model differs between runs and the scores below are not exactly
# reproducible.
knn_model, rf_model, logreg_model = models = [
    KNeighborsClassifier(n_neighbors=3),
    RandomForestClassifier(n_estimators=50, max_depth=6),
    LogisticRegression(random_state=1),
]

# Meta-learner trained on the base learners' predictions.
meta_alg = XGBClassifier(max_depth=3, n_estimators=100)

**Создаем и обучаем "blender"**

In [34]:
# Hand the blender its own (shallow) copy of the model list, then train it on
# the training split.
blender = Blender(models[:], meta_alg)
blender.fit(x_train, y_train);

**Посмотрим на результат**

In [35]:
# Score the fitted blender on the held-out test split.
blender_predict = blender.predict(x_test)
blending_f1 = f1_score(y_test, blender_predict)

print('Blending: {}'.format(blending_f1))

Blending: 0.8063354931605473


**Теперь попробуем усредненный blending**

In [36]:
N_BLENDERS = 5


def reseeded_clone(estimator, seed):
    """Return an unfitted copy of ``estimator``, seeded with ``seed`` when it
    has a ``random_state`` parameter (estimators without one are just cloned).
    """
    fresh = clone(estimator)
    if 'random_state' in fresh.get_params():
        fresh.set_params(random_state=seed)
    return fresh


# Every ensemble member owns its base models: they are clones, not the shared
# objects from `models`, and each member gets its own seed.  The members are
# otherwise configured identically, so the seeds are what makes them differ --
# explicitly and reproducibly.
blenders = [
    Blender([reseeded_clone(model, seed) for model in models],
            XGBClassifier(n_estimators=100, max_depth=3))
    for seed in range(N_BLENDERS)
]

# The loop variable is deliberately not called `blender`, so the single
# blender trained earlier in the notebook is not overwritten.
results = []
for member in blenders:
    member.fit(x_train, y_train)
    results.append(member.predict(x_test))

# Majority vote: average the 0/1 predictions and round at 0.5.
result_predict = (np.mean(np.array(results), axis=0) >= 0.5).astype(int)

print('Averaged Blending: {}'.format(f1_score(y_test, result_predict)))

Averaged Blending: 0.8054558506819813


In [37]:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.model_selection import cross_val_predict
from sklearn.utils import check_random_state


class Stacker(BaseEstimator, ClassifierMixin):
    """Stacking classifier built on out-of-fold predictions.

    For every base model, ``fit`` obtains cross-validated predictions for the
    training rows, adds a small Gaussian jitter to them and trains
    ``ens_model`` on the resulting matrix.  Each base model is then refitted
    on the full training data for use in ``predict``.

    Parameters
    ----------
    models : list
        Base estimators.  They are cloned inside ``fit`` and never fitted in
        place, so the same list can be shared with other ensembles.
    ens_model : estimator
        Meta-model trained on the base models' predictions.  Fitted in place.

    Attributes
    ----------
    n : int
        Number of base models.
    valid : ndarray of shape (n_samples, n) or None
        Jittered out-of-fold predictions used to train ``ens_model``;
        ``None`` until ``fit`` is called.
    models_ : list
        Clones of ``models`` fitted on the full training data.
    """

    def __init__(self, models, ens_model):
        self.models = models
        self.ens_model = ens_model
        self.n = len(models)
        self.valid = None

    def fit(self, x, y=None, cv=3, err=0.001, random_state=None):
        """Fit the base models and the meta-model; return ``self``.

        Parameters
        ----------
        x, y : training features and targets (``y`` is required).
        cv : cross-validation setting forwarded to ``cross_val_predict``.
        err : standard deviation of the jitter added to the out-of-fold
            predictions.
        random_state : seed for the jitter.  ``None`` (default) draws from
            NumPy's global random state, exactly like ``np.random.randn``.

        Raises
        ------
        ValueError
            If ``y`` is not given.
        """
        if y is None:
            raise ValueError('Stacker.fit requires the target values `y`')

        self.n = len(self.models)
        rng = check_random_state(random_state)

        # Start from pure jitter, then add each model's out-of-fold predictions
        # to its own column.  The jitter exists only at fit time: predict()
        # feeds the meta-model un-jittered predictions.
        # NOTE(review): with hard 0/1 predictions and a tree-based `ens_model`
        # the meta-model may end up splitting on the jitter itself -- consider
        # err=0 in that setting; confirm experimentally.
        self.valid = err * rng.randn(x.shape[0], self.n)

        self.models_ = []
        for t, clf in enumerate(self.models):
            self.valid[:, t] += cross_val_predict(clf, x, y, cv=cv, n_jobs=-1, method='predict')
            # Refit a clone on all training rows; the caller's estimator stays
            # untouched.
            fitted_clf = clone(clf)
            fitted_clf.fit(x, y)
            self.models_.append(fitted_clf)

        self.ens_model.fit(self.valid, y)

        return self

    def predict(self, X, y=None):
        """Return the meta-model's predictions for ``X`` (``y`` is ignored).

        ``fit`` must have been called first.
        """
        X_meta = np.zeros((X.shape[0], len(self.models_)))

        for t, clf in enumerate(self.models_):
            X_meta[:, t] = clf.predict(X)

        return self.ens_model.predict(X_meta)

In [38]:
# Stack the base models under a default XGBoost meta-learner; fit() returns the
# estimator itself, so construction and training can be chained.
ens_model = XGBClassifier()
s1 = Stacker(models, ens_model).fit(x_train, y_train)

stacking_f1 = f1_score(y_test, s1.predict(x_test))
print('F1-score on stacking: {}'.format(stacking_f1))

F1-score on stacking: 0.7713147410358566
