In [1]:
import dill
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline

In [2]:
# Read the passenger table from titanic.csv, using PassengerId as the index.
df = pd.read_csv('titanic.csv', index_col='PassengerId')
# Print (rows, columns), then let the cell display the first rows.
print(df.shape)
df.head()

(891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Number of missing values in each column.
df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [4]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks ``column`` out of its input by key.

    Parameters
    ----------
    column : hashable
        Key used to index the input in ``transform``.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[self.column]``."""
        key = self.column
        return X[key]
    

class Imputer(BaseEstimator, TransformerMixin):
    """Fill missing values of one column with a statistic learned in ``fit``.

    Parameters
    ----------
    strategy : str
        ``'mode'`` fills with the most frequent value, ``'median'`` fills
        with the median of the fitted data.

    Attributes
    ----------
    column_imputer
        Fill value computed by ``fit``; ``None`` until ``fit`` is called.
    """

    def __init__(self, strategy):
        self.strategy = strategy
        self.column_imputer = None

    def fit(self, X, y=None):
        """Compute the fill value from ``X`` according to ``strategy``.

        Raises
        ------
        ValueError
            If ``strategy`` is neither ``'mode'`` nor ``'median'``.
        """
        if self.strategy == 'mode':
            # value_counts() sorts by frequency, so the first index entry
            # is the most frequent value.
            self.column_imputer = X.value_counts().index[0]
        elif self.strategy == 'median':
            self.column_imputer = X.median()
        else:
            # Fail here rather than later in transform(), where fillna(None)
            # would raise a much less helpful error.
            raise ValueError(
                f"Unknown strategy {self.strategy!r}; expected 'mode' or 'median'"
            )
        return self

    def transform(self, X):
        """Return a DataFrame built from ``X`` with missing values filled.

        The input is left untouched: ``fillna`` produces a new object
        instead of modifying the caller's data in place.
        """
        return pd.DataFrame(X.fillna(self.column_imputer))
    

class OrdinalEncoder(BaseEstimator, TransformerMixin):
    """Replace the values of one column with frequency-ranked integer codes.

    Parameters
    ----------
    column : hashable
        Name of the column in the input frame that is encoded.

    Attributes
    ----------
    key_dict : dict
        Mapping ``value -> code`` built by ``fit``.
    """

    def __init__(self, column):
        self.key_dict = {}
        self.column = column

    def fit(self, X, y=None):
        """Build the value-to-code mapping from ``X[self.column]``."""
        # The most frequent value gets ordinal 0; each less frequent value
        # gets the next number. The mapping is rebuilt from scratch so that
        # refitting does not keep categories from an earlier fit.
        self.key_dict = {
            value: code
            for code, value in enumerate(X[self.column].value_counts().index)
        }

        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.column`` replaced by its codes.

        Values that were not seen during ``fit`` are encoded as 0, the code
        of the most frequent value. Neither the fitted mapping nor the input
        frame is modified.
        """
        encoded = X.copy()
        # map() yields NaN for keys missing from key_dict; those become 0.
        encoded[self.column] = (
            X[self.column].map(self.key_dict).fillna(0).astype('int64')
        )

        return encoded

In [5]:
# One sub-pipeline per feature: select the column, fill its gaps, and
# (for Sex only) turn the labels into integer codes.
age = Pipeline(steps=[
    ('selector', FeatureSelector(column='Age')),
    ('imputer', Imputer(strategy='median')),
])

pclass = Pipeline(steps=[
    ('selector', FeatureSelector(column='Pclass')),
    ('imputer', Imputer(strategy='mode')),
])

sex = Pipeline(steps=[
    ('selector', FeatureSelector(column='Sex')),
    ('imputer', Imputer(strategy='mode')),
    ('encoder', OrdinalEncoder(column='Sex')),
])

# Combine the per-feature outputs side by side into one feature matrix.
feats = FeatureUnion(transformer_list=[
    ('Age', age),
    ('Pclass', pclass),
    ('Sex', sex),
])

In [6]:
# Full model: feature extraction followed by a seeded random forest.
rf_model = Pipeline(
    steps=[
        ('features', feats),
        ('classifier', RandomForestClassifier(random_state=29)),
    ]
)

In [7]:
# Split features and target with a fixed seed. The target column is dropped
# from the feature frame so X_train / X_test (and the exported X_test.csv)
# never carry the label the model is supposed to predict.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'), df['Survived'], random_state=29
)

In [8]:
# Write the hold-out features and labels to CSV files in the working directory.
X_test.to_csv('X_test.csv')
y_test.to_csv('y_test.csv')

In [9]:
# Fit the feature pipelines and the classifier on the training split.
rf_model.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Age',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Age')),
                                                                 ('imputer',
                                                                  Imputer(strategy='median'))])),
                                                ('Pclass',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Pclass')),
                                                                 ('imputer',
                                                                  Imputer(strategy='mode'))])),
                                                ('Sex',
                                                 Pipeline(steps=[('selector',
                   

In [10]:
# Keep column 1 of predict_proba: one score per test row, used below as the
# positive-class score for the precision/recall curve.
preds = rf_model.predict_proba(X_test)[:, 1]

In [11]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F-score at every point of the curve. Where precision + recall == 0 the
# ratio is 0/0; the score is defined as 0 there instead of NaN, because
# np.argmax would otherwise return the position of the first NaN.
pr_sum = precision + recall
fscore = np.divide(
    2 * precision * recall,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
# locate the index of the largest f score; the final curve point has no
# matching entry in `thresholds`, so it is excluded from the search
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.2627380952380952, F-Score=0.743, Precision=0.657, Recall=0.855


In [12]:
# Serialize the selected decision threshold to best_threshold.dill.
with open('best_threshold.dill', 'wb') as threshold_file:
    dill.dump(thresholds[ix], threshold_file)

In [13]:
# Serialize the fitted pipeline (features + classifier) to rforest_model.dill.
with open('rforest_model.dill', 'wb') as model_file:
    dill.dump(rf_model, model_file)