In [58]:
import numpy as np
import pandas as pd
import random

In [59]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, Normalizer

from sklearn.impute import SimpleImputer

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error as mse

In [60]:
train = pd.read_csv("train.csv")

y = train["Survived"]
X = train.drop(columns=["PassengerId", "Survived"])
X

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [61]:
class AgeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, step_of_partitions=10, mode="r"):
        self.mode = mode
        self.step_of_partitions = step_of_partitions
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        
        ages = [x for x in range(self.step_of_partitions, int(X_copy.Age.max())+self.step_of_partitions, self.step_of_partitions)]
        for step in ages:
            X_copy.Age.mask(((X_copy.Age > step-self.step_of_partitions) & (X_copy.Age <= step)), step, inplace=True)
        
        if self.mode == "r":
            X_copy.loc[:, 'Age'] = X_copy.Age.apply(lambda x: random.choice(ages) if np.isnan(x) else x)

        if self.mode == "s":
            counts = X_copy.Age.value_counts(normalize=True).round(3).sort_index()
            
            ages = counts.index.values
            weights = counts.values
            
            X_copy.loc[:, 'Age'] = X_copy.Age.apply(lambda x: random.choices(ages, weights, k=1)[0] if np.isnan(x) else x)

        return X_copy

In [62]:
estimators = [
    ("age-transformer", AgeTransformer(step_of_partitions=10, mode="r")),
    ("imputer", SimpleImputer(strategy="most_frequent")), 
    ("encoder", OrdinalEncoder()), 
    ("scaler", Normalizer())
]

In [65]:
prepare_pipe = Pipeline(estimators)
X_ = prepare_pipe.fit_transform(X)
print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.20, random_state=42)

model = LogisticRegression(solver="liblinear", max_iter=100, penalty="l2", random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(mse(y_test, y_pred))
model.coef_

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')
0.29608938547486036


array([[-0.08524125, -0.26468988, -0.17620309, -0.14525648, -0.02472494,
         0.01377559, -0.90761215,  2.36783071,  0.30078341, -0.13001529]])

In [None]:
'''
# {'max_iter': 300, 'penalty': None, 'solver': 'lbfgs'} 0.19
lbfgs_parameters = {
    "penalty": ["l2", None], 
    "solver": ["lbfgs"], 
    "max_iter": [x for x in range(100, 1000, 100)]
}

# {'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'} 0.29
liblinear_parameters = {
    "penalty": ["l2", "l1"], 
    "solver": ["liblinear"], 
    "max_iter": [x for x in range(100, 1000, 100)]
}

# {'max_iter': 100, 'penalty': None, 'solver': 'newton-cg'} 0.20
newton_cg_parameters = {
    "penalty": ["l2", None], 
    "solver": ["newton-cg"], 
    "max_iter": [x for x in range(100, 1000, 100)]
}

# {'max_iter': 100, 'penalty': None, 'solver': 'newton-cholesky'} 0.20
newton_cholesky_parameters = {
    "penalty": ["l2", None], 
    "solver": ["newton-cholesky"], 
    "max_iter": [x for x in range(100, 1000, 100)]
}

# {'max_iter': 4700, 'penalty': None, 'solver': 'sag'} 0.22
sag_parameters = {
    "penalty": ["l2", None], 
    "solver": ["sag"], 
    "max_iter": [x for x in range(4000, 6000, 100)]
}

# {'max_iter': 5600, 'penalty': None, 'solver': 'saga'} 0.22
saga_parameters = {
    "penalty": ["l1", "l2", "elasticnet", None], 
    "solver": ["saga"], 
    "max_iter": [x for x in range(4000, 6000, 100)]
}


grid = GridSearchCV(model, saga_parameters)
grid.fit(X_train, y_train)

print(grid.best_params_)'''

### TODO
1. Трансформер для выбора признаков (по коеффициенту Пирсона, в дальнейшем - возможен другой коеффициент корреляции)
2. Переписать стандартные трансформеры в кастомные (по необходимости, см. скрипт с анализом)