# 1. Импорты

In [1]:
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier

from sklearn.metrics import f1_score, roc_auc_score, roc_curve

# 2. Обучение

In [10]:
train = pd.read_csv("train.csv")

y = train["Survived"]
X = train.drop(columns=["PassengerId", "Survived", "Name", "Ticket"])
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.2500,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.9250,,S
3,1,female,35.0,1,0,53.1000,C123,S
4,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,,S
887,1,female,19.0,0,0,30.0000,B42,S
888,3,female,,1,2,23.4500,,S
889,1,male,26.0,0,0,30.0000,C148,C


In [5]:
class AgeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, step_of_partitions=10, mode="r"):
        self.mode = mode
        self.step_of_partitions = step_of_partitions
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        
        ages = [x for x in range(self.step_of_partitions, int(X_copy.Age.max())+self.step_of_partitions, self.step_of_partitions)]
        for step in ages:
            X_copy.Age.mask(((X_copy.Age > step-self.step_of_partitions) & (X_copy.Age <= step)), step, inplace=True)
        
        if self.mode == "r":
            X_copy.loc[:, 'Age'] = X_copy.Age.apply(lambda x: random.choice(ages) if np.isnan(x) else x)

        if self.mode == "s":
            counts = X_copy.Age.value_counts(normalize=True).round(3).sort_index()
            
            ages = counts.index.values
            weights = counts.values
            
            X_copy.loc[:, 'Age'] = X_copy.Age.apply(lambda x: random.choices(ages, weights, k=1)[0] if np.isnan(x) else x)

        return X_copy

In [33]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.2500,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.9250,,S
3,1,female,35.0,1,0,53.1000,C123,S
4,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,,S
887,1,female,19.0,0,0,30.0000,B42,S
888,3,female,,1,2,23.4500,,S
889,1,male,26.0,0,0,30.0000,C148,C


In [54]:
X_ = X.copy()

# Dummy-обработка категориальных признаков
X_.Sex = X_.Sex.map({"male": 1, "female": 0}, na_action="ignore")
X_.Embarked = X_.Embarked.map({"S": 0, "C": 1, "Q": 2}, na_action="ignore")

labels = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8}
X_.Cabin = X_.Cabin.apply(lambda x: x if pd.isnull(x) else labels[x[0]])


# Обработка пропущенных значений
imputer = SimpleImputer(strategy="mean")
X_ = imputer.fit_transform(X_)

X_

array([[ 3.        ,  1.        , 22.        , ...,  7.25      ,
         3.39215686,  0.        ],
       [ 1.        ,  0.        , 38.        , ..., 71.2833    ,
         3.        ,  1.        ],
       [ 3.        ,  0.        , 26.        , ...,  7.925     ,
         3.39215686,  0.        ],
       ...,
       [ 3.        ,  0.        , 29.69911765, ..., 23.45      ,
         3.39215686,  0.        ],
       [ 1.        ,  1.        , 26.        , ..., 30.        ,
         3.        ,  1.        ],
       [ 3.        ,  1.        , 32.        , ...,  7.75      ,
         3.39215686,  2.        ]])

In [6]:
estimators = [
    ("age-transformer", AgeTransformer(step_of_partitions=10, mode="r")),
    ("imputer", SimpleImputer(strategy="most_frequent")), 
    ("encoder", OrdinalEncoder()), 
    ("scaler", Normalizer())
]

In [55]:
#prepare_pipe = Pipeline(estimators)
#X_ = prepare_pipe.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.20, random_state=42)

model = SGDClassifier(random_state=42)

'''parametrs = {
    "loss": ["log_loss"],
    "penalty": ["l1"],
    "alpha": [0.0],
    "shuffle": [False],
    "max_iter": [1000],
    "learning_rate": ["adaptive"],
    "eta0": [6.0],
    "early_stopping": [False],
}
grid = GridSearchCV(model, parametrs)

grid.fit(X_train, y_train)
print(grid.best_params_)'''

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
roc_auc_score(y_test, y_pred)

0.5