# 1. Импорты

In [84]:
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [85]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier

from sklearn.metrics import f1_score, roc_auc_score, roc_curve, mean_squared_error

# 2. Обучение

In [86]:
train = pd.read_csv("train.csv")

y = train["Survived"]
X = train.drop(columns=["PassengerId", "Survived", "Name", "Ticket"])
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.2500,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.9250,,S
3,1,female,35.0,1,0,53.1000,C123,S
4,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,,S
887,1,female,19.0,0,0,30.0000,B42,S
888,3,female,,1,2,23.4500,,S
889,1,male,26.0,0,0,30.0000,C148,C


In [87]:
class AgeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, step_of_partitions=10, mode="r"):
        self.mode = mode
        self.step_of_partitions = step_of_partitions
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        
        ages = [x for x in range(self.step_of_partitions, int(X_copy.Age.max())+self.step_of_partitions, self.step_of_partitions)]
        for step in ages:
            X_copy.Age.mask(((X_copy.Age > step-self.step_of_partitions) & (X_copy.Age <= step)), step, inplace=True)
        
        if self.mode == "r":
            X_copy.loc[:, 'Age'] = X_copy.Age.apply(lambda x: random.choice(ages) if np.isnan(x) else x)

        if self.mode == "s":
            counts = X_copy.Age.value_counts(normalize=True).round(3).sort_index()
            
            ages = counts.index.values
            weights = counts.values
            
            X_copy.loc[:, 'Age'] = X_copy.Age.apply(lambda x: random.choices(ages, weights, k=1)[0] if np.isnan(x) else x)

        return X_copy

In [88]:
class SexTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, mode="r"):
        self.mode = mode
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        
        X_copy.Sex = X_copy.Sex.map({"male": 1, "female": 0}, na_action="ignore")
        
        sex = X_copy.Sex.unique()
        if self.mode == "r":
            X_copy.loc[:, 'Sex'] = X_copy.Sex.apply(lambda x: random.choice(sex) if np.isnan(x) else x)

        if self.mode == "s":
            counts = X_copy.Sex.value_counts(normalize=True).round(3).sort_index()
            
            sex = counts.index.values
            weights = counts.values
            
            X_copy.loc[:, 'Sex'] = X_copy.Sex.apply(lambda x: random.choices(sex, weights, k=1)[0] if np.isnan(x) else x)
        
        return X_copy

In [89]:
class FareTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        
        return X_copy

In [90]:
class CabinTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        
        return X_copy

In [91]:
class EmbarkedTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        
        return X_copy

In [92]:
X_ = X.copy()

'''# Dummy-обработка категориальных признаков
X_.Sex = X_.Sex.map({"male": 1, "female": 0}, na_action="ignore")
X_.Embarked = X_.Embarked.map({"S": 0, "C": 1, "Q": 2}, na_action="ignore")

labels = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8}
X_.Cabin = X_.Cabin.apply(lambda x: x if pd.isnull(x) else labels[x[0]])


# Обработка пропущенных значений
imputer = SimpleImputer(strategy="most_frequent")
X_ = pd.DataFrame(imputer.fit_transform(X_), columns=X.columns)

scaler = MinMaxScaler()
fare = X_["Fare"].to_numpy()
X_["Fare"] = scaler.fit_transform(fare.reshape(-1, 1))

X_'''

'# Dummy-обработка категориальных признаков\nX_.Sex = X_.Sex.map({"male": 1, "female": 0}, na_action="ignore")\nX_.Embarked = X_.Embarked.map({"S": 0, "C": 1, "Q": 2}, na_action="ignore")\n\nlabels = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8}\nX_.Cabin = X_.Cabin.apply(lambda x: x if pd.isnull(x) else labels[x[0]])\n\n\n# Обработка пропущенных значений\nimputer = SimpleImputer(strategy="most_frequent")\nX_ = pd.DataFrame(imputer.fit_transform(X_), columns=X.columns)\n\nscaler = MinMaxScaler()\nfare = X_["Fare"].to_numpy()\nX_["Fare"] = scaler.fit_transform(fare.reshape(-1, 1))\n\nX_'

In [93]:
estimators = [
    ("age-transformer", AgeTransformer(step_of_partitions=10, mode="s")),
    ("sex-transformer", SexTransformer(mode="s")),
    ("fare-transformer", FareTransformer()),
    ("cabin-transformer", CabinTransformer()),
    ("embarked-transformer", EmbarkedTransformer()),
]

prepare_pipe = Pipeline(estimators)
X_ = prepare_pipe.fit_transform(X_)

X_.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,30.0,1,0,7.25,,S
1,1,0,40.0,1,0,71.2833,C85,C
2,3,0,30.0,0,0,7.925,,S
3,1,0,40.0,1,0,53.1,C123,S
4,3,1,40.0,0,0,8.05,,S
5,3,1,30.0,0,0,8.4583,,Q
6,1,1,60.0,0,0,51.8625,E46,S
7,3,1,10.0,3,1,21.075,,S
8,3,0,30.0,0,2,11.1333,,S
9,2,0,20.0,1,0,30.0708,,C


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.20, random_state=42)

model = SGDClassifier(random_state=42)

'''parametrs = {
    "loss": ["log_loss"],
    "penalty": ["l1"],
    "alpha": [0.0],
    "shuffle": [False],
    "max_iter": [1000],
    "learning_rate": ["adaptive"],
    "eta0": [6.0],
    "early_stopping": [False],
}
grid = GridSearchCV(model, parametrs)

grid.fit(X_train, y_train)
print(grid.best_params_)'''

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
roc_auc_score(y_test, y_pred)

0.8132561132561132