In [1]:
import numpy as np
import pandas as pd

# Record the library versions this notebook was executed with.
print(np.__version__)
print(pd.__version__)

1.19.5
1.1.5


In [2]:
# Passenger table, fetched over HTTP from GitHub.
TITANIC_CSV_URL = 'https://github.com/mbburova/MDS/raw/main/titanic.csv'
titanic = pd.read_csv(TITANIC_CSV_URL)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
import re

# A title is the first capitalised word immediately followed by a period,
# e.g. "Braund, Mr. Owen Harris" -> "Mr". Raw string: "\." is not a valid
# escape in an ordinary string literal. Compiled once instead of once per row.
TITLE_RE = re.compile(r"([A-Z][a-z]+)\.")


def extract_title(name):
    """Return the title found in a passenger name.

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str or None
        The first capitalised word that is followed by a period (period
        stripped), or None when `name` is not a string or contains no such word.
    """
    if not isinstance(name, str):
        return None
    match = TITLE_RE.search(name)
    return match.group(1) if match else None


titanic['Title'] = titanic['Name'].apply(extract_title)

In [4]:
# Titles that occur more than 6 times, most frequent first.
title_counts = titanic['Title'].value_counts(dropna=False)
proper_titles = title_counts.loc[title_counts > 6].index.tolist()
proper_titles

['Mr', 'Miss', 'Mrs', 'Master', 'Dr']

In [5]:
# Every title outside `proper_titles` is replaced by the single label "Other".
titanic['Title'] = [
    title if title in proper_titles else "Other"
    for title in titanic['Title']
]

In [6]:
# Columns stored with the generic `object` dtype are treated as categorical.
categ_columns = [
    column for column, dtype in titanic.dtypes.items() if dtype == "object"
]
print("Number of categorical features =", len(categ_columns))
print(categ_columns)

Number of categorical features = 6
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']


In [7]:
# Count distinct values (missing included) once per categorical column, then
# split: more than 100 distinct values -> drop, otherwise keep as categorical.
distinct_counts = {col: titanic[col].unique().size for col in categ_columns}
columns_to_drop = [col for col, count in distinct_counts.items() if count > 100]
categ_columns = [col for col, count in distinct_counts.items() if count <= 100]

print(columns_to_drop)
print(categ_columns)

['Name', 'Ticket', 'Cabin']
['Sex', 'Embarked', 'Title']


In [8]:
# Remove the high-cardinality columns from the frame itself (mutates `titanic`).
titanic.drop(columns=columns_to_drop, inplace=True)

print("Number of categorical features =", len(categ_columns))

Number of categorical features = 3


In [9]:
# These three columns are removed from the frame in place.
unused_columns = ["PassengerId", "SibSp", "Parch"]
titanic.drop(columns=unused_columns, inplace=True)
assert 'PassengerId' not in titanic.columns
titanic.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title
0,0,3,male,22.0,7.25,S,Mr
1,1,1,female,38.0,71.2833,C,Mrs


In [10]:
# Column groups by feature type.
numeric_cols = ["Age", "Fare"]
ordinal_cols = ["Pclass"]

print('Ordinal columns are: ', ordinal_cols)
print('Numeric columns are: ', numeric_cols)

Ordinal columns are:  ['Pclass']
Numeric columns are:  ['Age', 'Fare']


In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
class MeanGroupImputer(BaseEstimator, TransformerMixin):
    '''
    Impute missing values in a pd.DataFrame with the per-group mean of each column.

    ``fit`` groups the frame by ``group_col`` and stores the mean of every numeric
    column per group. ``transform`` fills the missing values of each row with the
    means of the group that row belongs to, then drops the grouping column.

    Parameters
    ----------
    group_col : str
        Name of the column whose values define the groups.

    Attributes
    ----------
    mapping : pd.DataFrame
        Set by ``fit``. One row per group (indexed by group value) and one column
        per numeric column of the fitted frame, holding the group means.

    Notes
    -----
    Rows whose group value was not seen during ``fit`` are left untouched, so
    their missing values stay missing.
    '''
    def __init__(self, group_col):
        assert isinstance(group_col, str), 'group_col should be a string'

        self.group_col = group_col

    def fit(self, X, y=None):
        '''
        Learn the per-group column means.

        Parameters
        ----------
        X : pd.DataFrame
            Frame containing ``group_col`` and the columns to impute.
        y : ignored

        Returns
        -------
        self
        '''
        assert X[self.group_col].isna().sum() == 0, 'There are missing values in the group_col'

        # numeric_only=True keeps the result identical on older pandas (where
        # non-numeric columns were silently dropped) and avoids a TypeError on
        # pandas versions that no longer drop them.
        self.mapping = X.groupby(self.group_col).mean(numeric_only=True)
        return self

    def transform(self, X, y=None):
        '''
        Fill missing values with the fitted group means and drop ``group_col``.

        Parameters
        ----------
        X : pd.DataFrame
            Frame containing ``group_col`` and every column present in ``mapping``.
        y : ignored

        Returns
        -------
        X : array-like
            Values of the imputed frame without the grouping column.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        '''
        # Local import: keeps this class self-contained within its cell.
        from sklearn.exceptions import NotFittedError

        # Reading self.mapping directly would fail with a bare AttributeError
        # before any check could run. NotFittedError subclasses both
        # AttributeError and ValueError.
        if getattr(self, 'mapping', None) is None:
            raise NotFittedError('MeanGroupImputer must be fitted before calling transform')

        X = X.copy()
        group_values = X[self.group_col]

        for group, means in self.mapping.iterrows():
            # Every fitted group is processed, including falsy keys such as
            # 0, '' or False.
            in_group = group_values == group
            if not in_group.any():
                continue
            value_cols = means.index
            X.loc[in_group, value_cols] = (
                X.loc[in_group, value_cols].fillna(value=means.to_dict())
            )

        # The grouping column was only needed to look up the means.
        return X.drop(self.group_col, axis=1).values

In [12]:
from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

# Age: impute from the mean of the passenger's Title group, then standardise.
age_pipe = make_pipeline(
    MeanGroupImputer(group_col='Title'),
    StandardScaler(),
)
# Fare: standardise only.
fare_pipe = make_pipeline(StandardScaler())
# Categorical: fill gaps with the most frequent value, then one-hot encode.
categ_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(),
)


# Route each pipeline to its input columns; every other column is dropped.
column_transformer = ColumnTransformer(
    transformers=[
        ('age', age_pipe, ['Age', 'Title']),
        ('fare', fare_pipe, ['Fare']),
        ('all_categ', categ_pipe, ['Sex']),
    ],
    remainder='drop',
)

In [13]:
# Display the frame as it stands after the column drops above.
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title
0,0,3,male,22.0,7.2500,S,Mr
1,1,1,female,38.0,71.2833,C,Mrs
2,1,3,female,26.0,7.9250,S,Miss
3,1,1,female,35.0,53.1000,S,Mrs
4,0,3,male,35.0,8.0500,S,Mr
...,...,...,...,...,...,...,...
886,0,2,male,27.0,13.0000,S,Other
887,1,1,female,19.0,30.0000,S,Miss
888,0,3,female,,23.4500,S,Miss
889,1,1,male,26.0,30.0000,C,Mr


In [14]:
# NOTE(review): the transformer is fitted on the full dataset here, before the
# train/test split performed later, so the imputation means and scaling
# statistics are computed with the held-out rows included.
test_titanic = column_transformer.fit_transform(titanic)

# concat(axis=1) aligns on index labels. The transformed array has no index of
# its own, so give it titanic's index explicitly; otherwise a non-default index
# on `titanic` would misalign features and labels and introduce NaN rows.
df = pd.concat(
    [pd.DataFrame(test_titanic, index=titanic.index), titanic.Survived],
    axis=1,
)
df

Unnamed: 0,0,1,2,3,Survived
0,-0.584387,-0.502445,0.0,1.0,0
1,0.621365,0.786845,1.0,0.0,1
2,-0.282949,-0.488854,1.0,0.0,1
3,0.395286,0.420730,1.0,0.0,1
4,0.395286,-0.486337,0.0,1.0,0
...,...,...,...,...,...
886,-0.207590,-0.386671,0.0,1.0,0
887,-0.810466,-0.044381,1.0,0.0,1
888,-0.601421,-0.176263,1.0,0.0,0
889,-0.282949,-0.044381,0.0,1.0,1


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
tr, te = train_test_split(df, test_size=0.2, random_state=42)

# Separate the label column from the feature columns for each part.
X_train, y_train = tr.drop(columns='Survived'), tr['Survived']
X_test, y_test = te.drop(columns='Survived'), te['Survived']

In [16]:
from sklearn.svm import SVC
import pickle


# fit() returns the estimator itself, so `model` and `svm` name the same
# fitted classifier.
svm = SVC(C=1.3)
svm.fit(X_train, y_train)
model = svm

# with open('model', 'wb') as f:
#     pickle.dump(model,f)

In [17]:
# with open('model', 'rb') as f:
#     mod = pickle.load(f)
    
# mod.predict([[0.2334, 0.9292, 0.0, 1.0]])

In [18]:
# fare_mean = 32.204207968574636
# fare_std = 49.6934285971809
# fare_min = 0.0
# fare_max = 512.329200

# age_mean = 29.754659
# age_std = 13.277179
# age_min = 0.42
# age_max = 80