In [91]:
import pandas as pd # to work with dataframes
import numpy as np # to work with arrays
import re # to work with regular expressions
from sklearn.pipeline import Pipeline # to build pipelines
from imputer import MeanGroupImputer # to impute missing values
from sklearn.pipeline import make_pipeline # to build pipelines
from sklearn.compose import ColumnTransformer # to build column transformers
from sklearn.preprocessing import StandardScaler # to standardize data
from sklearn.svm import SVC # to build support vector machines
import joblib # to save the pipeline

In [92]:
# load the passenger table (titanic.csv) directly from the GitHub repository
DATA_URL = 'https://github.com/mbburova/MDS/raw/main/titanic.csv'
titanic = pd.read_csv(DATA_URL)

In [93]:
# extract the title from the name: the first capitalised word that is directly
# followed by a period (the period itself is not kept, e.g. "Mr." -> "Mr").
# A raw string is used so that `\.` is a regex escape rather than an invalid
# string escape, and names without a match yield NaN instead of raising.
titles = titanic['Name'].str.extract(r'([A-Z][a-z]+)\.', expand=False)
# count the passengers per title (a Series indexed by title, most frequent
# first); missing titles are not counted
title_counts = titles.value_counts()
# create a list of the titles carried by more than 6 passengers
proper_titles = list(title_counts[title_counts > 6].index)
# replace every other title (6 or fewer passengers, or missing) with "Other"
titanic['Title'] = titles.where(titles.isin(proper_titles), 'Other')

In [94]:
# Imputation step for the 'Age' column. MeanGroupImputer is configured with
# 'Title' as its grouping column (presumably it fills missing ages with the
# mean age of the passenger's title group -- confirm against imputer.py).
title_age_imputer = MeanGroupImputer(group_col='Title')
age_pipe = make_pipeline(title_age_imputer)

# Only 'Age' and 'Title' are handed to the imputing pipeline; every other
# column is dropped from the transformer's output.
imp = ColumnTransformer(
    transformers=[('imputing', age_pipe, ['Age', 'Title'])],
    remainder='drop',
)

In [95]:
# run the imputing transformer over the frame; the result is an array
age = imp.fit_transform(titanic)

# fare as a single-column array so it can be stacked next to the other features
fare = titanic['Fare'].values.reshape(-1, 1)

# encode sex as a single column: 1 for 'male', 0 for anything else
is_male = titanic['Sex'] == 'male'
sex = np.where(is_male, 1, 0).reshape(-1, 1)

# feature matrix for the model, columns in the order: imputer output, fare, sex
data = np.concatenate([age, fare, sex], axis=1)

In [96]:
# estimator: standardise the features, then classify with a support vector
# machine (C=1.3, all other SVC settings left at their defaults)
scaling_step = ('scaling', StandardScaler())
classifier_step = ('model', SVC(C=1.3))
model = Pipeline(steps=[scaling_step, classifier_step])

In [97]:
# train the pipeline on the assembled feature matrix against the 'Survived' labels
fitted_model = model.fit(data, titanic['Survived'])
# persist the fitted pipeline; the value returned by joblib.dump is shown as
# the cell output
joblib.dump(fitted_model, './model.pkl')

['./model.pkl']