In [50]:
import pandas as pd
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")

In [51]:
X, y = datasets.fetch_openml(name='titanic', version=1, return_X_y=True)

In [52]:
X.drop(['ticket', 'cabin', 'body', 'boat', 'home.dest'], axis=1, inplace=True)

In [53]:
print(f'Input features shape: {X.shape}')
print(f'Labels shape: {y.shape}')

Input features shape: (1309, 8)
Labels shape: (1309,)


In [54]:
# new features: family_size and is_alone
X['family_size'] = X['parch'] + X['sibsp']
X.drop(['parch', 'sibsp'], axis=1, inplace=True)
X['is_alone'] = 1
X['is_alone'].loc[X['family_size'] > 1] = 0

X.head()

Unnamed: 0,pclass,name,sex,age,fare,embarked,family_size,is_alone
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,211.3375,S,0,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,151.55,S,3,0
2,1,"Allison, Miss. Helen Loraine",female,2.0,151.55,S,3,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,151.55,S,3,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,151.55,S,3,0


In [55]:
X['title'] =  X['name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
X.drop(["name"], axis=1, inplace=True)
X.head()

Unnamed: 0,pclass,sex,age,fare,embarked,family_size,is_alone,title
0,1,female,29.0,211.3375,S,0,1,Miss
1,1,male,0.9167,151.55,S,3,0,Master
2,1,female,2.0,151.55,S,3,0,Miss
3,1,male,30.0,151.55,S,3,0,Mr
4,1,female,25.0,151.55,S,3,0,Mrs


In [56]:
rare_titles = (X['title'].value_counts() < 10)
rare_titles

title
Mr              False
Miss            False
Mrs             False
Master          False
Rev              True
Dr               True
Col              True
Mlle             True
Ms               True
Major            True
Capt             True
Sir              True
Dona             True
Jonkheer         True
the Countess     True
Don              True
Mme              True
Lady             True
Name: count, dtype: bool

In [57]:
X.title.loc[X.title == 'Miss'] = 'Mrs'
X['title'] = X.title.apply(lambda x: 'rare' if rare_titles[x] else x)

In [27]:
# cat_cols = ['embarked', 'sex', 'pclass', 'title', 'is_alone']
# cat_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
#     ('pca', PCA(n_components=10))
# ])

In [28]:
# num_cols = ['age', 'fare', 'family_size']
# num_transformer = Pipeline(steps=[
#     ('imputer', KNNImputer(n_neighbors=5)),
#     ('scaler', RobustScaler())
# ])

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

# Selecting features
cat_features = X_train.select_dtypes(include="object")
potential_binary_features = cat_features.nunique() == 2

binary_cat_features = cat_features[
    potential_binary_features[potential_binary_features].index
]
other_cat_features = cat_features[
    potential_binary_features[~potential_binary_features].index
]
num_features = X_train.select_dtypes(["float"])

# Defining the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        (
            "binary",
            OneHotEncoder(),
            binary_cat_features.columns.tolist(),
        ),
        (
            "cat",
            OneHotEncoder(),
            other_cat_features.columns.tolist(),
        ),
        ("num", StandardScaler(), num_features.columns.tolist()),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# Fit the preprocessor on the training data and transform both the training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


In [29]:
# preprocessor = ColumnTransformer(transformers=[
#                 ('num', num_transformer, num_cols),
#                 ('cat', cat_transformer, cat_cols)
#             ])
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)
# 
# X_train_preprocessed = preprocessor.fit_transform(X_train)
# X_test_preprocessed = preprocessor.transform(X_test)

In [58]:
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', num_transformer, num_cols),
#         ('cat', cat_transformer, cat_cols)
#     ])

In [60]:
# clf = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', RandomForestClassifier())])
# # X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)
# cross_val_score(clf, X_train, y_train, cv=5, scoring="accuracy").mean()

0.7802825244930508

In [59]:

# Define the classifier
classifier = RandomForestClassifier()

# Fit the classifier on the preprocessed training data
classifier.fit(X_train_preprocessed, y_train)

# Evaluate the classifier using cross-validation on the preprocessed training data
cv_score = cross_val_score(classifier, X_train_preprocessed, y_train, cv=5, scoring="accuracy").mean()
print(f"Cross-validation Accuracy: {cv_score:.4f}")

Cross-validation Accuracy: 0.7717


https://www.kaggle.com/code/saumandas/neural-networks-in-tensorflow-with-titanic