This one is identical except for the pipeline. This time, instead of scaling the age, we're going to map it onto age ranges (young, adult, old) and one-hot encode those ranges.

In [14]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
# Step 1: Read the Titanic passenger table from the local CSV file
titanic_data = pd.read_csv(filepath_or_buffer='titanic.csv')

In [3]:
# Step 2: Preprocess the data
# Drop columns that are not used as features. Missing values are NOT handled
# here; imputation is done later by the SimpleImputer steps.
# errors='ignore' keeps this cell re-runnable: titanic_data is reassigned to
# the reduced frame, so a second run would otherwise raise KeyError because
# the columns are already gone.
titanic_data = titanic_data.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    errors='ignore',
)

In [4]:
# Step 3: Split the dataset into training and testing sets
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs. train_test_split lives in sklearn.model_selection
# and is imported in the imports cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Column names routed to the numeric and the categorical preprocessing
# branches, respectively.
numerical_features = ['Fare']
categorical_features = [
    'Sex',
    'Embarked',
    'Pclass',
]

In [6]:
# Numeric branch: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode, dropping the first level of each feature.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

In [15]:
# Custom transformer for 'Age' feature
class AgeTransformer(BaseEstimator, TransformerMixin):
    """Bucket the numeric 'Age' column into labelled age ranges.

    Parameters
    ----------
    bins : sequence of numbers, default (0, 18, 60, 100)
        Bin edges handed to ``pd.cut``. With ``pd.cut``'s default settings the
        default edges give the right-closed intervals (0, 18], (18, 60] and
        (60, 100].
    labels : sequence of str, default ('young', 'adult', 'old')
        One label per interval, in the same order as the intervals.

    Notes
    -----
    The transformer is stateless: ``fit`` learns nothing. Ages that are
    missing, or that fall outside the outermost edges, come out of ``pd.cut``
    as NaN and are passed on unchanged to the next pipeline step.
    """

    def __init__(self, bins=(0, 18, 60, 100), labels=('young', 'adult', 'old')):
        # Stored unmodified under the parameter names so that
        # BaseEstimator.get_params / set_params and sklearn.clone work.
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn from the data."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame holding the age category of each row.

        ``X`` must support ``X['Age']`` (e.g. a DataFrame with an 'Age' column).
        """
        age_categories = pd.cut(
            X['Age'],
            bins=list(self.bins),
            labels=list(self.labels),
        )
        return age_categories.to_frame()

In [22]:
# Step 5: Create the pipeline
# Each entry is (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
    # 'Age' is first mapped onto age ranges, then the ranges are one-hot encoded.
    (
        'age',
        Pipeline(steps=[
            ('age_transformer', AgeTransformer()),
            ('encoder', OneHotEncoder(drop='first')),
        ]),
        ['Age'],
    ),
])

In [23]:
# Full model: the preprocessing step followed by a seeded random forest.
classifier = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])

In [24]:
# Step 6: Fit the preprocessing steps and the model on the training split
classifier.fit(X=X_train, y=y_train)

In [25]:
# Step 7: Score the fitted pipeline on the held-out testing set
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [26]:
# Print the accuracy: the score accuracy_score returned for the test-set
# predictions (y_pred) against the true labels (y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8100558659217877


In [None]:
# Step 8: Compare the performance of different models and hyperparameter settings
# You can repeat steps 5-7 with different classifiers and adjust their hyperparameters.
# If you want to grid search, look at grid_search_code.ipynb.