In [None]:
# Import relevant libraries
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Seed both generators: scikit-learn estimators created without an explicit
# random_state draw from NumPy's global RNG, which random.seed() does not touch.
random.seed(42)
np.random.seed(42)

In [None]:
# Load the full table, then separate the prediction target ('Survived')
# from the remaining columns, which become the raw input features.
complete_data = pd.read_csv('titanic.csv')

outcomes = complete_data['Survived']
raw_features = complete_data.drop(columns='Survived')

In [None]:
# Preview the first rows of the feature table (every column except 'Survived').
raw_features.head()

In [None]:
# Preview the first values of the prediction target (the 'Survived' column).
outcomes.head()

In [None]:
# Show each column's dtype and non-null count to spot missing values before encoding.
raw_features.info()

In [None]:
# Remove the 'Name' column before one-hot encoding.
# NOTE(review): presumably because names are near-unique per row and would
# each become their own dummy column - confirm against the dataset.
features_names_excl = raw_features.drop(columns='Name')

In [None]:
# One-hot encode the non-numeric columns, then replace any remaining
# missing values with zeros, in a single chained expression.
features = pd.get_dummies(features_names_excl).fillna(0.0)

In [None]:
# Inspect the encoded table: column dtypes and non-null counts after get_dummies/fillna.
features.info()

In [None]:
# Preview the first rows of the encoded feature table that is fed to the models.
features.head()

In [None]:
# Split the data into training, development and test sets.
# Step 1: hold out 20% of all rows as the final test set.
X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    features,
    outcomes,
    test_size=0.2,
    random_state=42,
)
# Step 2: take 25% of the remaining 80% (i.e. 20% of all rows) as the
# development set, leaving 60% of all rows for training.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev,
    y_train_dev,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Sanity-check the split sizes: training/test features first, then training/test targets.
split_shapes = [part.shape for part in (X_train, X_test, y_train, y_test)]
print(*split_shapes)

In [None]:
# Train a Decision Tree model without imposing restrictions on the depth.
# random_state is pinned so the fitted tree (and the accuracies reported for
# it) are the same on every run; scikit-learn's randomness comes from NumPy,
# not from the stdlib `random` module seeded at the top of the notebook.
default_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
default_model

In [None]:
# Compute the model's accuracy on the training and development set
def test_accuracy(model, X, y):
    """Score a fitted classifier on one data split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix handed to ``model.predict``.
    y : true labels the predictions are compared against.

    Returns
    -------
    float
        Accuracy expressed as a percentage, rounded to one decimal place.
    """
    fraction_correct = accuracy_score(y, model.predict(X))
    return round(fraction_correct * 100., 1)

# Score the unrestricted tree on the data it was trained on and on the
# held-out development split, then print both as percentages.
training_accuracy = test_accuracy(default_model, X_train, y_train)
development_accuracy = test_accuracy(default_model, X_dev, y_dev)

report_lines = (
    ('Training accuracy:  {0:2.1f}', training_accuracy),
    ('Development accuracy:  {0:2.1f}', development_accuracy),
)
for template, value in report_lines:
    print(template.format(value)+'%')

In [None]:
# Plot the Decision Tree.
# The tree drawn here is `default_model`, the unrestricted tree fitted above;
# the name `model` is not bound until a later cell, so referring to it here
# fails on a fresh kernel.
fig, ax = plt.subplots(figsize=(12, 8))
plot_tree(default_model,
          filled=True,
          rounded=True,
          feature_names=list(X_train.columns),  # label split nodes with column names instead of x[i]
          class_names=['Not Survived', 'Survived'],
          ax=ax)
ax.set_title("Decision Tree for Titanic Survival Prediction")
plt.show()


In [None]:
# Report the accuracy of the final model on the test data.
# NOTE(review): these hyperparameters are hard-coded ahead of the search loop
# that appears further down the notebook - confirm they match its best row.
final_model = DecisionTreeClassifier(max_depth=6,
                                     min_samples_leaf=2,
                                     min_samples_split=2,
                                     random_state=42).fit(X_train, y_train)  # pinned for reproducible results

# Keep the score under its own name: assigning it to `test_accuracy` would
# replace the helper function with a float and break any later call to it.
final_test_accuracy = test_accuracy(final_model, X_test, y_test)
print('Test accuracy:  {0:2.1f}'.format(final_test_accuracy)+'%')

In [None]:
def test_accuracy(model, X, y):
    """Score a fitted classifier on one data split.

    (Re)declared in this cell so the search below works even if the name
    `test_accuracy` was rebound to something else earlier in the session.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix handed to ``model.predict``.
    y : true labels the predictions are compared against.

    Returns
    -------
    float
        Accuracy expressed as a percentage, rounded to one decimal place.
    """
    y_pred = model.predict(X)
    # Return a plain float (not a 1-tuple) so callers can use the value directly.
    return round(accuracy_score(y, y_pred) * 100., 1)


# Grid search over max_depth [2-10], min_samples_leaf [1-14] and
# min_samples_split [2-14], scoring every combination on the training and
# development sets.
result_columns = ['max_depth',
                  'min_samples_leaf',
                  'min_samples_split',
                  'n_nodes',
                  'training_accuracy',
                  'development_accuracy']

# Collect one dict per fitted model and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# is quadratic.
search_records = []

for max_depth_i in range(2, 11):
    # Track the best model (by development accuracy) seen at this depth so a
    # single tree per max_depth is plotted, rather than one figure for each
    # of the 9 * 14 * 13 = 1638 hyperparameter combinations.
    best_model_for_depth = None
    best_dev_for_depth = None

    for min_samples_leaf_i in range(1, 15):
        for min_samples_split_i in range(2, 15):
            model = DecisionTreeClassifier(max_depth=max_depth_i,
                                           min_samples_leaf=min_samples_leaf_i,
                                           min_samples_split=min_samples_split_i,
                                           random_state=42).fit(X_train, y_train)  # pinned for reproducible results

            # Calculate training and development accuracies separately
            training_accuracy = test_accuracy(model, X_train, y_train)
            development_accuracy = test_accuracy(model, X_dev, y_dev)
            n_nodes = model.tree_.node_count

            search_records.append({'max_depth': max_depth_i,
                                   'min_samples_leaf': min_samples_leaf_i,
                                   'min_samples_split': min_samples_split_i,
                                   'n_nodes': n_nodes,
                                   'training_accuracy': training_accuracy,
                                   'development_accuracy': development_accuracy})

            if best_model_for_depth is None or development_accuracy > best_dev_for_depth:
                best_model_for_depth = model
                best_dev_for_depth = development_accuracy

    # Plot the decision tree for each max_depth (best development accuracy at that depth)
    fig, ax = plt.subplots(figsize=(10, 6))
    plot_tree(best_model_for_depth, filled=True, rounded=True,
              class_names=['Not Survived', 'Survived'], ax=ax)
    ax.set_title(f"Decision Tree for Titanic Survival Prediction (Max Depth {max_depth_i})")
    plt.show()
    plt.close(fig)  # release the figure so repeated plotting does not accumulate memory

results = pd.DataFrame(search_records, columns=result_columns)

# Sorting the results based on development accuracy; a stable sort keeps tied
# rows in search order so the ranking is deterministic.
results = (results
           .sort_values(by='development_accuracy', ascending=False, kind='mergesort')
           .reset_index(drop=True))

results.head()

In [None]:
# Accuracy (as a fraction) on each split, one entry per max_depth value tried.
training_accuracies, development_accuracies = [], []

# Fit one tree per max_depth and record how well it scores on both splits.
depth_values = range(2, 11)
for max_depth_i in depth_values:
    # Train the model (fit returns the estimator itself, so it can be chained)
    model = DecisionTreeClassifier(max_depth=max_depth_i, random_state=42).fit(X_train, y_train)

    # Score on the data the tree has seen and on the held-out development data
    training_accuracy = accuracy_score(y_train, model.predict(X_train))
    development_accuracy = accuracy_score(y_dev, model.predict(X_dev))

    training_accuracies.append(training_accuracy)
    development_accuracies.append(development_accuracy)

# Draw both accuracy curves against max_depth on a single set of axes.
fig, ax = plt.subplots(figsize=(8, 6))
curves = (
    (training_accuracies, 'Training Accuracy', 'blue'),
    (development_accuracies, 'Development Accuracy', 'orange'),
)
for accuracies, curve_label, curve_color in curves:
    ax.plot(range(2, 11), accuracies, label=curve_label, color=curve_color)
ax.set(xlabel='Max Depth',
       ylabel='Accuracy',
       title='Training and Development Accuracies')
ax.legend()
plt.show()


The plot is a line graph showing how the training and development accuracies change as the max_depth of the decision tree varies. It gives a visual picture of the model's performance at different levels of complexity (max_depth). The graph lets us observe trends and patterns in the accuracy scores, helps identify the max_depth value that best balances model complexity against generalization performance, and reveals whether the model is overfitting or underfitting. In this case, the development accuracy stays low or decreases as max_depth increases while the training accuracy remains high, which indicates overfitting: the model is memorizing the training data and is not able to generalize to new data.