In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_data = pd.read_csv(url)

In [3]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Select the features and target variable
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'
X = titanic_data[features]
y = titanic_data[target]

In [5]:
X

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
0,3,male,22.0,7.2500,S
1,1,female,38.0,71.2833,C
2,3,female,26.0,7.9250,S
3,1,female,35.0,53.1000,S
4,3,male,35.0,8.0500,S
...,...,...,...,...,...
886,2,male,27.0,13.0000,S
887,1,female,19.0,30.0000,S
888,3,female,,23.4500,S
889,1,male,26.0,30.0000,C


In [6]:
type(X)

pandas.core.frame.DataFrame

In [7]:
type(X['Age'])

pandas.core.series.Series

In [8]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [9]:
type(y)

pandas.core.series.Series

In [10]:
# Preprocess the data
numeric_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

In [11]:
numeric_features

['Age', 'Fare']

In [12]:
categorical_features

['Pclass', 'Sex', 'Embarked']

In [13]:
X[categorical_features]

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S
...,...,...,...
886,2,male,S
887,1,female,S
888,3,female,S
889,1,male,C


In [14]:
X[categorical_features].nunique()

Pclass      3
Sex         2
Embarked    3
dtype: int64

In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [16]:
X = preprocessor.fit_transform(X)
X

array([[22.        ,  7.25      ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [38.        , 71.2833    ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [26.        ,  7.925     ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [29.69911765, 23.45      ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [26.        , 30.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [32.        ,  7.75      ,  0.        , ...,  1.        ,
         0.        ,  0.        ]])

In [17]:
X.shape

(891, 11)

In [18]:
X[0]

array([22.  ,  7.25,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  0.  ,
        1.  ,  0.  ])

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Decision Tree

In [20]:
# Train the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [21]:
# Make predictions on the test set
y_pred = clf.predict(X_train)

# Evaluate the model
accuracy = accuracy_score(y_train, y_pred)
print("Training Accuracy:", accuracy)

Training Accuracy: 0.9789325842696629


In [22]:
# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.770949720670391


### SKLearn MLP

In [23]:
from sklearn.neural_network import MLPClassifier
clf_mlp = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)

In [24]:
y_pred = clf_mlp.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.7988826815642458


In [25]:
test_scores, train_scores = [], []
for i in [100, 200, 300]:
    clf_mlp = MLPClassifier(random_state=1, hidden_layer_sizes = i,
                            max_iter=300).fit(X_train, y_train)
    # Create predictions for the X_train and X_test datasets.
    train_predictions = clf_mlp.predict(X_train)
    test_predictions = clf_mlp.predict(X_test)
    # Append the accuracy score for the test and train predictions.
    train_scores.append(round(accuracy_score(y_train, train_predictions), 2))
    test_scores.append(round(accuracy_score(y_test, test_predictions), 2))
# Print the train and test scores.
print("The training scores were: {}".format(train_scores))
print("The testing scores were: {}".format(test_scores))

The training scores were: [0.83, 0.84, 0.83]
The testing scores were: [0.8, 0.81, 0.8]


### Keras NN 1


***keras.models.Sequential***
Sequential models require that each layer has weights or connections *only* to the one layer coming directly after it in the network diagram.


***input_shape=(X_train.shape[1],)***
*   input_shape - specifies the number of columns.  This is needed for the input layer, as each input column is represented by a node.


***keras.layers.Dense***
- each one of these lines represents a layer in the neural network
- 64 represents the number of nodes
- last layer has one node

***activation='relu'***
- the activation function inside the node (take all inputs and generate an output based on the selected activation function)


***activation='sigmoid'***
- the activation function which in this case for the output layer resulting in a 0 or 1 output



In [26]:
from tensorflow import keras

# Define the model architecture
# 1 input, 1 intermediate and 1 output layer
model = keras.models.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

***optimizer='adam'***
- the optimizer to guide the model in searching for the optimal solution


***loss='binary_crossentropy'***
- loss function (penalty for error)


In [27]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [28]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping_monitor = EarlyStopping(patience=2)

In [29]:
# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32,
          callbacks = [early_stopping_monitor], verbose=1)

Epoch 1/20



Epoch 2/20



Epoch 3/20



Epoch 4/20



Epoch 5/20



Epoch 6/20



Epoch 7/20



Epoch 8/20



Epoch 9/20



Epoch 10/20



Epoch 11/20



Epoch 12/20



Epoch 13/20



Epoch 14/20



Epoch 15/20



Epoch 16/20



Epoch 17/20



Epoch 18/20



Epoch 19/20



Epoch 20/20





<keras.src.callbacks.History at 0x7a77c8903610>

In [30]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

Test loss: 0.4604
Test accuracy: 0.7821


### Keras NN 2 - a more complex model

In [31]:
from tensorflow import keras

In [32]:
from keras.models import Sequential
from keras.layers import Dense

# Create the model
model = Sequential()

# Add the input layer
keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),

# Add 50 intermediate layers
for _ in range(50):
    model.add(Dense(units=100, activation='relu'))

# Add the output layer
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
early_stopping_monitor = EarlyStopping(patience=4)
model.fit(X_train, y_train, epochs=20, batch_size=32,
          callbacks = [early_stopping_monitor], verbose=1)

Epoch 1/20



Epoch 2/20



Epoch 3/20



Epoch 4/20



Epoch 5/20



Epoch 6/20



Epoch 7/20



Epoch 8/20



Epoch 9/20



Epoch 10/20



Epoch 11/20



Epoch 12/20



Epoch 13/20



Epoch 14/20



Epoch 15/20



Epoch 16/20



Epoch 17/20



Epoch 18/20



Epoch 19/20



Epoch 20/20





<keras.src.callbacks.History at 0x7a77b6768f70>

In [34]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

Test loss: 0.6794
Test accuracy: 0.5866
