In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import time

In [3]:
# Load data_week11.csv, then separate the 'outcome' target column from the
# remaining columns, which serve as the feature matrix.
data = pd.read_csv("data_week11.csv")
y = data['outcome']
X = data.drop(columns='outcome')

# Per-experiment metrics, filled in by the experiment loop further down.
results = {}

def train_evaluate(data_size, hidden_layer_sizes, random_state=42):
    """Train an MLP on a subsample of the data and report errors and fit time.

    Reads the module-level features ``X`` and target ``y``. A random sample
    of ``data_size`` rows is drawn, split 80/20 into training and validation
    parts, standardized, and used to fit an ``MLPClassifier``.

    Args:
        data_size: Number of rows to sample. When the dataset does not have
            more rows than this, the whole dataset is used instead.
        hidden_layer_sizes: Tuple of hidden-layer widths, passed straight to
            ``MLPClassifier``.
        random_state: Seed shared by both splits and the classifier so that
            repeated runs are reproducible.

    Returns:
        dict with three keys:
            ``train_error``: 1 - accuracy on the training part (4 decimals).
            ``val_error``: 1 - accuracy on the validation part (4 decimals).
            ``execution_time``: seconds spent inside ``fit`` (2 decimals).
    """
    # Subsample only when the dataset is larger than requested;
    # train_test_split is used here purely as a seeded random sampler.
    if len(X) > data_size:
        X_sample, _, y_sample, _ = train_test_split(
            X, y, train_size=data_size, random_state=random_state
        )
    else:
        X_sample, y_sample = X, y

    X_train, X_val, y_train, y_val = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=random_state
    )

    # Fit the scaler on the training part only so that no validation
    # statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # NOTE: per the scikit-learn docs, early_stopping=True makes the model
    # hold out a fraction of the training rows internally to decide when to
    # stop; the train error below is still computed over all training rows.
    model = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=300,
        early_stopping=True,
        random_state=random_state,
    )

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be distorted by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    execution_time = time.perf_counter() - start_time

    train_predictions = model.predict(X_train_scaled)
    val_predictions = model.predict(X_val_scaled)

    train_error = 1 - accuracy_score(y_train, train_predictions)
    val_error = 1 - accuracy_score(y_val, val_predictions)

    return {
        "train_error": round(train_error, 4),
        "val_error": round(val_error, 4),
        "execution_time": round(execution_time, 2),
    }

# Experiment grid: each network shape is tried on every sample size, with all
# single-layer runs listed before the two-layer runs.
configs = [
    (data_size, hidden_layers)
    for hidden_layers in ((4,), (4, 4))
    for data_size in (1000, 10000, 100000)
]

# Run every experiment, store its metrics, and report them immediately.
for data_size, hidden_layers in configs:
    key = f"{data_size}_{hidden_layers}"
    metrics = train_evaluate(data_size, hidden_layers)
    results[key] = metrics
    print(f"Completed: {data_size} samples, {hidden_layers} hidden layers")
    print(f"Train error: {metrics['train_error']}")
    print(f"Validation error: {metrics['val_error']}")
    print(f"Execution time: {metrics['execution_time']} seconds")
    print("-" * 40)

# Summary table with one row per experiment, in the same order as the grid.
print("\nResults Table:")
print("Data size | Configuration | Training error | Validation error | Time of execution")
print("-" * 80)

for data_size, hidden_layers in configs:
    key = f"{data_size}_{hidden_layers}"
    row = results[key]
    # The label depends only on the number of hidden layers.
    if len(hidden_layers) == 1:
        config_name = "1 hidden layer 4 nodes"
    else:
        config_name = "2 hidden layers of 4 nodes each"
    print(f"{data_size} | {config_name} | {row['train_error']} | {row['val_error']} | {row['execution_time']} sec")

Completed: 1000 samples, (4,) hidden layers
Train error: 0.2488
Validation error: 0.2
Execution time: 0.13 seconds
----------------------------------------
Completed: 10000 samples, (4,) hidden layers
Train error: 0.009
Validation error: 0.0125
Execution time: 0.57 seconds
----------------------------------------
Completed: 100000 samples, (4,) hidden layers
Train error: 0.0005
Validation error: 0.0006
Execution time: 3.56 seconds
----------------------------------------
Completed: 1000 samples, (4, 4) hidden layers
Train error: 0.2488
Validation error: 0.195
Execution time: 0.03 seconds
----------------------------------------
Completed: 10000 samples, (4, 4) hidden layers
Train error: 0.2385
Validation error: 0.252
Execution time: 0.14 seconds
----------------------------------------
Completed: 100000 samples, (4, 4) hidden layers
Train error: 0.001
Validation error: 0.0012
Execution time: 2.04 seconds
----------------------------------------

Results Table:
Data size | Configuration