In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier

# Number of cross-validation folds (K); also used later to number the folds
# and to average the per-fold metrics.
n_splits = 4

# Shuffled K-Fold splitter. The fixed random_state keeps fold membership
# identical from one run of the notebook to the next.
kf = KFold(shuffle=True, random_state=42, n_splits=n_splits)

# Load the train and test feature tables, each indexed by its 'id' column.
data_train, data_validation = (
    pd.read_csv(csv_path).set_index('id')
    for csv_path in ('./data/features.train.csv', './data/features.test.csv')
)

# Stack both tables row-wise; cross-validation below runs over this union.
data_combined = pd.concat([data_train, data_validation])

# NOTE: despite their names, X_train / y_train hold the COMBINED
# (train + test) rows, not just the rows from features.train.csv.
y_train = data_combined['label']
X_train = data_combined.drop(columns='label')

# Features and labels of the test file alone. These rows are also part of
# X_train / y_train above, so they are not an independent hold-out set for
# any model fitted on the combined data.
y_validation = data_validation['label']
X_validation = data_validation.drop(columns='label')


# Multi-layer perceptron trained with plain mini-batch SGD: constant learning
# rate, no momentum, logistic (sigmoid) activations.
model = MLPClassifier(
    # Architecture: three hidden layers of 17, 8 and 17 units.
    hidden_layer_sizes=(17, 8, 17),
    activation='logistic',
    # Optimiser settings.
    solver='sgd',
    learning_rate='constant',
    learning_rate_init=0.12,
    momentum=0.0,
    nesterovs_momentum=False,
    batch_size=36,
    max_iter=1000,
    shuffle=True,
    # L2 regularisation strength.
    alpha=0.0029,
    # NOTE(review): per the scikit-learn docs validation_fraction is only
    # used when early_stopping=True, which is not set here, so this value
    # appears to have no effect as written - confirm intent.
    validation_fraction=0.2,
    # Fixed seed for reproducibility.
    random_state=0,
)

# Per-fold metric accumulators, filled in fold order.
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# K-Fold cross-validation: fit on K-1 folds, score on the held-out fold.
for fit_rows, holdout_rows in kf.split(X_train):
    # kf.split yields positional row indices, hence .iloc.
    X_train_fold, y_train_fold = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
    X_test_fold, y_test_fold = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

    # The same estimator object is refit on every fold; fit() returns the
    # estimator itself, so the prediction call can be chained onto it.
    y_pred_fold = model.fit(X_train_fold, y_train_fold).predict(X_test_fold)

    # Score the held-out fold. precision/recall/F1 are called with their
    # default averaging - presumably 'label' is binary; verify against the data.
    accuracy_scores.append(accuracy_score(y_test_fold, y_pred_fold))
    precision_scores.append(precision_score(y_test_fold, y_pred_fold))
    recall_scores.append(recall_score(y_test_fold, y_pred_fold))
    f1_scores.append(f1_score(y_test_fold, y_pred_fold))

# Report the metrics of each fold on its own line, folds numbered from 1.
fold_numbers = range(1, n_splits + 1)
for fold, accuracy, precision, recall, f1 in zip(
    fold_numbers, accuracy_scores, precision_scores, recall_scores, f1_scores
):
    print(f'Fold {fold}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}')

# Unweighted mean of every metric over the n_splits folds.
average_accuracy, average_precision, average_recall, average_f1 = (
    sum(fold_scores) / n_splits
    for fold_scores in (accuracy_scores, precision_scores, recall_scores, f1_scores)
)

# Summary lines, one per metric.
print(f'Average Accuracy Across Folds: {average_accuracy:.4f}')
print(f'Average Precision Across Folds: {average_precision:.4f}')
print(f'Average Recall Across Folds: {average_recall:.4f}')
print(f'Average F1 Score Across Folds: {average_f1:.4f}')


Fold 1: Accuracy = 0.8214, Precision = 0.7901, Recall = 0.7781, F1 Score = 0.7841
Fold 2: Accuracy = 0.8256, Precision = 0.7692, Recall = 0.8166, F1 Score = 0.7922
Fold 3: Accuracy = 0.8203, Precision = 0.7598, Recall = 0.8473, F1 Score = 0.8012
Fold 4: Accuracy = 0.8239, Precision = 0.7984, Recall = 0.7856, F1 Score = 0.7919
Average Accuracy Across Folds: 0.8228
Average Precision Across Folds: 0.7794
Average Recall Across Folds: 0.8069
Average F1 Score Across Folds: 0.7923


In [2]:
# Sanity check: (rows, feature columns) of the combined train + test feature
# matrix that the cross-validation above was run on.
X_train.shape

(20900, 88)