In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the week-11 CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="data_week11.csv")

In [5]:
# Every column except the last goes into X; the last column alone becomes y.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [6]:
def _cross_validate_sample(X_sample, y_sample, n_splits=5, seed=42):
    """Cross-validate an XGBoost classifier on one subsample.

    Returns a tuple ``(mean accuracy, mean ROC AUC, elapsed seconds)`` where
    the means are taken over the ``n_splits`` shuffled K-fold test folds and
    the elapsed time covers scaling, fitting, predicting and scoring.
    """
    model = XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=seed
    )
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    accuracies = []
    auc_scores = []

    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    for train_idx, test_idx in kf.split(X_sample):
        # Fit the scaler on the training fold only and reuse its statistics
        # for the test fold; fitting on the full sample would leak held-out
        # information into preprocessing.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_sample.iloc[train_idx])
        X_test = scaler.transform(X_sample.iloc[test_idx])
        y_train, y_test = y_sample.iloc[train_idx], y_sample.iloc[test_idx]

        # fit() is called on the same estimator object once per fold.
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # Column 1 of predict_proba is what roc_auc_score is given.
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        accuracies.append(accuracy_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_pred_proba))

    time_taken = time.perf_counter() - start_time

    return np.mean(accuracies), np.mean(auc_scores), time_taken


# Function to train and evaluate model for different dataset sizes
def evaluate_model_size(X, y, sizes=(100, 1000, 10000, 100000, 1000000, 10000000),
                        random_state=42):
    """Benchmark 5-fold CV accuracy, AUC and run time across subsample sizes.

    For each requested size that fits in ``X``, a random subsample is drawn
    without replacement, an XGBoost classifier is cross-validated on it, and
    the mean test accuracy, mean ROC AUC and elapsed time are recorded and
    printed. Sizes larger than ``len(X)`` are reported as a row of missing
    values so the output always has one row per requested size.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally via ``.iloc``.
    y : pandas.Series
        Labels, positionally aligned with ``X``. The AUC is computed from
        ``predict_proba(...)[:, 1]``, so a binary target is assumed
        (NOTE(review): confirm against the dataset).
    sizes : iterable of int
        Subsample sizes to evaluate, in the order they should be reported.
    random_state : int or None, default 42
        Seed for the subsampling generator so repeated runs draw the same
        rows. Pass ``None`` for a different draw on each call.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``sizes`` with the columns 'Dataset size',
        'Test accuracy', 'Testing-set predictive performance (AUC)' and
        'Time taken (seconds)'. The columns are present even when ``sizes``
        is empty.
    """
    columns = [
        'Dataset size',
        'Test accuracy',
        'Testing-set predictive performance (AUC)',
        'Time taken (seconds)',
    ]

    # One generator per call: every size gets a reproducible, distinct draw.
    rng = np.random.default_rng(random_state)
    results = []

    for size in sizes:
        # Guard clause: not enough rows to draw this many without replacement.
        if size > len(X):
            print(f"Skipping size {size} - not enough data")
            results.append({
                'Dataset size': size,
                'Test accuracy': None,
                'Testing-set predictive performance (AUC)': None,
                'Time taken (seconds)': None
            })
            continue

        print(f"\nEvaluating with {size} samples...")

        # Sample row positions without replacement.
        indices = rng.choice(len(X), size=size, replace=False)
        X_sample = X.iloc[indices]
        y_sample = y.iloc[indices]

        avg_accuracy, avg_auc, time_taken = _cross_validate_sample(X_sample, y_sample)

        results.append({
            'Dataset size': size,
            'Test accuracy': avg_accuracy,
            'Testing-set predictive performance (AUC)': avg_auc,
            'Time taken (seconds)': time_taken
        })

        print(f"Test Accuracy: {avg_accuracy:.4f}")
        print(f"AUC Score: {avg_auc:.4f}")
        print(f"Time taken: {time_taken:.2f} seconds")

    # Passing the column list keeps the schema stable for an empty `sizes`.
    results_df = pd.DataFrame(results, columns=columns)
    return results_df

# Benchmark on powers of ten from 1e2 up to 1e7 rows.
sizes_to_evaluate = [10 ** exponent for exponent in range(2, 8)]
results_table = evaluate_model_size(X, y, sizes=sizes_to_evaluate)

# Print the heading and the index-free table, one after the other.
summary_text = results_table.to_string(index=False)
print("\nResults Summary:", summary_text, sep="\n")


Evaluating with 100 samples...
Test Accuracy: 0.9200
AUC Score: 0.9480
Time taken: 0.14 seconds

Evaluating with 1000 samples...
Test Accuracy: 0.9470
AUC Score: 0.9881
Time taken: 0.22 seconds

Evaluating with 10000 samples...
Test Accuracy: 0.9646
AUC Score: 0.9948
Time taken: 0.45 seconds

Evaluating with 100000 samples...
Test Accuracy: 0.9711
AUC Score: 0.9966
Time taken: 6.84 seconds

Evaluating with 1000000 samples...
Test Accuracy: 0.9713
AUC Score: 0.9969
Time taken: 29.31 seconds

Evaluating with 10000000 samples...
Test Accuracy: 0.9716
AUC Score: 0.9970
Time taken: 303.24 seconds

Results Summary:
 Dataset size  Test accuracy  Testing-set predictive performance (AUC)  Time taken (seconds)
          100       0.920000                                  0.947984              0.143890
         1000       0.947000                                  0.988125              0.218498
        10000       0.964600                                  0.994848              0.449722
       100