In [2]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import time

In [3]:
df = pd.read_csv("week11.csv")

# Ensure you don't exceed the available data
max_size = len(df)

# Dataset sizes to test
sizes = [100, 1000, 10000, 100000, 1000000, 10000000]

# Iterate through each size
for size in sizes:
    if size > max_size:
        print(f"Skipping size {size} – not enough data (only {max_size} rows available).")
        continue

    print(f"\n--- Dataset size: {size} ---")
    sample_df = df.sample(n=size, random_state=42)

    # Split into features and target
    X = sample_df.drop("outcome", axis=1)
    y = sample_df["outcome"]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize model
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    # Time training
    start_time = time.time()
    model.fit(X_train, y_train)
    fit_time = time.time() - start_time

    # Predict and evaluate
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cv_scores = cross_val_score(model, X, y, cv=5)

    print(f"Test Accuracy: {acc:.4f}")
    print(f"5-Fold CV Accuracy: {cv_scores.mean():.4f}")
    print(f"Time taken to fit model: {fit_time:.2f} seconds")


--- Dataset size: 100 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Test Accuracy: 0.9000
5-Fold CV Accuracy: 0.9200
Time taken to fit model: 0.13 seconds

--- Dataset size: 1000 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Test Accuracy: 0.9650
5-Fold CV Accuracy: 0.9530
Time taken to fit model: 0.06 seconds

--- Dataset size: 10000 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Test Accuracy: 0.9745
5-Fold CV Accuracy: 0.9742
Time taken to fit model: 0.15 seconds

--- Dataset size: 100000 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Test Accuracy: 0.9859
5-Fold CV Accuracy: 0.9872
Time taken to fit model: 0.83 seconds

--- Dataset size: 1000000 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Test Accuracy: 0.9918
5-Fold CV Accuracy: 0.9918
Time taken to fit model: 9.94 seconds

--- Dataset size: 10000000 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Test Accuracy: 0.9931
5-Fold CV Accuracy: 0.9932
Time taken to fit model: 103.23 seconds
