In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Start timer
start_time = time.time()

# Loading data from files
train = pd.read_csv('/content/train_set.csv')
test = pd.read_csv('/content/test_set.csv')

# Separating attributes and labels
X = train.drop(columns=['Y', 'RecordId'])
Y = train['Y']
test = test.drop(columns=['RecordId'])

# Imputing missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
test = imputer.transform(test)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Training XGBoost model
xgb_model = XGBClassifier(
    max_depth=3,
    n_estimators=261,
    learning_rate=0.1,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8
)
xgb_model.fit(X_train, y_train)

# Predicting and scoring
predicted_probab = xgb_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, predicted_probab)
print('ROC AUC of the xgb model:', auc_score)

# Final prediction
final_prob = xgb_model.predict_proba(test)[:, 1]

# End timer
end_time = time.time()
print(f"Sequential execution time: {end_time - start_time:.2f} seconds")

# Saving predictions to CSV
df_sample = pd.read_csv('/content/sample_submission.csv')  # <-- corrected line
df_sample['Y'] = final_prob
df_sample.to_csv('/content/sample_submission_xgbboost.csv', index=False)


ROC AUC of the xgb model: 0.9634332871070214
Sequential execution time: 15.32 seconds


In [2]:
import pandas as pd
import numpy as np
import time
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Start timer
start_time = time.time()

# Load data
train = pd.read_csv('/content/train_set.csv')
test = pd.read_csv('/content/test_set.csv')

# Preprocess
X = train.drop(columns=['Y', 'RecordId'])
Y = train['Y']
test = test.drop(columns=['RecordId'])

imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
test = imputer.transform(test)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train model
xgb_model = XGBClassifier(
    max_depth=3,
    n_estimators=261,
    learning_rate=0.1,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8
)
xgb_model.fit(X_train, y_train)

# Evaluate
predicted_probab = xgb_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, predicted_probab)
print('ROC AUC of the xgb model:', auc_score)

# Parallel prediction on test set
def predict_chunk(chunk):
    return xgb_model.predict_proba(chunk)[:, 1]

chunk_size = 1000
test_chunks = [test[i:i+chunk_size] for i in range(0, len(test), chunk_size)]

# Farm-style parallel execution
final_probs = Parallel(n_jobs=-1)(delayed(predict_chunk)(chunk) for chunk in test_chunks)
final_prob = np.concatenate(final_probs)

# End timer
end_time = time.time()
print(f"Parallel execution time: {end_time - start_time:.2f} seconds")

# Saving predictions to CSV
df_sample = pd.read_csv('/content/sample_submission.csv')  # <-- corrected line
df_sample['Y'] = final_prob
df_sample.to_csv('/content/sample_submission_xgbboost.csv', index=False)


ROC AUC of the xgb model: 0.9634332871070214
Parallel execution time: 17.76 seconds


In [10]:
from xgboost import XGBClassifier
import time

# Start timer
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train with GPU support
xgb_model_gpu = XGBClassifier(
    tree_method='gpu_hist',   # enables GPU acceleration
    device='cuda',
    max_depth=3,
    n_estimators=261,
    learning_rate=0.1,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='auc'
)

start_time = time.time()

xgb_model_gpu.fit(X_train, y_train)

# Prediction and evaluation
predicted_gpu_probab = xgb_model_gpu.predict_proba(X_test)[:, 1]
auc_score_gpu = roc_auc_score(y_test, predicted_gpu_probab)

# End timer
end_time = time.time()
print("GPU Training Time:", end_time - start_time, "seconds")
print("ROC AUC with GPU:", auc_score_gpu)

# Saving predictions to CSV
df_sample = pd.read_csv('/content/sample_submission.csv')  # <-- corrected line
df_sample['Y'] = final_prob
df_sample.to_csv('/content/sample_submission_xgbboost.csv', index=False)



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



GPU Training Time: 5.475257396697998 seconds
ROC AUC with GPU: 0.9627271413525579


In [11]:
import xgboost as xgb
import time


# Convert data to DMatrix and push to GPU
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'tree_method': 'hist',
    'device': 'cuda',  # use GPU
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'min_child_weight': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train the model
start = time.time()
booster = xgb.train(params, dtrain, num_boost_round=261)
end = time.time()

# Predict and evaluate
preds = booster.predict(dtest)
auc_gpu = roc_auc_score(y_test, preds)

print("GPU Training Time:", end - start, "seconds")
print("ROC AUC with GPU:", auc_gpu)


GPU Training Time: 0.9442901611328125 seconds
ROC AUC with GPU: 0.962781076935298
