In [1]:
# Install dependencies
!pip install lightgbm --quiet
!apt-get install -y -qq libboost-dev

In [7]:
import time
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from concurrent.futures import ThreadPoolExecutor

# Start timer
start_time = time.time()

# === Load raw CSVs ===
train_set_path = '/content/train_set.csv'
test_set_path = '/content/test_set.csv'
train_set = pd.read_csv(train_set_path)
test_set = pd.read_csv(test_set_path)
record_ids = test_set['RecordId']

# === Define preprocessing function ===
def preprocess(df, is_train=True):
    df.drop(columns=['RecordId', 'X71', 'X76'], inplace=True)
    if is_train:
        X = df.drop(columns=['Y'])
        Y = df['Y']
        return X, Y
    else:
        return df

# === Parallel preprocessing ===
with ThreadPoolExecutor() as executor:
    future_train = executor.submit(preprocess, train_set, True)
    future_test = executor.submit(preprocess, test_set, False)
    X, Y = future_train.result()
    test_X = future_test.result()

# === Imputation and Scaling (done once, sequentially) ===
imputer = SimpleImputer(strategy='mean')
scaler = MinMaxScaler()

# Train
X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)

# Test
test_imputed = imputer.transform(test_X)
test_scaled = scaler.transform(test_imputed)

# Split into train/validation
X_train, X_validate, Y_train, Y_validate = train_test_split(X_scaled, Y, test_size=0.3, random_state=42)

# === Train LightGBM with GPU and multithreading ===
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    device_type='gpu',
    max_depth=10,
    n_estimators=290,
    learning_rate=0.025,
    colsample_bytree=0.19,
    min_child_weight=2,
    reg_alpha=0.19,
    reg_lambda=0.19,
    random_state=42,
    n_jobs=-1,
    num_threads=-1
)

model.fit(X_train, Y_train)

# === Evaluate ===
md_predictions_probs = model.predict_proba(X_validate)[:, 1]
md_roc = roc_auc_score(Y_validate, md_predictions_probs)
print("Validation ROC AUC:", md_roc)

# === Predict on test set ===
test_predictions_probs = model.predict_proba(test_scaled)[:, 1]

# === Save submission ===
submission = pd.DataFrame({
    'RecordId': record_ids,
    'Y': test_predictions_probs
})
submission.to_csv('submission_pipeline_gpu_multithreaded.csv', index=False)
print("Submission file created: submission_pipeline_gpu_multithreaded.csv")

# End timer
end_time = time.time()
print(f"Total execution time: {end_time - start_time:.2f} seconds")




[LightGBM] [Info] Number of positive: 466, number of negative: 171819
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17366
[LightGBM] [Info] Number of data points in the train set: 172285, number of used features: 75
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 62 dense feature groups (10.52 MB) transferred to GPU in 0.012564 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002705 -> initscore=-5.910011
[LightGBM] [Info] Start training from score -5.910011




Validation ROC AUC: 0.969476021468568




Submission file created: submission_pipeline_gpu_multithreaded.csv
Total execution time: 16.77 seconds
