In [None]:
# Install the third-party packages this notebook relies on.
# pyarrow is never imported directly below; presumably it is the engine
# behind pd.read_parquet — TODO confirm.
# NOTE(review): versions are unpinned, so a fresh install may pull different
# releases than the ones that produced the recorded outputs.
%pip install pandas pyarrow lightgbm scikit-learn

In [14]:
#import libraries
import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [4]:
# Load every input file into its own DataFrame.

# CSV inputs, unpacked in the same order as the paths listed below.
submission_df, data_dict_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "685404e30cfdb_submission_template.csv",
        "data_dictionary.csv",
    )
)

# Parquet inputs, unpacked in the same order as the paths listed below.
(
    add_event_df,
    add_trans_df,
    offer_metadata_df,
    test_data_df,
    train_data_df,
) = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "add_event.parquet",
        "add_trans.parquet",
        "offer_metadata.parquet",
        "test_data.parquet",
        "train_data.parquet",
    )
)


In [None]:
# Registry of the loaded DataFrames, keyed by the name of the variable holding each one.
dataframes = dict(
    submission_df=submission_df,
    data_dict_df=data_dict_df,
    add_event_df=add_event_df,
    add_trans_df=add_trans_df,
    offer_metadata_df=offer_metadata_df,
    test_data_df=test_data_df,
    train_data_df=train_data_df,
)

# For each frame report its shape, its deep memory footprint in MB
# (deep=True also counts the contents of object columns) and its column names.
for label, frame in dataframes.items():
    size_mb = frame.memory_usage(deep=True).sum() / (1024 ** 2)
    print(
        f"\n📁 {label}",
        f"🔢 Shape: {frame.shape}",
        f"💾 Memory Usage: {size_mb:.2f} MB",
        f"📋 Columns: {list(frame.columns)}",
        sep="\n",
    )



📁 submission_df
🔢 Shape: (369301, 5)
💾 Memory Usage: 67.19 MB
📋 Columns: ['id1', 'id2', 'id3', 'id5', 'pred']

📁 data_dict_df
🔢 Shape: (372, 3)
💾 Memory Usage: 0.08 MB
📋 Columns: ['masked_column', 'Description', 'Type']

📁 add_event_df
🔢 Shape: (21457473, 5)
💾 Memory Usage: 4846.65 MB
📋 Columns: ['id2', 'id3', 'id6', 'id4', 'id7']

📁 add_trans_df
🔢 Shape: (6339465, 9)
💾 Memory Usage: 2718.02 MB
📋 Columns: ['id2', 'f367', 'f368', 'f369', 'f370', 'f371', 'f372', 'id8', 'f374']

📁 offer_metadata_df
🔢 Shape: (4164, 12)
💾 Memory Usage: 2.17 MB
📋 Columns: ['id3', 'id9', 'f375', 'f376', 'f377', 'id10', 'id11', 'f378', 'f374', 'id8', 'id12', 'id13']

📁 test_data_df
🔢 Shape: (369301, 371)
💾 Memory Usage: 7228.31 MB
📋 Columns: ['id1', 'id2', 'id3', 'id4', 'id5', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 

In [None]:
# Display the data dictionary: one row per masked column name with its
# description and type.
data_dict_df

Unnamed: 0,masked_column,Description,Type
0,id1,Primary Key(masked),Key
1,id2,Customer ID (masked),-
2,id3,Offer ID,Categorical
3,id4,Event timestamp,Numerical
4,id5,Event Date,Numerical
...,...,...,...
367,f362,Total clicks on the incoming offer's industry ...,Numerical
368,f363,CTR on the incoming offer's industry for the g...,Numerical
369,f364,CM’s past 6 moth impressions on relevant offers,Numerical
370,f365,CM’s past 6 moth clicks on relevant offers,Numerical


In [17]:
# === Basic Preprocessing === #
# Builds the training feature matrix X and label vector y from train_data_df.
target = "y"
id_cols = ['id1', 'id2', 'id3', 'id4', 'id5']

# Features are every column except the label and the identifier columns.
# DataFrame.drop returns a new frame, so train_data_df itself is left untouched.
X = train_data_df.drop(columns=[target] + id_cols)
y = train_data_df[target]

print("🔍 Converting object columns to numeric or categorical...")

# Columns that were loaded with the generic object dtype.
object_cols = X.select_dtypes(include='object').columns

for col in object_cols:
    try:
        # Preferred outcome: the column parses cleanly as numbers.
        X[col] = pd.to_numeric(X[col])
    except (ValueError, TypeError):
        # Only genuine parse failures fall back to a categorical column.
        # A bare `except:` here would also swallow KeyboardInterrupt and
        # MemoryError and silently turn the column categorical.
        X[col] = X[col].astype("category")

print(f"✅ Converted {len(object_cols)} object columns.")


🔍 Converting object columns to numeric or categorical...
✅ Converted 366 object columns.


In [18]:
# === Split for validation === #
# Stratified 80/20 hold-out: stratify=y keeps the positive rate the same in
# both parts; the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print(f"✅ Training size: {len(X_train)}, Validation size: {len(X_val)}")

# === LightGBM Model === #

print("🚀 Training LightGBM model...")
# n_estimators is only an upper bound; early stopping picks the actual number
# of boosting rounds from the validation AUC.
lgbm = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    random_state=42
)

lgbm.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
    callbacks=[
        # eval_metric is tracked in addition to the objective's default
        # binary_logloss, and early_stopping checks every tracked metric unless
        # told otherwise. first_metric_only=True ties the stopping decision to
        # AUC (listed first), the metric this notebook reports, instead of
        # stopping when logloss stalls.
        early_stopping(stopping_rounds=10, first_metric_only=True),
        log_evaluation(period=20)  # print validation metrics every 20 rounds
    ]
)
print("✅ Model training complete.")


# Validation evaluation
print("📊 Predicting on validation set...")
# Column 1 of predict_proba is the probability of the positive class.
val_preds = lgbm.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_preds)
print(f"🎯 Validation ROC-AUC: {val_auc:.4f}")


✅ Training size: 616131, Validation size: 154033
🚀 Training LightGBM model...
[LightGBM] [Info] Number of positive: 29641, number of negative: 586490
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47717
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 324
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048108 -> initscore=-2.984997
[LightGBM] [Info] Start training from score -2.984997
Training until validation scores don't improve for 10 rounds
[20]	valid_0's auc: 0.919432	valid_0's binary_logloss: 0.114394
[40]	valid_0's auc: 0.927532	valid_0's binary_logloss: 0.10045
[60]	valid_0's auc: 0.932145	valid_0's binary_logloss: 0.0949194
[80]	valid_0's auc: 0.936708	valid_0's binary_logloss: 0.0911734
[100]	valid_0's auc: 0.939036	valid_0's bi

In [23]:
print("🔍 Preprocessing test set...")
# Build the test feature matrix without mutating test_data_df: drop the
# identifier columns, plus a 'y' column only if one happens to be present
# (for example, left over from an earlier run of this cell).
label_cols = [c for c in ['y'] if c in test_data_df.columns]
X_test = test_data_df.drop(columns=id_cols + label_cols)

# Give every training feature a matching test column with the training dtype.
for col in X.columns:
    if col not in X_test.columns:
        # Feature absent from the test file: fill with a constant 0.
        X_test[col] = 0
    if str(X_test[col].dtype) != str(X[col].dtype):
        try:
            X_test[col] = X_test[col].astype(X[col].dtype)
        except (ValueError, TypeError):
            # Only cast failures are reported; anything else should surface.
            print(f"⚠️ Could not convert column {col}")

# LightGBM does not match features by name by default, so enforce the training
# column order and drop any test-only columns.
X_test = X_test[X.columns]

print("✅ Test data aligned.")

# === Prediction === #
print("📊 Predicting on test set...")

# Predict in fixed-size row batches and collect one probability array per batch.
batch_size = 50000
n_rows = X_test.shape[0]
batch_preds = []

for i in range(0, n_rows, batch_size):
    X_batch = X_test.iloc[i:i + batch_size]
    # Column 1 of predict_proba is the probability of the positive class.
    batch_preds.append(lgbm.predict_proba(X_batch)[:, 1])
    print(f"✅ Predicted rows: {min(i + batch_size, n_rows)} / {n_rows}")

# np.concatenate rejects an empty list, so an empty test set yields an empty array.
test_preds = np.concatenate(batch_preds) if batch_preds else np.empty(0)
assert len(test_preds) == n_rows, "expected exactly one prediction per test row"
print("🎯 All test predictions complete.")



🔍 Preprocessing test set...
✅ Test data aligned.
📊 Predicting on test set...
✅ Predicted rows: 50000 / 369301
✅ Predicted rows: 100000 / 369301
✅ Predicted rows: 150000 / 369301
✅ Predicted rows: 200000 / 369301
✅ Predicted rows: 250000 / 369301
✅ Predicted rows: 300000 / 369301
✅ Predicted rows: 350000 / 369301
✅ Predicted rows: 369301 / 369301
🎯 All test predictions complete.


In [24]:
# === Prepare Submission File === #
print("📝 Preparing submission file...")

# Take the four key columns from the test set and attach the predicted
# probabilities as 'pred'; assign() works on a copy, so test_data_df is not modified.
submission_df = test_data_df[['id1', 'id2', 'id3', 'id5']].assign(pred=test_preds)

# Write the submission as CSV without the index column.
filename = "r2_submission_file_1_<dattebayo>.csv"
submission_df.to_csv(filename, index=False)
print(f"✅ Submission saved to: {filename}")

📝 Preparing submission file...
✅ Submission saved to: r2_submission_file_1_<dattebayo>.csv
