In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score

from joblib import dump


In [3]:
# Load the two CSV files from the working directory into DataFrames.
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

# Report (rows, columns) for each frame as a quick sanity check.
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)


Train shape: (9880, 18)
Test shape: (2470, 18)


In [4]:
# Separate the target from the predictors; the target is cast to int.
y = train["Revenue"].astype(int)
X = train.drop(columns=["Revenue"])

# ID is a row identifier, not a feature. The training frame may or may not
# carry it, so the drop is tolerant of its absence.
X = X.drop(columns=["ID"], errors="ignore")

# Keep the test IDs for the submission file, then remove the column from the
# test feature frame. Because `test` is reassigned here, an unguarded
# test["ID"] raises KeyError when the cell is run a second time; with the
# guard a re-run is a no-op and `test_ids` keeps the value captured earlier.
if "ID" in test.columns:
    test_ids = test["ID"]
    test = test.drop(columns=["ID"])
elif "test_ids" not in globals():
    # No ID column and nothing captured on a previous run: fail loudly here
    # rather than later when the submission is assembled.
    raise KeyError("'ID' column not found in test data; it is required for the submission")


In [5]:
# Columns excluded from the feature set for both train and test.
DROP_COLS = ["OperatingSystems", "Browser", "Region", "TrafficType"]

# Drop from each frame independently. Conditioning the test-side drop on
# membership in X raises KeyError when a column exists in X but not in test,
# and leaves a stray column behind when it exists only in test.
# errors="ignore" skips names that are absent from a frame.
X = X.drop(columns=DROP_COLS, errors="ignore")
test = test.drop(columns=DROP_COLS, errors="ignore")


In [6]:
# Integer-encode every object-dtype column of X, applying the mapping learned
# on the training data to the matching test column.
cat_cols = X.select_dtypes(include=["object"]).columns

# One fitted encoder per column, so each column's label -> code mapping stays
# available after the loop (a single shared encoder only remembers the last
# column it was fitted on).
encoders = {}

for col in cat_cols:
    encoder = LabelEncoder()
    # astype(str) makes every value a string before fitting; presumably
    # missing values become a literal string label here and are therefore
    # not touched by later NaN imputation -- TODO confirm this is intended.
    X[col] = encoder.fit_transform(X[col].astype(str))
    encoders[col] = encoder

    # LabelEncoder.transform raises ValueError for labels it was not fitted
    # on. Map through the learned classes instead and send unseen test labels
    # to -1, a code that no training label can receive.
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    test[col] = test[col].astype(str).map(code_by_label).fillna(-1).astype(int)


In [7]:
# Fill missing values column by column with the training median.
# NOTE(review): the median is taken over all of X, i.e. before the
# train/validation split, so validation rows contribute to the fill value.
for col in X.columns:
    # Compute the median once, from the training column as it is before
    # filling, and reuse it for both frames instead of recomputing it from
    # the already-filled column.
    col_median = X[col].median()
    X[col] = X[col].fillna(col_median)
    test[col] = test[col].fillna(col_median)


In [8]:
# Hold out 20% of the rows for validation, stratified on the target and
# seeded for a repeatable split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Row counts of the two partitions.
for label, part in (("Training samples:", X_train), ("Validation samples:", X_val)):
    print(label, len(part))


Training samples: 7904
Validation samples: 1976


In [9]:
# Hyperparameters for the two base learners of the ensemble, kept in dicts
# so they can be inspected separately from the estimator objects.
gb_params = {
    "n_estimators": 350,
    "learning_rate": 0.04,
    "max_depth": 3,
    "random_state": 42,
}
et_params = {
    "n_estimators": 500,
    "min_samples_split": 12,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}

# Unfitted estimators; fitting happens in a later cell.
gb = GradientBoostingClassifier(**gb_params)
et = ExtraTreesClassifier(**et_params)


In [10]:
# Fit both base learners on the training partition only.
for model in (gb, et):
    model.fit(X_train, y_train)

# Positive-class probability from each model on the held-out rows.
gb_p, et_p = (model.predict_proba(X_val)[:, 1] for model in (gb, et))

# Weighted blend of the two probabilities, thresholded at 0.5 into labels.
ensemble_p = 0.65 * gb_p + 0.35 * et_p
val_preds = (ensemble_p >= 0.5).astype(int)

print("\nüîç VALIDATION RESULTS")
for label, metric in (("Accuracy :", accuracy_score), ("Precision:", precision_score)):
    print(label, metric(y_val, val_preds))



üîç VALIDATION RESULTS
Accuracy : 0.9063765182186235
Precision: 0.759493670886076


In [11]:
# Refit both models on every labelled row (train + validation) before they
# are saved and used on the test set.
full_data = (X, y)
gb.fit(*full_data)
et.fit(*full_data)


In [12]:
# Bundle the two fitted estimators with the list of dropped columns and
# write the bundle to disk with joblib.
final_model = dict(
    gradient_boosting=gb,
    extra_trees=et,
    drop_columns=DROP_COLS,
)
dump(final_model, "DataCrafters.joblib")

print("‚úÖ Model saved as DataCrafters.joblib")


‚úÖ Model saved as DataCrafters.joblib


In [13]:
# Select the test columns in exactly the order the models were fitted on.
# This tolerates extra or reordered columns in `test`, and a missing feature
# fails here with a KeyError naming the column instead of inside predict_proba.
test_features = test[X.columns]

# Positive-class probability from each model on the test rows.
gb_test = gb.predict_proba(test_features)[:, 1]
et_test = et.predict_proba(test_features)[:, 1]

# Weighted blend thresholded at 0.5, with the same weights and cut-off as
# the validation cell.
test_probs = 0.65 * gb_test + 0.35 * et_test
test_preds = (test_probs >= 0.5).astype(int)

# Pair each prediction with the test ID captured before ID was dropped.
submission = pd.DataFrame({
    "ID": test_ids,
    "Revenue": test_preds
})

submission.to_csv("submission.csv", index=False)

print("\n‚úÖ submission.csv ready")
print(submission.head())
print("Shape:", submission.shape)



‚úÖ submission.csv ready
   ID  Revenue
0   0        1
1   1        1
2   2        0
3   3        0
4   4        0
Shape: (2470, 2)
