In [None]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score

In [None]:
# Read the three competition files from the working directory in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Name of the label column in train.csv.
TARGET = "Revenue"

# Labels as integers; features are every remaining column.
y = train.loc[:, TARGET].astype(int)
X = train.drop(TARGET, axis=1)

In [None]:
# Columns handed to CatBoost as categorical features.
CAT_COLS = [
    "Month",
    "VisitorType",
    "Weekend",
]

In [None]:
def preprocess(df, cat_cols=None):
    """Return a cleaned copy of ``df`` ready to hand to CatBoost.

    Categorical columns are converted to strings, with a single ``"MISSING"``
    label standing in for absent values; every other column is coerced to a
    numeric dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is copied, never modified in place.
    cat_cols : sequence of str, optional
        Names of the categorical columns. Defaults to the notebook-level
        ``CAT_COLS``. Names that are not present in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``.
    """
    if cat_cols is None:
        cat_cols = CAT_COLS

    df = df.copy()

    # Textual spellings of "no value" are folded into MISSING, and
    # float-formatted 0/1 flags are normalised so "0.0" and "0" are one category.
    replacements = {
        "nan": "MISSING",
        "None": "MISSING",
        "null": "MISSING",
        "0.0": "0",
        "1.0": "1",
    }

    for col in cat_cols:
        if col not in df.columns:
            continue
        # Record genuinely missing cells BEFORE stringifying: how astype(str)
        # renders them depends on the value and the pandas version (pd.NA, for
        # instance, can become "<NA>"), so matching on the rendered text alone
        # is not reliable.
        missing = df[col].isna()
        df[col] = (
            df[col]
            .astype(str)
            .mask(missing, "MISSING")
            .replace(replacements)
        )

    # Everything that is not categorical is forced to numeric. Unparseable
    # values become NaN (errors="coerce"), which is passed through as-is.
    for col in df.columns.difference(cat_cols):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df

# Run the identical cleaning on the training features and the held-out test set.
X, test = preprocess(X), preprocess(test)

In [None]:
# Hold out 20% of the rows for validation, keeping the class balance of y.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [None]:
# Hyperparameters gathered in one place so they are easy to scan and tweak.
model_params = {
    "loss_function": "Logloss",
    "eval_metric": "Precision",
    "iterations": 500,
    "learning_rate": 0.05,
    "depth": 8,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 100,
    "random_seed": 42,
    "verbose": 100,
}
model = CatBoostClassifier(**model_params)

In [None]:
# Wrap both splits in Pools so the categorical columns are declared with the data.
train_pool = Pool(data=X_train, label=y_train, cat_features=CAT_COLS)
val_pool = Pool(data=X_val, label=y_val, cat_features=CAT_COLS)

# Train against the validation pool and keep the best-scoring iteration.
model.fit(train_pool, eval_set=val_pool, use_best_model=True)

0:	learn: 0.8080925	test: 0.8275862	best: 0.8275862 (0)	total: 89.6ms	remaining: 44.7s
100:	learn: 0.8542757	test: 0.8470588	best: 0.8492792 (89)	total: 4.05s	remaining: 16s
200:	learn: 0.8824522	test: 0.8512397	best: 0.8512960 (183)	total: 9.67s	remaining: 14.4s
300:	learn: 0.9155518	test: 0.8668555	best: 0.8674189 (294)	total: 14.1s	remaining: 9.32s
400:	learn: 0.9425034	test: 0.8703976	best: 0.8748159 (358)	total: 19.3s	remaining: 4.75s
499:	learn: 0.9585506	test: 0.8744186	best: 0.8765060 (437)	total: 22.9s	remaining: 0us

bestTest = 0.8765060241
bestIteration = 437

Shrink model to first 438 iterations.


<catboost.core.CatBoostClassifier at 0x7f094c0d2db0>

In [None]:
val_pred = model.predict(X_val)

# Report each hold-out metric with its label.
for label, scorer in (
    ("Validation Accuracy :", accuracy_score),
    ("Validation Precision:", precision_score),
):
    print(label, scorer(y_val, val_pred))


Validation Accuracy : 0.9008097165991903
Validation Precision: 0.7028985507246377


In [None]:
# Score the test set with the same categorical column declaration used in training.
test_pool = Pool(data=test, cat_features=CAT_COLS)
test_pred = model.predict(test_pool)

# Fill the sample submission's target column with boolean predictions and save it.
submission = sample_submission.assign(**{TARGET: test_pred.astype(bool)})
submission.to_csv("answer.csv", index=False)
