In [19]:
# Install the third-party packages this notebook relies on via the %pip magic.
# NOTE(review): no versions are pinned here — consider pinning for reproducible runs.
%pip install catboost scikit-learn pandas numpy

Note: you may need to restart the kernel to use updated packages.


In [20]:
import joblib
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split

In [21]:
# Read the three competition CSVs in one pass: labelled training rows,
# unlabelled test rows, and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train.csv",
        "../data/test.csv",
        "../data/sample_submission.csv",
    )
)

In [22]:
# Name of the label column in the training data.
TARGET = "Revenue"

# Labels are cast to integers; the features are every remaining column.
y = train[TARGET].astype(int)
X = train.drop(columns=[TARGET])

In [23]:
# Columns that are handled as categorical features (cast to strings during
# preprocessing and passed to CatBoost as `cat_features`).
CAT_COLS = [
    "Month",
    "VisitorType",
    "Weekend",
]

In [24]:
def preprocess(df, cat_cols=None):
    """Return a cleaned copy of ``df`` with consistent column types.

    Categorical columns are converted to strings, with every missing value
    collapsed to the literal ``"MISSING"``. All remaining columns are coerced
    to numeric; values that cannot be parsed become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame. It is copied, never modified in place.
    cat_cols : iterable of str, optional
        Columns to treat as categorical. When omitted, the notebook-level
        ``CAT_COLS`` list is used. Names absent from ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order.
    """
    if cat_cols is None:
        cat_cols = CAT_COLS
    cat_cols = list(cat_cols)

    df = df.copy()

    for col in cat_cols:
        if col not in df.columns:
            continue

        # Record real missing values *before* the string cast. Once cast, a
        # missing value is only recognisable by its text form, and forms such
        # as "<NA>" (pd.NA) would otherwise survive as a genuine category.
        was_missing = df[col].isna()

        text = df[col].astype(str)
        # Textual placeholders that may already be present in the raw data.
        text = text.replace(["nan", "None", "null"], "MISSING")
        text.loc[was_missing] = "MISSING"

        # A 0/1 flag that was read as float (e.g. because the column held
        # NaN) must map to the same category as its integer form.
        df[col] = text.replace({"0.0": "0", "1.0": "1"})

    # Everything that is not categorical is forced to numeric.
    for col in df.columns.difference(cat_cols):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df

# Run the training features and the test set through the same cleaning step.
X, test = preprocess(X), preprocess(test)

In [25]:
# Hold out 20% of the rows for validation; the split is stratified on the
# label and seeded so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [26]:
model = CatBoostClassifier(
    # Objective and the metric reported on the evaluation set.
    loss_function="Logloss",
    eval_metric="Precision",
    # Model capacity and regularisation.
    iterations=500,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=10,
    # Stop once the eval metric has not improved for 100 iterations.
    early_stopping_rounds=100,
    # Reproducibility and logging cadence (one log line per 100 iterations).
    random_seed=42,
    verbose=100,
)

In [27]:
# Fit on the training split and monitor the held-out split; with
# use_best_model=True the model is shrunk to its best evaluated iteration.
model.fit(
    X=X_train,
    y=y_train,
    cat_features=CAT_COLS,
    eval_set=(X_val, y_val),
    use_best_model=True,
)

0:	learn: 0.6694915	test: 0.6613546	best: 0.6613546 (0)	total: 24.2ms	remaining: 12.1s
100:	learn: 0.8286334	test: 0.8080808	best: 0.8247423 (54)	total: 1.11s	remaining: 4.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.824742268
bestIteration = 54

Shrink model to first 55 iterations.


<catboost.core.CatBoostClassifier at 0x7f0329c23610>

In [28]:
# Score the held-out split with the fitted model.
val_pred = model.predict(X_val)

val_accuracy = accuracy_score(y_val, val_pred)
val_precision = precision_score(y_val, val_pred)

print("Validation Accuracy :", val_accuracy)
print("Validation Precision:", val_precision)


Validation Accuracy : 0.9078947368421053
Validation Precision: 0.8247422680412371


In [None]:
# Wrap the cleaned test set in a Pool so the categorical columns are declared
# the same way as during training, then predict.
test_pool = Pool(data=test, cat_features=CAT_COLS)
test_pred = model.predict(test_pool)

# Fill the label column of a copy of the submission template with boolean
# predictions and write it out without the index.
submission = sample_submission.assign(**{TARGET: test_pred.astype(bool)})
submission.to_csv("answer.csv", index=False)


In [30]:
# Persist the fitted model to disk. `joblib` is imported together with the
# notebook's other dependencies in the import cell at the top.
joblib.dump(model, "AravindAshokan.joblib")

['AravindAshokan.joblib']