In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the training and test tables from CSV files in the working directory.
train_data, test_data = pd.read_csv('train_manual.csv'), pd.read_csv('test_manual.csv')

In [3]:
# Every non-numeric column of the test table is treated as categorical.
cat_features = [
    column
    for column, dtype in test_data.dtypes.items()
    if not pd.api.types.is_numeric_dtype(dtype)
]
# The remaining columns are numeric features, except the two bookkeeping
# columns ('id' and the 'Unnamed: 0' index column), which are excluded.
num_features = [
    column
    for column in test_data.columns
    if column not in ('id', 'Unnamed: 0') and column not in cat_features
]
# Full feature list: categorical columns first, then numeric ones.
features = [*cat_features, *num_features]

In [4]:
# Hold out 25% of the training rows for validation.  The feature matrix is
# selected with the same `features` list (same columns, same order) that the
# full-data fit later in the notebook uses, so the iteration count tuned here
# comes from a model built on an identical column layout.  A feature missing
# from train_data now raises a KeyError instead of being silently dropped.
X_train, X_validation, y_train, y_validation = train_test_split(
    train_data[features],
    train_data['rainfall'],
    train_size=0.75,
    random_state=42,
)

In [5]:
# Manual alternative to auto class weights: each class (0, then 1) is weighted
# by the fraction of training rows that do NOT carry that label.
class_weights = [
    int((train_data['rainfall'] != label).sum()) / len(train_data)
    for label in (0, 1)
]

In [6]:
# Validation-run model: up to 1000 boosting iterations, scored by AUC on the
# evaluation set, with Logloss and AUC additionally tracked as custom metrics.
model = CatBoostClassifier(
    iterations=1000,
    eval_metric=metrics.AUC(),
    custom_loss=[metrics.Logloss(), metrics.AUC()],
    # Remove auto_class_weights to use the manual class_weights instead.
    auto_class_weights='Balanced',
    # class_weights=class_weights,
    random_seed=42,
    logging_level='Silent',
    # Optional overfitting detector (currently disabled):
    # od_type='Iter',
    # od_wait=50,
)

In [7]:
# Fit on the training split and monitor the held-out split; use_best_model
# keeps the model at the iteration with the best eval metric.
model.fit(
    X=X_train,
    y=y_train,
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    use_best_model=True,
    plot=True,
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [8]:
# Retrain on the full training set (no hold-out split), reusing the best iteration count found in the validation run:

In [9]:
# `best_iteration_` is the zero-based index of the best iteration, so the
# number of trees the validated model actually keeps is one more than that.
# Without the +1 the count is off by one, and a best iteration of 0 would
# request a model with zero iterations in the retraining step.
iteration_count = model.best_iteration_ + 1

In [11]:
# Final model for the full-data fit.  The iteration budget is the validated
# iteration count scaled up by 30% (presumably to account for the larger
# training set compared with the 75% split -- TODO confirm the intent).
model = CatBoostClassifier(
    iterations=int(1.3 * iteration_count),
    custom_loss=[metrics.AUC(), metrics.Accuracy(), metrics.Precision()],
    # Remove auto_class_weights to use the manual class_weights instead.
    auto_class_weights='Balanced',
    # class_weights=class_weights,
    random_seed=42,
    logging_level='Silent',
)

In [12]:
# Fit the final model on every training row; there is no evaluation set here.
model.fit(
    X=train_data[features],
    y=train_data['rainfall'],
    cat_features=cat_features,
    plot=True,
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [13]:
# Write the submission file: one row per test id with the predicted
# probability of the positive class (column 1 of predict_proba).
# Only the `features` columns are scored, in the same order the model was
# trained on, so bookkeeping columns such as 'id' are never fed to the model
# as features.
(
    pd.DataFrame(
        model.predict_proba(test_data[features])[:, 1],
        columns=['rainfall'],
    )
    .join(test_data.id)  # index-aligned join attaches the matching test ids
    .set_index('id', drop=True)
    .to_csv('submission_catboost.csv')
)