# $$CatBoost\ Object\ Importance\ Tutorial$$

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/model_analysis/object_importance_tutorial.ipynb)

#### In this tutorial we show how you can detect noisy objects in your dataset. 

In [10]:
import datetime
import gc
import os
import warnings
import random
from collections import defaultdict

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import Pool, cv
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import RandomSampler

warnings.filterwarnings("ignore")
from dataclasses import dataclass

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import StepLR,MultiStepLR
from torch.utils.data import DataLoader, Dataset
from torchmetrics import Recall
#from tqdm import tqdm

from tqdm.notebook import tqdm

%matplotlib inline
path_dir=os.getcwd()
path_train='../train_dataset_train.csv'
path_test='../test_dataset_test.csv'
path_subm='../Иннополис/sample_solution.csv'

@dataclass
class Config:
    RS: int = 42
    treshold: float = 0.0
    device='cpu'
    num_epochs: int = 120
    batch_size: int = 256
    num_workers: int = 0
        
def get_df(path,train=True):
    df=pd.read_csv(path)
    df=df.sort_index(axis=1)
    df.drop(['id','.geo','area'],axis=1,inplace=True)
    if train:
        df=df.loc[df.crop.isin([0,4])]
        y=df[['crop']]
        df.drop(['crop'],axis=1,inplace=True)  
    
    df.rename({c:pd.to_datetime(c.replace('nd_mean_','')) for c in df.columns},axis=1,inplace=True) 
    df[df<=0]=0    
    return (df.reset_index(drop=True).values,y.crop.reset_index(drop=True).values) if train else df.reset_index(drop=True).values

#### First, let's load the dataset:

In [18]:
X, y =  df,y=get_df(path_train)

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.10, random_state=42)
train_pool = Pool(X_train, y_train)
validation_pool = Pool(X_validation, y_validation)

print(train_pool.shape, validation_pool.shape)

(1299, 70) (145, 70)


#### Let's train CatBoost on clear data and take a look at the quality. We set a small learning rate to avoid overfitting when we start removing noisy objects.

In [19]:
params_model={
    'loss_function':'Logloss',
    'random_seed':42,
    'eval_metric':'Recall',
    'bootstrap_type':'Bernoulli',#'Bayesian',#'Bernoulli',#'Poisson',
    'l2_leaf_reg': 3,
    'early_stopping_rounds':4,
    'iterations':50,         
     'verbose':False,
    'depth': 1,
    'learning_rate': 0.3,    
}

In [20]:
indices, scores = cb.get_object_importance(
    train_pool,
    validation_pool,
    #importance_values_sign='Negative',
    importance_values_sign='Positive'
    # Positive values means that the optimized metric
    # value is increase because of given train objects.
    # So here we get the indices of bad train objects.
)

#### Let's inject random noise into 10% of training labels:

#### And train CatBoost on noisy data and take a look at the quality:

#### Now let's sample random 500 validate objects (because counting object importance on the entire validation dataset can take a long time) and calculate the train objects importance for these validation objects:

In [17]:
np.random.seed(42)
test_idx = np.random.choice(np.arange(y_validation.shape[0]), size=500, replace=False)
validation_pool_sampled = Pool(X_validation[test_idx], y_validation[test_idx], cat_features=cat_features)

indices, scores = cb.get_object_importance(
    validation_pool_sampled,
    train_pool_noisy,
    importance_values_sign='Positive' # Positive values means that the optimized metric
                                      # value is increase because of given train objects.
                                      # So here we get the indices of bad train objects.
)

ValueError: Cannot take a larger sample than population when 'replace=False'

#### Finally, in a loop, let's remove noisy objects in batches, retrain the model, and see how the quality on the test dataset improves:

In [None]:
def train_and_print_score(train_indices, remove_object_count):
    cb.fit(X_train[train_indices], y_train_noisy[train_indices], cat_features=cat_features)
    metric_value = cb.eval_metrics(validation_pool, ['RMSE'])['RMSE'][-1]
    s = 'RMSE on validation datset when {} harmful objects from train are dropped: {}'
    print(s.format(remove_object_count, metric_value))

batch_size = 250
train_indices = np.full(X_train.shape[0], True)
train_and_print_score(train_indices, 0)
for batch_start_index in range(0, 2000, batch_size):
    train_indices[indices[batch_start_index:batch_start_index + batch_size]] = False
    train_and_print_score(train_indices, batch_start_index + batch_size)

#### Therefore, we have the following RMSE values on the validation dataset:
    
||RMSE on the validation dataset|
|-|-|
|Clear train dataset: | 0.22947301323494568|
|Noisy train dataset: | 0.24770929523786442|
|Purified train dataset: | 0.231598588484771|

#### $$So\ now\ you\ can\ try\ to\ clear\ the\ train\ dataset\ of\ noisy\ objects\ and\ get\ better\ quality!$$