In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [459]:
# Load the training and test tables from CSV files (paths are relative to the
# notebook's working directory); both frames are read with pandas defaults.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [1]:
from sklearn.metrics import recall_score, accuracy_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from lightgbm import Dataset, train
import gc 

def _recall_score(
    preds: np.ndarray, data, threshold: float=0.5):
    label = data.get_label()
    preds = preds.reshape(7, -1).T
    pred_label = np.argmax(preds, axis=1)
    rs = recall_score(pred_label, label, average='macro', zero_division=0)

    return '_recall_score', rs, True

# --- LightGBM: K-fold training followed by averaged test-set prediction ---
random_state = 42
n_splits = 10
num_class = 7  # number of target classes; also sizes the prediction accumulator
clfs = []
targets = ['crop']
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
categorical_feature = ['state', 'state_type','ISO3166-2-lvl4','municipality','county_type','municipality_type','region','county']
# Boosting-round budget per fold, indexed by fold number (one entry per split).
best_n_estimators = [300, 200, 200, 200, 400, 200, 200, 400, 400, 100]
X = train_df.drop(targets, axis=1, errors='ignore')
y = train_df[targets]
for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_dataset = Dataset(data=X_train, label=y_train, categorical_feature=categorical_feature)
    eval_dataset = Dataset(data=X_test, label=y_test, categorical_feature=categorical_feature)

    print(X_train.shape, y_train.shape)
    parameters = {
            'n_estimators' : best_n_estimators[i],
            'objective': 'multiclass',
            'boosting': 'dart',
            'feature_fraction': 0.5,
            'uniform_drop' : True,
            'max_depth' : 7,
            # NOTE(review): `reg_lambda` below is an alias of `lambda_l2`, so the
            # L2 penalty is given twice with different values -- confirm which
            # one is intended.
            'lambda_l2' : 0.01,
            # NOTE(review): no `bagging_fraction` is set; per the LightGBM docs
            # bagging needs a fraction < 1.0 to be enabled -- confirm intent.
            'bagging_freq': 125,
            'min_split_gain': 0.001,
            'num_class' : num_class,
            'drop_seed' : 7575,
            'random_seed' : 42,
            'verbose': -1,
            'reg_lambda': 8.2532317400459,
        }

    # NOTE(review): `verbose_eval` is not accepted by LightGBM >= 4.0; on such
    # versions pass `callbacks=[lightgbm.log_evaluation(100)]` instead.
    clf = train(params = parameters,
                   train_set = train_dataset,
                   verbose_eval = 100,
                   valid_sets=eval_dataset,
                   feval=_recall_score,
                   )
    clfs.append(clf)


# Size the accumulator from the very frame that is predicted on. The original
# used `sub`, which this cell never defines, so it only worked with leftover
# kernel state and could disagree with `test_df` in row count.
y_pred_lgb = np.zeros((test_df.shape[0], num_class))
scores = []
for n, clf in enumerate(clfs):
    y_pred_lgb += clf.predict(test_df)
    scores.append(clf.best_score['valid_0']['_recall_score'])

# Average over the models that were actually trained.
y_pred_lgb /= len(clfs)
print('_recall_score', np.mean(scores, dtype = 'float32'))

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from catboost import Pool, CatBoostClassifier, CatBoostRegressor 
import gc 
from sklearn.metrics import recall_score, accuracy_score

class RecallScore:
    """User-defined CatBoost evaluation metric: macro-averaged recall error.

    The reported value is ``1 - macro_recall``, i.e. an error where lower is
    better, which is why ``is_max_optimal`` returns ``False``.
    """

    @staticmethod
    def get_rs(pred_label, target):
        """Return the macro-averaged recall of ``pred_label`` against ``target``.

        ``target`` holds the true class ids and ``pred_label`` the predicted
        ones. scikit-learn expects ``(y_true, y_pred)``; passing them the
        other way round would yield macro precision instead of recall.
        """
        return recall_score(target, pred_label, average='macro', zero_division=0)

    def is_max_optimal(self):
        """Tell the caller that smaller metric values are better."""
        # The original had a bare `False` expression and therefore returned None.
        return False

    def evaluate(self, approxes, target, weight):
        """Compute the metric for one evaluation call.

        ``approxes`` is indexed by class first and object second; it is
        transposed so that each row holds one object's per-class scores, and
        the arg-max class is taken as the prediction. ``weight`` is ignored.

        Returns ``(error, 1)``: the error value and a constant weight of 1.
        """
        pred_label = np.argmax(np.array(approxes).T, axis=1)
        rs = 1 - self.get_rs(pred_label, target)

        return rs, 1

    def get_final_error(self, error, weight):
        """Return the accumulated error unchanged (``weight`` is ignored)."""
        return error
    
# --- CatBoost: stratified K-fold training followed by averaged prediction ---
# NOTE(review): `X`, `y`, `cat_features`, `sub` and `final_features` are not
# defined in this cell; they must already exist in the kernel -- confirm they
# are created by earlier cells on a fresh Restart & Run All.
random_state = 42
n_splits = 10
clfs = []
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
for train_index, test_index in kf.split(X, y):
    # Drop the optional 'weight' column if present (errors='ignore' makes this
    # a no-op when the column does not exist).
    X_train = X.iloc[train_index].drop('weight', axis=1, errors = 'ignore')
    X_test = X.iloc[test_index].drop('weight', axis=1, errors = 'ignore')
    y_train = y.iloc[train_index].drop('weight', axis=1, errors = 'ignore')
    y_test = y.iloc[test_index].drop('weight', axis=1, errors = 'ignore')

    train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features)
    eval_dataset = Pool(data=X_test, label=y_test, cat_features=cat_features)

    clf = CatBoostClassifier(iterations = 10000,
                                loss_function='MultiClass',
                                cat_features = cat_features,
                                random_seed = random_state,
                                learning_rate = 0.005486411424635638,
                                colsample_bylevel = 0.4152157026818,
                                subsample = 0.9563761143682146,
                                l2_leaf_reg = 9.178962968420354,
                                min_data_in_leaf = 243,
                                bootstrap_type='Bernoulli',
                                max_bin = 187,
                                task_type='CPU',
                                early_stopping_rounds = 500,
                                eval_metric=RecallScore())

    clf.fit(train_dataset, eval_set=eval_dataset,
                verbose = 200, use_best_model = True, plot = False)
    # Register the model only after a successful fit so a failed fold never
    # leaves an unfitted model in the ensemble.
    clfs.append(clf)

y_pred_catboost = np.zeros((sub.shape[0], 7))
scores = []
for n, clf in enumerate(clfs):
    y_pred_catboost += clf.predict_proba(sub[final_features])
    # Read the score of the current model (the original always read clfs[0]).
    # The metric stores 1 - recall, so convert it back to recall here.
    scores.append(1 - clf.get_best_score()['validation']['RecallScore'])

# Average over the models that were actually trained.
y_pred_catboost /= len(clfs)
# `scores` is a plain list of floats; the original `scores['Recall']` raised
# a TypeError.
print('mean Recall', np.mean(scores, dtype = 'float32'), f'По {len(clfs)} моделям')

In [472]:
# Build the submission: blend the two ensembles' class probabilities
# (5% CatBoost, 95% LightGBM) and keep the highest-scoring class per row.
sub_num = 'best_ever'
sample_solution = pd.read_csv('sample_solution.csv')
sample_solution['crop'] = (
    0.05*y_pred_catboost + 0.95*y_pred_lgb
).argmax(axis=1)

# Write the file without the index column.
sample_solution.to_csv(f'livington_sub_{sub_num}.csv', index=False)