In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error



In [2]:
# Read the pre-folded training set, the competition test set, and the
# submission template (in that order) from the Kaggle input directories.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/blueberry-folds/blueberry_train_folds.csv",
        "/kaggle/input/playground-series-s3e14/test.csv",
        "/kaggle/input/playground-series-s3e14/sample_submission.csv",
    )
)

In [3]:
# Show the column names; besides the inputs, the frame carries 'id',
# the target 'yield', and the fold label 'kfold'.
train.columns

Index(['id', 'clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
       'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange',
       'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange',
       'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds',
       'yield', 'kfold'],
      dtype='object')

In [4]:
# Preview the first five rows (the default row count of head()).
train.head()

Unnamed: 0,id,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield,kfold
0,0,25.0,0.5,0.25,0.75,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887,4476.81146,4
1,1,25.0,0.5,0.25,0.5,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317,5548.12201,3
2,2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781,6869.7776,2
3,3,12.5,0.25,0.25,0.63,0.5,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561,6880.7759,3
4,4,25.0,0.5,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512,7479.93417,1


In [5]:
# Model inputs are every column except the row id, the fold label, and the target.
non_feature_cols = ('id', 'kfold', 'yield')
useful_features = [col for col in train.columns if col not in non_feature_cols]

In [6]:
# Cross-validated training: for each fold label, fit a CatBoost regressor on
# the other folds, report MAE on the held-out fold, and keep that model's
# test-set predictions so they can be averaged afterwards.

# Predict on exactly the columns the model is trained on (same names, same
# order), so that any extra column in the test frame (e.g. an id) is never
# passed to the model. The selection does not depend on the fold, so it is
# done once outside the loop.
xtest = test[useful_features]

final_preds = []
for fold in range(5):
    # Rows labelled with the current fold are held out for validation.
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain['yield']
    yvalid = xvalid['yield']

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # A different seed per fold, so the models being averaged are not
    # initialised identically.
    model = CatBoostRegressor(task_type='GPU', random_seed=fold, verbose=200)
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_preds.append(preds_test)
    print(fold, mean_absolute_error(yvalid, preds_valid))
    

Learning rate set to 0.056075
0:	learn: 1279.3394747	total: 130ms	remaining: 2m 10s
200:	learn: 552.4240053	total: 8.72s	remaining: 34.7s
400:	learn: 547.1080231	total: 15.2s	remaining: 22.8s
600:	learn: 543.1259143	total: 22.4s	remaining: 14.9s
800:	learn: 539.9567723	total: 28.7s	remaining: 7.13s
999:	learn: 537.5801874	total: 36.3s	remaining: 0us
0 348.9541485120016
Learning rate set to 0.056075
0:	learn: 1278.7853283	total: 36.1ms	remaining: 36.1s
200:	learn: 547.4784308	total: 6.33s	remaining: 25.2s
400:	learn: 540.6084140	total: 12.6s	remaining: 18.8s
600:	learn: 537.7421111	total: 19.7s	remaining: 13.1s
800:	learn: 535.7624544	total: 27.2s	remaining: 6.76s
999:	learn: 534.2240098	total: 33s	remaining: 0us
1 354.87278712063534
Learning rate set to 0.056075
0:	learn: 1278.8458864	total: 23.9ms	remaining: 23.9s
200:	learn: 550.9965938	total: 6.22s	remaining: 24.7s
400:	learn: 543.6923435	total: 13.1s	remaining: 19.6s
600:	learn: 540.2048964	total: 20.2s	remaining: 13.4s
800:	learn:

In [7]:
# Average the per-fold test predictions: one column per fold, mean across columns.
preds = np.stack(final_preds, axis=1).mean(axis=1)

In [9]:
# Put the averaged predictions into the submission template and save it.
# NOTE(review): assumes sample_submission rows are in the same order as test — confirm.
submission_path = "baseline-submission.csv"
sample_submission['yield'] = preds
sample_submission.to_csv(submission_path, index=False)