### Модель NotNaN

Исключение демографических данных не привело к существенному росту скорости. Нужно разумно уменьшить массив.

**Гипотеза:** исключение все строк, содержащих NaN, приведет одновременно и к росту скорости обучения, и к росту точности прогнозов.

In [1]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score #, mean_squared_error
import pandas as pd
import os
os.chdir("D:/demo/ML/demo_automacon")

from src.utils import prepare_training_data

baseline_df = pd.read_csv('data/baseline_dataset.csv', index_col=0)
baseline_df.head()

Unnamed: 0,com_code,upc,descrip,size,store,week,move,price,sale,age60,...,educ,ethnic,income,hhlarge,workwom,hval150,sstrdist,sstrvol,cpdist5,cpwvol5
0,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,306,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
1,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,307,1,2.99,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
2,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,308,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
3,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,309,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
4,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,310,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417


In [3]:
baseline_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7173063 entries, 0 to 7173062
Data columns (total 21 columns):
 #   Column    Dtype  
---  ------    -----  
 0   com_code  int64  
 1   upc       int64  
 2   descrip   object 
 3   size      object 
 4   store     int64  
 5   week      int64  
 6   move      int64  
 7   price     float64
 8   sale      object 
 9   age60     float64
 10  age9      float64
 11  educ      float64
 12  ethnic    float64
 13  income    float64
 14  hhlarge   float64
 15  workwom   float64
 16  hval150   float64
 17  sstrdist  float64
 18  sstrvol   float64
 19  cpdist5   float64
 20  cpwvol5   float64
dtypes: float64(13), int64(5), object(3)
memory usage: 1.2+ GB


In [4]:
baseline_df.dropna(inplace=True)
baseline_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6911180 entries, 0 to 7173020
Data columns (total 21 columns):
 #   Column    Dtype  
---  ------    -----  
 0   com_code  int64  
 1   upc       int64  
 2   descrip   object 
 3   size      object 
 4   store     int64  
 5   week      int64  
 6   move      int64  
 7   price     float64
 8   sale      object 
 9   age60     float64
 10  age9      float64
 11  educ      float64
 12  ethnic    float64
 13  income    float64
 14  hhlarge   float64
 15  workwom   float64
 16  hval150   float64
 17  sstrdist  float64
 18  sstrvol   float64
 19  cpdist5   float64
 20  cpwvol5   float64
dtypes: float64(13), int64(5), object(3)
memory usage: 1.1+ GB


In [5]:
X_train, X_test, y_train, y_test = prepare_training_data(baseline_df, 
                cat_cols=['descrip', 'size', 'sale'],
                cols2drop=['com_code', 'upc', 'store'],
                target_cols=['move'])

clf = CatBoostRegressor(random_state=42,
                        cat_features=['descrip', 'size', 'sale'],
                        #task_type='GPU'
                        )


clf.fit( X_train, y_train,
         eval_set=(X_test, y_test),
         early_stopping_rounds=200,
         verbose=50, plot=False)

cat_features : 3 ['descrip', 'size', 'sale']
num_features : 14 ['week', 'price', 'age60', 'age9', 'educ', 'ethnic', 'income', 'hhlarge', 'workwom', 'hval150', 'sstrdist', 'sstrvol', 'cpdist5', 'cpwvol5']
targets ['move']
Learning rate set to 0.197011
0:	learn: 2.2334677	test: 2.3325369	best: 2.3325369 (0)	total: 1.73s	remaining: 28m 50s
50:	learn: 1.6548630	test: 1.7910076	best: 1.7910076 (50)	total: 50.7s	remaining: 15m 43s
100:	learn: 1.5927336	test: 1.7350536	best: 1.7350536 (100)	total: 1m 42s	remaining: 15m 10s
150:	learn: 1.5588385	test: 1.7080584	best: 1.7080584 (150)	total: 2m 37s	remaining: 14m 42s
200:	learn: 1.5327872	test: 1.6876541	best: 1.6876541 (200)	total: 3m 32s	remaining: 14m 6s
250:	learn: 1.5089165	test: 1.6730013	best: 1.6730013 (250)	total: 4m 29s	remaining: 13m 23s
300:	learn: 1.4855073	test: 1.6626596	best: 1.6626596 (300)	total: 5m 25s	remaining: 12m 36s
350:	learn: 1.4668784	test: 1.6427660	best: 1.6427660 (350)	total: 6m 20s	remaining: 11m 44s
400:	learn: 1.

<catboost.core.CatBoostRegressor at 0x2c0719f70b0>

In [6]:
clf.save_model('model/01-NotNaN.cbm', format="cbm")

In [7]:
# инференс модели
y_pred = clf.predict(X_test)
print(f'\n Accuracy: {accuracy_score(y_test, y_pred.astype(int))}')


 Accuracy: 0.7570646401916894


In [8]:
fi = clf.get_feature_importance(prettified=True)
fi

Unnamed: 0,Feature Id,Importances
0,price,29.18998
1,descrip,24.690622
2,week,13.526707
3,size,11.051347
4,sale,6.42426
5,sstrvol,1.854362
6,ethnic,1.825591
7,cpwvol5,1.770176
8,age60,1.690582
9,educ,1.331589


### Тестирование с половиной датасета

И все же очень интересно, можно ли без дополнительной настройки параметров ускорить CatBoost на baseline датасете. Возьмем 15% данных!

In [2]:
X_train, X_test, y_train, y_test = prepare_training_data(baseline_df, test_size=0.85,
                cat_cols=['descrip', 'size', 'sale'],
                cols2drop=['com_code', 'upc', 'store'],
                target_cols=['move'])

clf = CatBoostRegressor(random_state=42,
                        cat_features=['descrip', 'size', 'sale'],
                        #task_type='GPU'
                        )


clf.fit( X_train, y_train,
         eval_set=(X_test, y_test),
         early_stopping_rounds=200,
         verbose=50, plot=False)

cat_features : 3 ['descrip', 'size', 'sale']
num_features : 14 ['week', 'price', 'age60', 'age9', 'educ', 'ethnic', 'income', 'hhlarge', 'workwom', 'hval150', 'sstrdist', 'sstrvol', 'cpdist5', 'cpwvol5']
targets ['move']
Learning rate set to 0.152366
0:	learn: 2.1941129	test: 2.3234813	best: 2.3234813 (0)	total: 1.34s	remaining: 22m 24s
50:	learn: 1.5724934	test: 1.7638531	best: 1.7638531 (50)	total: 32.8s	remaining: 10m 9s
100:	learn: 1.5206379	test: 1.7285551	best: 1.7285551 (100)	total: 1m 9s	remaining: 10m 20s
150:	learn: 1.4889635	test: 1.7078684	best: 1.7078684 (150)	total: 1m 44s	remaining: 9m 45s
200:	learn: 1.4644583	test: 1.6927338	best: 1.6927338 (200)	total: 2m 24s	remaining: 9m 34s


: 

#### Вывод
Все равно долго: 20 мин. Нужно настраивать сам CatBoost.