In [1]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score #, mean_squared_error
import pandas as pd
import os

os.chdir("D:/demo/ML/demo_automacon")

baseline_df = pd.read_csv('data/baseline_dataset.csv', index_col=0)
baseline_df.head()

Unnamed: 0,com_code,upc,descrip,size,store,week,move,price,sale,age60,...,educ,ethnic,income,hhlarge,workwom,hval150,sstrdist,sstrvol,cpdist5,cpwvol5
0,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,306,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
1,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,307,1,2.99,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
2,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,308,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
3,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,309,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
4,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,310,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417


In [2]:
baseline_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7173063 entries, 0 to 7173062
Data columns (total 21 columns):
 #   Column    Dtype  
---  ------    -----  
 0   com_code  int64  
 1   upc       int64  
 2   descrip   object 
 3   size      object 
 4   store     int64  
 5   week      int64  
 6   move      int64  
 7   price     float64
 8   sale      object 
 9   age60     float64
 10  age9      float64
 11  educ      float64
 12  ethnic    float64
 13  income    float64
 14  hhlarge   float64
 15  workwom   float64
 16  hval150   float64
 17  sstrdist  float64
 18  sstrvol   float64
 19  cpdist5   float64
 20  cpwvol5   float64
dtypes: float64(13), int64(5), object(3)
memory usage: 1.2+ GB


## Обучение baseline модели
1. Фильтруем признаки
2. Обучаем CatBoost
3. Сохраняем модель
4. Выводы

In [3]:
features2drop = ['com_code', 'upc', 'store'] 
targets = ['move']  
cat_features = ['descrip', 'size', 'sale'] 

filtered_features = [i for i in baseline_df.columns if (i not in targets and i not in features2drop)]
num_features = [i for i in filtered_features if i not in cat_features]


print('cat_features :', len(cat_features), cat_features)
print('num_features :', len(num_features), num_features)
print('targets', targets)

cat_features : 3 ['descrip', 'size', 'sale']
num_features : 14 ['week', 'price', 'age60', 'age9', 'educ', 'ethnic', 'income', 'hhlarge', 'workwom', 'hval150', 'sstrdist', 'sstrvol', 'cpdist5', 'cpwvol5']
targets ['move']


In [4]:
X = baseline_df[filtered_features].drop(targets, axis=1, errors='ignore')
y = baseline_df['move']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = CatBoostRegressor(random_state=42,
                        cat_features=cat_features,
                        #task_type='GPU'
                        )


clf.fit( X_train, y_train,
         eval_set=(X_test, y_test),
         early_stopping_rounds=200,
         verbose=50, plot=False)



Learning rate set to 0.158147
0:	learn: 2.2470799	test: 2.4147563	best: 2.4147563 (0)	total: 4.77s	remaining: 1h 19m 24s
50:	learn: 1.6627764	test: 1.8816990	best: 1.8816990 (50)	total: 1m	remaining: 18m 51s
100:	learn: 1.6200910	test: 1.8440723	best: 1.8440723 (100)	total: 2m	remaining: 17m 52s
150:	learn: 1.5901528	test: 1.8230255	best: 1.8230255 (150)	total: 3m 1s	remaining: 17m 1s
200:	learn: 1.5614709	test: 1.8003202	best: 1.8003202 (200)	total: 3m 58s	remaining: 15m 49s
250:	learn: 1.5391549	test: 1.7867110	best: 1.7867110 (250)	total: 4m 56s	remaining: 14m 44s
300:	learn: 1.5193307	test: 1.7747330	best: 1.7747330 (300)	total: 5m 57s	remaining: 13m 50s
350:	learn: 1.5079974	test: 1.7647857	best: 1.7647857 (350)	total: 6m 57s	remaining: 12m 51s
400:	learn: 1.4972848	test: 1.7571181	best: 1.7571181 (400)	total: 7m 58s	remaining: 11m 55s
450:	learn: 1.4859126	test: 1.7441103	best: 1.7441103 (450)	total: 8m 57s	remaining: 10m 54s
500:	learn: 1.4757335	test: 1.7380635	best: 1.7379946 

<catboost.core.CatBoostRegressor at 0x197f1843440>

In [6]:
# инференс модели
y_pred = clf.predict(X_test)
print(f'\n Accuracy: {accuracy_score(y_test, y_pred.astype(int))}')


 Accuracy: 0.7582274801636399


In [7]:
fi = clf.get_feature_importance(prettified=True)
fi

Unnamed: 0,Feature Id,Importances
0,price,30.487298
1,descrip,23.051027
2,week,12.909588
3,sale,9.456202
4,size,8.400924
5,sstrvol,2.415333
6,ethnic,1.967587
7,hhlarge,1.428075
8,age60,1.414026
9,cpwvol5,1.272081


#### Выводы по baseline модели

- Достигнута точность в 0.758 правильных прогнозов
- Долго обучается (Intel Xeon E5-2670 v3, 3GHz, 32GB RAM) -> нужно сократить число фич
- Самые важные фичи: price, descrip, week, sale, size

Что имеет смысл сделать в следующей версии модели:
- Добавить данные о праздниках
- Разбить size на две фичи: число и тип (таблетка, унция, миллилитр и т.п.)
- Визуализировать корреляцию/совстречаемость признаков и их влияние на таргет
- Уточнить границы понятия "спрос": сейчас пытаемся предсказать спрос на конкретное количество таблеток для каждой марки лекарства, не факт, что с точки зрения бизнеса это самое важное
- Убрать демографические признаки, либо признаки со значимостью ниже 1.5