### Модель Baseline v2

Главной проблемой первой версии модели стало время обучения. Я мало что наэкспериментирую в рамках тестового задания, когда один цикл обучения модели занимает полчаса. Поэтому уточним понятие спроса, чтобы вписаться в имеющиеся вычислительные мощности с минимальным отступом от интересов бизнеса.

Сейчас модель прогнозирует спрос на каждый размер упаковки каждого наименования лекарства. Получается слишком большое дерево признаков. Попробуем построить модель для какой-либо одной, наиболее распространенной позиции товара (ниже для этого отобраны два самых частых UPC — ADVIL и ADVIL COATED CAPLETS, 100 CT).

In [1]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score #, mean_squared_error
import pandas as pd
import os
# Switch the working directory so that the relative 'data/...' paths used below resolve.
# NOTE(review): hard-coded absolute Windows path — works only on a machine with this
# exact layout; presumably it is also what makes `src.utils` importable — confirm.
os.chdir("D:/demo/ML/demo_automacon")

from src.utils import prepare_training_data

# Load the baseline dataset; the first CSV column is used as the DataFrame index.
baseline_df = pd.read_csv(
    'data/baseline_dataset.csv',
    index_col=0,
)
# Display the first rows as a quick check of the loaded frame.
baseline_df.head()

Unnamed: 0,com_code,upc,descrip,size,store,week,move,price,sale,age60,...,educ,ethnic,income,hhlarge,workwom,hval150,sstrdist,sstrvol,cpdist5,cpwvol5
0,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,306,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
1,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,307,1,2.99,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
2,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,308,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
3,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,309,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
4,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,310,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417


In [9]:
# Count how many rows each product code (UPC) has, most frequent first.
baseline_df['upc'].value_counts()

upc
30573015040    32110
30573016040    32110
30045046850    32109
3828161017     32109
31284310117    32109
               ...  
30067013663       57
30536374512       36
3680029694        24
30573031506        6
31284316520        3
Name: count, Length: 640, dtype: int64

In [10]:
# Keep only the rows belonging to the two UPCs with the most observations.
baseline_df = baseline_df[baseline_df['upc'].isin([30573015040, 30573016040])]
# Display five random rows of the filtered frame.
baseline_df.sample(5)

Unnamed: 0,com_code,upc,descrip,size,store,week,move,price,sale,age60,...,educ,ethnic,income,hhlarge,workwom,hval150,sstrdist,sstrvol,cpdist5,cpwvol5
3255399,953,30573016040,ADVIL COATED CAPLETS,100 CT,106,159,1,7.73,N,0.109887,...,0.157939,0.190586,10.506846,0.146639,0.367483,0.099963,6.250903,0.904762,0.991095,0.103492
3234203,953,30573016040,ADVIL COATED CAPLETS,100 CT,8,302,5,7.69,B,0.252394,...,0.095173,0.035243,10.59701,0.13175,0.283075,0.054227,2.636333,1.5,2.905384,0.641016
3236315,953,30573016040,ADVIL COATED CAPLETS,100 CT,28,84,0,0.0,N,0.213309,...,0.233163,0.055935,10.798534,0.103666,0.389059,0.44514,2.912922,1.727273,1.820777,0.814221
3078115,953,30573015040,ADVIL,100 CT,131,365,3,8.99,N,0.170655,...,0.271396,0.074656,10.793537,0.0901,0.414841,0.455995,3.159728,0.625,1.490036,0.383227
3064289,953,30573015040,ADVIL,100 CT,88,365,1,8.99,N,0.160414,...,0.151633,0.142928,10.549805,0.135168,0.4019,0.189573,4.981955,1.6,2.087539,0.489797


In [11]:
# Build feature/target sets with the project helper (presumably a train/test
# split, judging by the returned names — confirm in src.utils).
X_train, X_test, y_train, y_test = prepare_training_data(
    baseline_df,
    cat_cols=['size', 'sale'],
    cols2drop=['descrip', 'com_code', 'upc', 'store'],
    target_cols=['move'],
)

# Gradient-boosted regressor; 'size' and 'sale' are declared as categorical features.
clf = CatBoostRegressor(
    cat_features=['size', 'sale'],
    random_state=42,
    # task_type='GPU'
)

# Fit with (X_test, y_test) as the evaluation set used for early stopping;
# progress is logged every 50 iterations.
# NOTE(review): the same split is reused for scoring in a later cell, so the
# reported score is not from data unseen during model selection.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=200,
    verbose=50,
    plot=False,
)

cat_features : 2 ['size', 'sale']
num_features : 14 ['week', 'price', 'age60', 'age9', 'educ', 'ethnic', 'income', 'hhlarge', 'workwom', 'hval150', 'sstrdist', 'sstrvol', 'cpdist5', 'cpwvol5']
targets ['move']
Learning rate set to 0.094512
0:	learn: 2.4641785	test: 2.5041677	best: 2.5041677 (0)	total: 29.4ms	remaining: 29.4s
50:	learn: 1.7793218	test: 1.8763529	best: 1.8763529 (50)	total: 1.38s	remaining: 25.6s
100:	learn: 1.7369603	test: 1.8444711	best: 1.8444711 (100)	total: 2.74s	remaining: 24.4s
150:	learn: 1.7113970	test: 1.8281604	best: 1.8281604 (150)	total: 4.11s	remaining: 23.1s
200:	learn: 1.6904217	test: 1.8114042	best: 1.8114042 (200)	total: 5.44s	remaining: 21.6s
250:	learn: 1.6743359	test: 1.8034221	best: 1.8034221 (250)	total: 6.8s	remaining: 20.3s
300:	learn: 1.6607927	test: 1.7982281	best: 1.7979386 (297)	total: 8.15s	remaining: 18.9s
350:	learn: 1.6509399	test: 1.7940027	best: 1.7940027 (350)	total: 9.5s	remaining: 17.6s
400:	learn: 1.6423995	test: 1.7929267	best: 1.7

<catboost.core.CatBoostRegressor at 0x1d031537b00>

In [14]:
# Predict the target for the evaluation split.
y_pred = clf.predict(X_test)
# NOTE(review): clf is a CatBoostRegressor, while accuracy_score is a classification
# metric — here it only counts exact matches after the float predictions are cast to
# int. astype(int) truncates toward zero instead of rounding. A regression metric
# (e.g. the mean_squared_error commented out in the imports) may be more informative.
print(f'\n Accuracy: {accuracy_score(y_test, y_pred.astype(int))}')


 Accuracy: 0.4547648707567736


#### Вывод
Слишком мало данных, чтобы с ходу получить хорошую точность по отдельному лекарству.

### Попробуем полный массив данных, но без демографии
Включим идентификатор магазина `store` в модель как категориальный признак.

In [15]:
# Reload the complete dataset from disk; the first CSV column becomes the index.
baseline_df = pd.read_csv(
    'data/baseline_dataset.csv',
    index_col=0,
)
# Display the first rows of the reloaded frame.
baseline_df.head()

Unnamed: 0,com_code,upc,descrip,size,store,week,move,price,sale,age60,...,educ,ethnic,income,hhlarge,workwom,hval150,sstrdist,sstrvol,cpdist5,cpwvol5
0,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,306,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
1,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,307,1,2.99,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
2,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,308,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
3,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,309,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417
4,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,310,0,0.0,N,0.149192,...,0.087712,0.425324,10.140613,0.144374,0.296353,0.09633,3.55838,0.909091,1.075632,0.214417


In [2]:
# Keep only the first nine columns; every column from position 9 onward is dropped.
baseline_df = baseline_df.drop(labels=baseline_df.columns[9:], axis='columns')
# Display the first rows of the reduced frame.
baseline_df.head()

Unnamed: 0,com_code,upc,descrip,size,store,week,move,price,sale
0,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,306,0,0.0,N
1,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,307,1,2.99,N
2,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,308,0,0.0,N
3,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,309,0,0.0,N
4,953,1192603016,CAFFEDRINE CAPLETS 1,16 CT,76,310,0,0.0,N


In [3]:
# Add a string-typed copy of the store id so it can be passed as a categorical feature.
baseline_df['store_str'] = baseline_df['store'].astype(str)
# Print the column dtypes and memory footprint of the frame.
baseline_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7173063 entries, 0 to 7173062
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   com_code   int64  
 1   upc        int64  
 2   descrip    object 
 3   size       object 
 4   store      int64  
 5   week       int64  
 6   move       int64  
 7   price      float64
 8   sale       object 
 9   store_str  object 
dtypes: float64(1), int64(5), object(4)
memory usage: 602.0+ MB


In [5]:
# Build feature/target sets with the project helper (presumably a train/test
# split, judging by the returned names — confirm in src.utils).
X_train, X_test, y_train, y_test = prepare_training_data(
    baseline_df,
    cat_cols=['descrip', 'size', 'sale', 'store_str'],
    cols2drop=['com_code', 'upc', 'store'],
    target_cols=['move'],
)

# Gradient-boosted regressor; product description, size, sale flag and the
# string store id are declared as categorical features.
clf = CatBoostRegressor(
    cat_features=['descrip', 'size', 'sale', 'store_str'],
    random_state=42,
    # task_type='GPU'
)

# Fit with (X_test, y_test) as the evaluation set used for early stopping;
# progress is logged every 50 iterations.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=200,
    verbose=50,
    plot=False,
)

cat_features : 4 ['descrip', 'size', 'sale', 'store_str']
num_features : 2 ['week', 'price']
targets ['move']
Learning rate set to 0.198165
0:	learn: 2.2353693	test: 2.4028039	best: 2.4028039 (0)	total: 1.87s	remaining: 31m 5s
50:	learn: 1.6362068	test: 1.8541021	best: 1.8541021 (50)	total: 56.2s	remaining: 17m 24s
100:	learn: 1.5843708	test: 1.8093547	best: 1.8093547 (100)	total: 2m 2s	remaining: 18m 7s
150:	learn: 1.5542016	test: 1.7860015	best: 1.7860015 (150)	total: 3m 5s	remaining: 17m 24s
200:	learn: 1.5313789	test: 1.7740269	best: 1.7740269 (200)	total: 4m 11s	remaining: 16m 40s
250:	learn: 1.5103708	test: 1.7629523	best: 1.7629523 (250)	total: 5m 12s	remaining: 15m 32s
300:	learn: 1.4990255	test: 1.7536684	best: 1.7536684 (300)	total: 6m 17s	remaining: 14m 36s
350:	learn: 1.4899869	test: 1.7477979	best: 1.7477979 (350)	total: 7m 18s	remaining: 13m 31s
400:	learn: 1.4787775	test: 1.7400274	best: 1.7400274 (400)	total: 8m 25s	remaining: 12m 34s


: 

#### Вывод
Отсутствие демографической информации не дает существенного выигрыша в скорости обучения. Нужно либо уменьшать датасет (что может снизить достигаемую точность), либо менять модель на более быструю.
1. Строим датасет без NaN и смотрим, насколько меньше он получится.
2. Обучаем LightGBM