In [28]:
import pandas as pd
import numpy as np

NUM_OF_ITERATIONS = 50

# читаем размеченные данные из файла
beer_dataset = pd.read_excel('beer_data_set.xlsx')

data = [item for item in beer_dataset[['SKU_NAME']].values.flatten()]
volumes = beer_dataset['Объем'].values.flatten()

# выводим несколько первых строк таблицы
beer_dataset.head()

Unnamed: 0,barcode,SKU_NAME,Наименование,Алкоголь,Объем,Производитель,Бренд,Саб-бренд,Тип упаковки,Мультипак,тип
0,4600721002206,Пиво БагБир 0.5л ст/бут,BAGBIER - светлое - 4.2% - 0.5л стекло,0.042,0.5,AB InBev,BAGBIER,,Стекло,,Светлое
1,4600721003197,Пиво БАГ-БИР св.ст/б 0.5л,BAGBIER - светлое - 4.2% - 0.5л стекло,0.042,0.5,AB InBev,BAGBIER,,Стекло,,Светлое
2,4600721003203,"Пиво BAGBIER светлое 4,9% 1.5л",BAGBIER - светлое - 4.9% - 1.5л пэт,0.049,1.5,AB InBev,BAGBIER,,ПЭТ,,Светлое
3,4600721005191,Пиво БАГ-БИР св.ПЭТ 2.5л,BAGBIER - светлое - 4.2% - 2.5л пэт,0.042,2.5,AB InBev,BAGBIER,,ПЭТ,,Светлое
4,4600721009366,"Пиво БАГ БИР ГОЛЬДЕН светлое ПЭТ 4% 1,5л",BAGBIER GOLDEN - светлое - 4.7% - 1.5л пэт,0.047,1.5,AB InBev,BAGBIER,Golden,ПЭТ,,Светлое


In [29]:
import re 
import random
import time

random.seed(time.time())
analyzer = 'word'
#token_pattern = r"(?u)\b\w\w+\b|[0-9]+[0-9\.,]+\w+"
token_pattern = r"(?u)[0-9]+[0-9\.,]+\w+|[0-9]+\w"

def tokenize(items, token_pattern):
    pat = re.compile(token_pattern)
    tokens = []
    for item in items:
        tokens.append([match.group() for match in re.finditer(pat, item)])
    data = zip(items, tokens)
    return pd.DataFrame(data, columns=['Строка', 'Токены'])

tokenize(random.sample(data, 10), token_pattern)

Unnamed: 0,Строка,Токены
0,Пиво ЛЕДОКОЛ свет.8% ж/б 0.5л,[0.5л]
1,Пиво ДОН ЖИВОЕ св.непаст.4% 0.47л,[0.47л]
2,KOFF - светлое 4.5% - 0.44л стекло,"[4.5, 0.44л]"
3,БАЛТИКА - светлое 0% - 0.5л ж.б.,[0.5л]
4,ZHIGULYOVSKOE 2 1L P BO L X,[1L]
5,Пиво CORONA EXTRA MEX.бут 0.33л,[0.33л]
6,ZHIGULYOVSK.(BRYANSKPIVO) 0.5L J BO L X,[0.5L]
7,ХАМОВНИКИ - светлое 4.5% - 0.5л стекло,"[4.5, 0.5л]"
8,FAXE - светлое 4.9% - 0.48л ж.б.,"[4.9, 0.48л]"
9,"Пиво ЗОЛ.БОЧКА ПШЕНИЧ.5,3% ст/б 0.5л","[5,3, 0.5л]"


In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, CatBoostClassifier
from joblib import load

def dense_vectors(vectors):
    return np.array([np.asarray(item.todense())[0] for item in vectors])

train_data, test_data, train_volumes, test_volumes = train_test_split(data, volumes, test_size=0.33, random_state=42)

def enrich(vectors, data):
    container_predictor = CatBoostClassifier()
    container_predictor.load_model('beer_container_catboost_400.cbm')
    vect = load('beer_container_catboost_400_vect.job')
    data = dense_vectors(vect.transform(data))
    padding = container_predictor.predict(data)
    enriched = np.concatenate((vectors, padding), axis=1)
    return enriched

def build_model(data, volumes, iterations=200):
        vectorizer = CountVectorizer(lowercase=True, token_pattern=token_pattern)
        
        compressed_data = vectorizer.fit_transform(data)
        vectorized_data = dense_vectors(compressed_data)
        enriched_data = enrich(vectorized_data, data)
        print(enriched_data.shape)
        reg = CatBoostClassifier(iterations=iterations)
        reg.fit(enriched_data, volumes, silent=False)
        
        return (vectorizer, reg)


In [31]:
model = build_model(train_data, train_volumes, iterations=300)

(3902, 217)
0:	learn: -3.2117047	total: 182ms	remaining: 54.5s
1:	learn: -2.8973649	total: 356ms	remaining: 53s
2:	learn: -2.6859763	total: 549ms	remaining: 54.4s
3:	learn: -2.5072487	total: 719ms	remaining: 53.2s
4:	learn: -2.3665727	total: 845ms	remaining: 49.9s
5:	learn: -2.2590021	total: 900ms	remaining: 44.1s
6:	learn: -2.1524072	total: 957ms	remaining: 40s
7:	learn: -2.0615065	total: 1.01s	remaining: 37s
8:	learn: -1.9873080	total: 1.07s	remaining: 34.8s
9:	learn: -1.9152530	total: 1.13s	remaining: 32.8s
10:	learn: -1.8468918	total: 1.18s	remaining: 31s
11:	learn: -1.7820973	total: 1.24s	remaining: 29.7s
12:	learn: -1.7310553	total: 1.3s	remaining: 28.6s
13:	learn: -1.6763270	total: 1.35s	remaining: 27.6s
14:	learn: -1.6245082	total: 1.41s	remaining: 26.7s
15:	learn: -1.5784614	total: 1.47s	remaining: 26.2s
16:	learn: -1.5388579	total: 1.54s	remaining: 25.6s
17:	learn: -1.5047806	total: 1.59s	remaining: 24.9s
18:	learn: -1.4694432	total: 1.65s	remaining: 24.3s
19:	learn: -1.43329

159:	learn: -0.4002345	total: 15.6s	remaining: 13.7s
160:	learn: -0.3992833	total: 15.9s	remaining: 13.7s
161:	learn: -0.3982952	total: 16s	remaining: 13.7s
162:	learn: -0.3974041	total: 16.3s	remaining: 13.7s
163:	learn: -0.3959731	total: 16.5s	remaining: 13.7s
164:	learn: -0.3952075	total: 16.6s	remaining: 13.6s
165:	learn: -0.3942399	total: 16.7s	remaining: 13.5s
166:	learn: -0.3933321	total: 16.8s	remaining: 13.3s
167:	learn: -0.3918039	total: 16.8s	remaining: 13.2s
168:	learn: -0.3906413	total: 16.9s	remaining: 13.1s
169:	learn: -0.3893478	total: 17s	remaining: 13s
170:	learn: -0.3880017	total: 17s	remaining: 12.8s
171:	learn: -0.3868276	total: 17.1s	remaining: 12.7s
172:	learn: -0.3852582	total: 17.1s	remaining: 12.6s
173:	learn: -0.3844134	total: 17.2s	remaining: 12.5s
174:	learn: -0.3831889	total: 17.3s	remaining: 12.4s
175:	learn: -0.3819702	total: 17.4s	remaining: 12.2s
176:	learn: -0.3808575	total: 17.5s	remaining: 12.2s
177:	learn: -0.3797277	total: 17.7s	remaining: 12.1s
1

In [32]:
from itertools import accumulate
from collections import Counter

vect, reg = model

test_vect_data = enrich(dense_vectors(vect.transform(test_data)), test_data)
predicted = reg.predict(test_vect_data)

volumes_stats = Counter(volumes)
p = list(accumulate(count/len(volumes) for _, count in volumes_stats.items()))
top_volumes = np.sort([volume for i, volume in enumerate(volumes_stats)
                if p[i] <= 0.8])

def closest(volumes, v):
    closest = np.abs(volumes - v).argmin()
    return volumes[closest]

table = {
    'Input': test_data,
    'Predicted volume': np.array([ closest(top_volumes, v) for v in predicted]),
    'Valid volume': test_volumes,
}

result = pd.DataFrame(table, columns = ['Input', 'Predicted volume', 'Valid volume'])
result.head(20)

Unnamed: 0,Input,Predicted volume,Valid volume
0,"Пиво СТЕПАН РАЗИН ПЕТР.св.4,7% ПЭТ 1.4л",1.5,1.4
1,ZLATY BAZANT - светлое 4.1% - 0.5л ж.б.,0.5,0.5
2,"Пиво MILLER MIDNIGHT тем.4,8% ст/б 0.5л",0.5,0.5
3,КЛИНСКОЕ - светлое 4.5% - 0.5л стекло,0.5,0.5
4,"ПивоANDECH.ВАЙСБ.ХЕФЕТР.св.5,5%ст/б 0.5л",0.5,0.5
5,ЖИГУЛЕВСКОЕ - светлое 4.2% - 1.5л пэт,1.5,1.5
6,"Пиво ЯРПИВО КРЕПК.св. 7,2% 0.5л",0.5,0.5
7,БАЛТИКА - светлое 5.4% - 0.5л стекло,0.5,0.5
8,SIBIRSKAYA KORONA(AB INBEV) KLASSICHESKOE 0.5L...,0.5,0.5
9,OKHOTA(HEINEKEN BEL) KREPKOE EKSPORTNOE 1.4L P...,1.5,1.4


In [34]:
len(result[result['Predicted volume']!=result['Valid volume']])/len(result)

0.23048907388137357

In [None]:
from joblib as load, dump

name = f"beer_volume"

vect, reg = model
dump(vect, f"{name}_vect.job")
reg.save