In [117]:
import re
from typing import (
    List,
    Dict
)

import numpy as np
import pandas as pd

from navec import Navec

from warnings import filterwarnings
filterwarnings("ignore")


# Logical dataset name -> relative path of the TSV file it is read from.
data_list: Dict[str, str] = {
    "sm_train":  "../data/supermarket_train.tsv",
    "sm_val":    "../data/supermarket_val.tsv",
    "sm_vt":     "../data/supermarket_val_target.tsv",
    "cos_train": "../data/cosmetic_train.tsv",
    "cos_val":   "../data/cosmetic_val.tsv",
    "cos_vt":    "../data/cosmetic_val_target.tsv"
}

# Navec embedding table loaded from a local archive; `get_sentence_embedding`
# looks words up in it (`emb.get(word)`, `emb["<unk>"]`).
emb = Navec.load("../navec_hudlit_v1_12B_500K_300d_100q.tar")

[nltk_data] Downloading package stopwords to /Users/lulchak-
[nltk_data]     pavel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [76]:
def read_data(path: str, sep: str) -> pd.DataFrame:
    """Load a delimited text file into a DataFrame.

    Args:
        path: Location of the file (anything ``pd.read_csv`` accepts).
        sep: Column delimiter, e.g. a tab character for TSV files.

    Returns:
        The parsed table.
    """
    # `sep` has to be passed by keyword: pandas 2.0 made every `read_csv`
    # argument after the path keyword-only, so the positional form raises
    # TypeError there (and a FutureWarning on late 1.x releases).
    return pd.read_csv(path, sep=sep)

def get_data_desc(
    mapping: Dict[str, str],
    sep: str = '\t'
) -> Dict[str, pd.DataFrame]:
    """Read every file listed in ``mapping`` and return the frames by name.

    Args:
        mapping: Dataset name -> file path.
        sep: Column delimiter forwarded to ``read_data`` for every file.

    Returns:
        A dict with the same keys as ``mapping`` whose values are the loaded
        DataFrames.
    """
    frames: Dict[str, pd.DataFrame] = {}
    for name, path in mapping.items():
        frames[name] = read_data(path, sep)
    return frames

# Load every file listed in `data_list` once, then keep the frame stored under
# the "sm_train" key (supermarket_train.tsv) under its own name.
data = get_data_desc(data_list, sep='\t')
sm_train = data["sm_train"]

In [272]:
def clean_first_step(string: str) -> str:
    """Delete every substring of ``string`` that matches a noise pattern.

    The patterns are combined into a single case-insensitive alternation and
    all matches are replaced with the empty string; surrounding whitespace is
    left untouched.

    Args:
        string: Raw text to clean.

    Returns:
        ``string`` with all matches removed.
    """
    noise_patterns = (
        # Number with optional decimal part, then optional trailing letters
        # and/or "%" (presumably unit suffixes - confirm against the data).
        r"\d+(\.\d+)?[гслмк]?[рк]?[м]?[\%]?",
        # Any single character followed by "/" at the start of a token.
        r"(?<!\S)./",
        # A space followed by a "/.../" segment.
        r" \/[^\/]*\/",
        # A run of word characters ending with a period.
        r"\w+\.",
        # A Cyrillic letter directly followed by "/".
        r"[а-яА-Я]\/",
        # A word immediately followed by a parenthesised group.
        r"\w+\([^\)]*\)",
        # A literal asterisk.
        r"\*",
    )

    # Order matters: at each position the alternatives are tried left to right.
    combined = re.compile("(" + "|".join(noise_patterns) + ")", re.IGNORECASE)

    return combined.sub('', string)

def clean_second_step(string: str) -> str:
    """Remove every literal ``(+)`` marker from ``string``.

    The previous implementation compiled ``(\\(\\+\\)|)``: its empty second
    alternative matched at every position of the input without removing
    anything, so it only added work. A plain literal replacement yields the
    same result.

    Args:
        string: Text to clean.

    Returns:
        ``string`` without any ``(+)`` substrings.
    """
    return string.replace("(+)", "")

def specify_deduction(string: str) -> str:
    """Expand the known hyphenated abbreviations found in ``string``.

    Args:
        string: Text that may contain the abbreviations listed below.

    Returns:
        ``string`` with each abbreviation replaced by its full word.
    """
    # Applied in this order; matching is case-sensitive.
    expansions = (
        ("Сиг-ты", "Сигареты"),
        ("К-са", "Колбаса"),
    )

    expanded = string
    for short_form, full_form in expansions:
        expanded = expanded.replace(short_form, full_form)
    return expanded

def process_sentence(sent: str) -> str:
    """Run the full text clean-up pipeline on one string.

    Steps, in order: ``clean_first_step`` -> ``clean_second_step`` ->
    ``specify_deduction``.

    Args:
        sent: Raw text.

    Returns:
        The text after all three steps.
    """
    after_first = clean_first_step(sent)
    after_second = clean_second_step(after_first)
    return specify_deduction(after_second)

In [399]:
def get_sentence_embedding(string: str) -> np.ndarray:
    """Embed a string as the mean of its word vectors.

    The string is cleaned with ``process_sentence``, split on whitespace, and
    every lower-cased token is looked up in the module-level ``emb`` table.
    Tokens missing from the table use the ``"<unk>"`` vector.

    Args:
        string: Raw text to embed.

    Returns:
        The element-wise mean of the token vectors. If cleaning leaves no
        tokens at all, the ``"<unk>"`` vector is returned, so the result always
        has the embedding's shape (previously this case produced a NaN scalar,
        which breaks stacking item embeddings into one array downstream).
    """
    unknown_vector = emb["<unk>"]

    vectors: List[np.ndarray] = []
    for word in process_sentence(string).split():
        word_vector = emb.get(word.lower())
        vectors.append(unknown_vector if word_vector is None else word_vector)

    if not vectors:
        # Nothing left after cleaning: fall back to "unknown" rather than
        # taking the mean of an empty array. Copy so callers can't mutate the
        # shared vector.
        return np.array(unknown_vector)

    return np.mean(np.array(vectors), axis=0)

def get_cart_features(data: pd.DataFrame) -> pd.DataFrame:
    """Aggregate item-level rows into one feature row per receipt.

    Args:
        data: Item-level frame with the columns ``receipt_id``,
            ``server_date``, ``local_date``, ``item_id``, ``name``, ``price``
            and ``quantity``. It is not modified.

    Returns:
        One row per ``receipt_id`` with, in this order: ``receipt_id``,
        ``server_date``, ``local_date``, ``receipt_items`` (list of item ids),
        ``cnt_items``, max/min/mean of the line price (``price * quantity``),
        max/min/mean of quantity, the 90th/95th/99th percentiles of both,
        ``item_emb`` (one embedding per item), ``emb`` (mean of the item
        embeddings) and ``cart2item_dot`` (dot product of ``emb`` with each
        item embedding).
    """
    # Derive the line price on a new frame. The original wrote it back into
    # `data["price"]`, so running the cell twice multiplied by quantity twice.
    items = data.assign(price=data["price"] * data["quantity"])

    # Group the frame that was passed in (the original grouped the global
    # `sm_train` and ignored `data`). Named aggregation gives the output
    # columns their names directly instead of renaming them by position, and
    # the string aggregators avoid pandas' deprecation warnings for builtin
    # `max` / `min` / `np.mean`.
    receipt_info = items.groupby(by="receipt_id").agg(
        server_date=("server_date", "max"),
        local_date=("local_date", "max"),
        receipt_items=("item_id", list),
        cnt_items=("item_id", len),
        names=("name", list),
        prices=("price", list),
        max_price=("price", "max"),
        min_price=("price", "min"),
        mean_price=("price", "mean"),
        quantities=("quantity", list),
        max_quantity=("quantity", "max"),
        min_quantity=("quantity", "min"),
        mean_quantity=("quantity", "mean"),
    )

    # Per-receipt percentiles; column order is all price columns first, then
    # all quantity columns.
    for suffix, source in (("price", "prices"), ("quantity", "quantities")):
        for q in (90, 95, 99):
            receipt_info[f"percentile{q}_{suffix}"] = [
                np.percentile(values, q) for values in receipt_info[source]
            ]

    # One embedding per item name, then the receipt ("cart") embedding as
    # their mean.
    receipt_info["item_emb"] = [
        np.array([get_sentence_embedding(name) for name in names])
        for names in receipt_info["names"]
    ]
    receipt_info["emb"] = [
        np.mean(item_embs, axis=0) for item_embs in receipt_info["item_emb"]
    ]

    # Dot product of the cart embedding with each of its item embeddings.
    receipt_info["cart2item_dot"] = [
        [np.dot(cart, item) for item in item_embs]
        for item_embs, cart in zip(receipt_info["item_emb"], receipt_info["emb"])
    ]

    # The raw per-item lists were only needed to derive the features above.
    return receipt_info.reset_index().drop(columns=["names", "prices", "quantities"])

In [368]:
# Preview the first rows of the item-level training frame.
sm_train.head()

Unnamed: 0,device_id,receipt_id,item_id,server_date,local_date,name,price,quantity,my_ckecker
0,352398080550058,13014665572,104821,2022-10-16 14:38:00,2022-10-16 14:38:01,Картофель /новый урожай/Россия/,31.9,28.45,
1,352398080550058,13014665572,107726,2022-10-16 14:38:00,2022-10-16 14:38:01,Молоко Станичное 3.2% 1.0л ультрапастеризованн...,89.0,1.0,
2,352398080550058,13014716101,100671,2022-10-16 14:42:06,2022-10-16 14:42:06,Вафли Коламбина /телевизор/Кузнецов/,367.0,2.0,
3,352398080550058,13014716101,104645,2022-10-16 14:42:06,2022-10-16 14:42:06,К-ты Хо-хо-чу глаз.солен.карамель /Невск.конди...,334.0,1.0,
4,352398080550058,13015069818,107708,2022-10-16 15:11:39,2022-10-16 15:11:39,Молоко Молочный гостинец 3.2% 930гр ультрапаст...,89.0,1.0,


In [400]:
# Build the per-receipt feature table from the item-level training frame.
receipt_info = get_cart_features(sm_train)

In [406]:
# Preview the first rows of the per-receipt feature table.
receipt_info.head()

Unnamed: 0,receipt_id,server_date,local_date,receipt_items,cnt_items,max_price,min_price,mean_price,max_quantity,min_quantity,mean_quantity,percentile90_price,percentile95_price,percentile99_price,percentile90_quantity,percentile95_quantity,percentile99_quantity,item_emb,emb,cart2item_dot
0,9127014611,2021-09-01 08:58:39,2021-09-01 08:58:36,"[106012, 104817]",2,129.0,85.0,107.0,1.0,1.0,1.0,124.6,126.8,128.56,1.0,1.0,1.0,"[[0.37616935, 0.07839778, 0.17193902, -0.06938...","[0.21218355, -0.060995065, 0.11528913, -0.0025...","[6.262515, 26.214245]"
1,9127023800,2021-09-01 08:59:31,2021-09-01 08:59:31,"[113733, 113733]",2,129.0,129.0,129.0,1.0,1.0,1.0,129.0,129.0,129.0,1.0,1.0,1.0,"[[0.3054164, -0.034268033, -0.13488911, 0.0271...","[0.3054164, -0.034268033, -0.13488911, 0.02714...","[12.719209, 12.719209]"
2,9127039146,2021-09-01 09:00:58,2021-09-01 09:00:58,"[101325, 107679, 112112]",3,101.0,2.890625,54.630208,1.0,0.5,0.833333,92.8,96.9,100.18,1.0,1.0,1.0,"[[0.35758993, -0.12257907, 0.14001304, -0.2847...","[0.015179549, 0.009136441, 0.115134336, -0.152...","[6.0069213, 4.9663224, 3.6711915]"
3,9127126561,2021-09-01 09:08:53,2021-09-01 09:08:51,"[115873, 100316, 109607, 107358, 112972]",5,126.9,5.0,49.86,1.0,1.0,1.0,98.1,112.5,124.02,1.0,1.0,1.0,"[[0.107293725, 0.091573894, 0.20074944, -0.186...","[0.09209911, -0.075679764, 0.094947204, -0.005...","[4.6965847, 6.2420053, 2.6254356, 3.8314066, 7..."
4,9127254242,2021-09-01 09:19:17,2021-09-01 09:19:15,"[107679, 100677, 112906]",3,3904.0,0.759697,1301.880112,2.0,0.36,0.92,3123.376128,3513.688064,3825.937613,1.68,1.84,1.968,"[[-0.3967675, 0.094947174, 0.23026557, -0.0875...","[-0.28915545, -0.039075788, -0.043727774, 0.06...","[5.9871016, 7.155784, 12.5507]"


In [None]:
# Build a labelled (receipt, item) dataset from `receipt_info`. For every
# receipt we emit
#   label 1 - the receipt paired with an item it really contains;
#   label 0 - the same receipt paired with an item it does NOT contain.
# Row layout of `res`:
#   [receipt_id, item_id, <receipt features: cnt_items ... percentile99_quantity>,
#    dot(cart embedding, item embedding), label]
#
# Fixes relative to the previous version of this cell:
#   * `np.random.choice(len(items) - 1, ...)` raised on one-item receipts and
#     could never select the last item; indices are now drawn from the full range.
#   * The label-0 row used the *other* receipt's id, features and dot product,
#     i.e. it was a genuine (receipt, item) pair labelled 0. It now pairs the
#     current receipt with the foreign item.
#   * Sampling is seeded, and the per-receipt scans of `sm_train` /
#     `receipt_info` are replaced by lookups built once.
rng = np.random.default_rng(42)  # fixed seed so a re-run reproduces `res`

uitems = set(sm_train.item_id)
item_pool = sorted(uitems)  # stable order, so the seed fully determines the draw

# item_id -> receipt ids of the rows containing it (one entry per row, so a
# receipt that lists an item twice is proportionally more likely to be drawn).
receipts_by_item = sm_train.groupby("item_id")["receipt_id"].agg(list).to_dict()
# receipt_id -> its row in `receipt_info`; positional layout as returned by
# get_cart_features: [0] receipt_id, [3] receipt_items, [4:-3] numeric
# features, [-3] item_emb, [-2] emb, [-1] cart2item_dot.
receipt_rows = {row[0]: row for row in receipt_info.values}

res = []

for row in receipt_rows.values():
    own_items = row[3]
    own_item_set = set(own_items)
    if len(own_item_set) == len(uitems):
        continue  # receipt contains every known item: no negative exists

    # Positive example: a uniformly chosen position inside this receipt.
    idx = rng.integers(len(own_items))

    # Negative example: rejection-sample an item this receipt does not contain.
    wrong_item_id = item_pool[rng.integers(len(item_pool))]
    while wrong_item_id in own_item_set:
        wrong_item_id = item_pool[rng.integers(len(item_pool))]

    # Borrow the foreign item's embedding from a receipt that contains it.
    donor_receipts = receipts_by_item[wrong_item_id]
    wr = receipt_rows[donor_receipts[rng.integers(len(donor_receipts))]]
    widx = wr[3].index(wrong_item_id)
    wrong_dot = np.dot(row[-2], wr[-3][widx])

    receipt_features = list(row[4:-3])
    res.append([row[0], own_items[idx]] + receipt_features + [row[-1][idx], 1])
    res.append([row[0], wrong_item_id] + receipt_features + [wrong_dot, 0])

In [None]:
# Number of labelled rows collected in `res`.
len(res)