In [1]:
import gc
import itertools
from pathlib import Path

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.linear_model
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.semi_supervised import LabelPropagation
import sklearn.multioutput
import tensorflow_text as text  # tf registers ops on import
import tensorflow as tf
from tensorflow.keras.regularizers import L2
import tensorflow_hub as hub
import transformers

from retrain_bert import settings
from retrain_bert.preprocessor import load_labels, get_labels_conf
from retrain_bert.utils import (ExactCategoryScore, make_targets)


2024-01-15 14:33:45.622651: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-15 14:33:45.622682: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-15 14:33:45.624842: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-15 14:33:45.759664: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
labels = load_labels(settings.DATA_DIR / "labels.csv")

human_verified = pd.read_csv(settings.DATA_DIR / "train/human_train.csv")
human_labels = load_labels(settings.DATA_DIR / "human_labels.csv")

In [3]:
labels_conf = get_labels_conf(labels)
human_labels_conf = get_labels_conf(human_labels)

In [4]:
client_facing = pd.read_csv(settings.DATA_DIR / "raw/classified_ocr_2023_11_01.csv", dtype={"Category_MasterProduct": str})
client_facing = client_facing.dropna(subset=["OcrValue"])

human_raw = pd.read_csv(settings.DATA_DIR / "raw/human_classified_ocr.csv", dtype={"Category_MasterProduct": str})

In [5]:
master_categories = pd.read_csv(settings.DATA_DIR / "raw/masterProduct__Categories.csv", dtype=str)

In [6]:
comparing_raw = ExactCategoryScore(labels_conf=labels_conf)

comparing_df = pd.merge(
    client_facing[["Category_MasterProduct", "OcrValue"]], human_raw[["Category_MasterProduct", "OcrValue"]], on="OcrValue", how="inner", suffixes=("_client", "_human")
)

scores = [comparing_raw.exact_category_score_from_codes(comparing_df.Category_MasterProduct_client, comparing_df.Category_MasterProduct_human, level + 1) for level in range(settings.DEEPEST_LEVEL)]

In [7]:
comparing_raw.exact_category_score_from_codes(comparing_df.Category_MasterProduct_client, comparing_df.Category_MasterProduct_human, 5)

0.8865266770160961

In [8]:
for level in range(settings.DEEPEST_LEVEL):
    # mask = master_categories[f"Level{level + 1}Code"].isna() | master_categories[f"Level{level + 1}Code"].str.len() == 2 * (level + 1) 
    # print(mask.mean())
    correct_length = master_categories[f"Level{level + 1}Code"].str.len() == 2 * (level + 1)
    isna = master_categories[f"Level{level + 1}Code"].isna()
    mask = correct_length | isna
    master_categories = master_categories[mask]

In [9]:
master_categories.head()

Unnamed: 0,Code,Level1Code,Level1Name,Level2Code,Level2Name,Level3Code,Level3Name,Level4Code,Level4Name,Level5Code,Level5Name
0,0,0,Sin Clasificación,,Sin Clasificación,,Sin Clasificación,,Sin Clasificación,,
1,1,1,Alimentación y bebidas,,Sin Clasificación,,Sin Clasificación,,Sin Clasificación,,
2,101,1,Alimentación y bebidas,101.0,Alimentación seca,,Sin Clasificación,,Sin Clasificación,,
3,10101,1,Alimentación y bebidas,101.0,Alimentación seca,10101.0,Aceites,,Sin Clasificación,,
4,1010101,1,Alimentación y bebidas,101.0,Alimentación seca,10101.0,Aceites,1010101.0,Oliva,,


In [10]:
unclassified = master_categories.query("Code == '00'").Level1Name.iloc[0]
unclassified

'Sin Clasificación'

In [11]:
def compute_confusion_matrix(code_true: pd.Series, code_pred: pd.Series, level: int, master_categories: pd.DataFrame = master_categories):
    code_true = code_true.str.slice(0, 2*level)
    code_pred = code_pred.str.slice(0, 2*level)
    mask = (code_true.str.len() == 2*level).values
    code_true = code_true[mask]
    code_pred = code_pred[mask]

    code_pred[code_pred.str.len() < 2*level] = "0" * (2*level)

    code_mapping = master_categories[[f"Level{level}Name", f"Level{level}Code"]].drop_duplicates(f"Level{level}Code")
    code_mapping.dropna(inplace=True)
    code_mapping.set_index(f"Level{level}Code", inplace=True)
    code_mapping = code_mapping.squeeze()
    code_mapping["0" * (2*level)] = unclassified
    code_mapping.sort_index(inplace=True)

    code_pred[~code_pred.isin(code_mapping.index)] = "0" * (2*level)
    code_mapping = code_mapping[code_mapping.index.isin(code_true.unique()) | code_mapping.index.isin(code_pred.unique())]

    matrix = pd.DataFrame(confusion_matrix(code_true, code_pred), index=code_mapping.copy(), columns=code_mapping.copy())
    matrix.index.name = "Predicted category"
    matrix.columns.name = "True category"
    return matrix

def identify_largest_mixups(confusion_matrix, proportional=True):
    proportional_matrix = confusion_matrix / confusion_matrix.sum(axis=0)

    melted = confusion_matrix.reset_index().melt(id_vars="Predicted category", var_name="True category", value_name="Count")
    melted_prop = proportional_matrix.reset_index().melt(id_vars="Predicted category", var_name="True category", value_name="Proportion")

    merged = pd.merge(melted, melted_prop, on=["Predicted category", "True category"])

    merged = merged[merged["Predicted category"] != merged["True category"]]
    if proportional:
        merged = merged.sort_values("Proportion", ascending=False)
    else:
        merged = merged.sort_values("Count", ascending=False)
    return merged

compute_confusion_matrix(comparing_df.Category_MasterProduct_client, comparing_df.Category_MasterProduct_human, 3)

True category,Sin Clasificación,Aceites,Cafés y sucedáneos,Infusiones,Chocolates,Cacao,Azúcar y edulcorantes,Golosinas,Salsas,"Sal, vinagre y especias",...,Automovil,Electrónica de consumo y electrodomésticos,Sección ocio,Casa-Hogar,Aire libre,Sector bebé,Juguetes,Papeleria,Mascotas,Bolsas de compra
Predicted category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sin Clasificación,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Aceites,0,339,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cafés y sucedáneos,0,0,807,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Infusiones,0,0,0,2154,0,0,1,7,0,12,...,0,0,0,2,0,0,0,0,0,0
Chocolates,1,0,5,0,3568,2,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sector bebé,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,10,0,0,0,0
Juguetes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Papeleria,0,0,0,0,0,0,0,0,0,0,...,0,0,4,0,0,0,0,7,0,0
Mascotas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,45,0


In [12]:
human_matrix = compute_confusion_matrix(comparing_df.Category_MasterProduct_client, comparing_df.Category_MasterProduct_human, 3)
human_mixups = identify_largest_mixups(human_matrix, proportional=False)
human_mixups.head(10)

Unnamed: 0,Predicted category,True category,Count,Proportion
17284,Pizzas,Platos elaborados,150,0.218659
2373,Bollería,Pastelería y bollería industrial,139,0.152412
6999,Bebidas refrescantes,Zumos,133,0.051852
3326,Frutos secos y fruta seca,"Patatas Fritas , cortezas y otros fritos",132,0.018734
19005,Bebidas refrescantes,Frutas,131,0.076878
3499,"Patatas Fritas , cortezas y otros fritos",Frutos secos y fruta seca,105,0.012282
16474,Conservas vegetales,Verduras y hortalizas,103,0.099291
16475,Conservas vegetales,Verduras y hortalizas,103,0.03147
1918,Chocolates,Galletas,101,0.012161
19327,Pastelería y bollería industrial,Bollería,85,0.205314


In [13]:
custom_objects = {"KerasLayer": hub.KerasLayer}
with tf.keras.utils.custom_object_scope(custom_objects):
    new_model = tf.keras.models.load_model(settings.PROJECT_DIR / "models/bert_finetuned.keras")

2024-01-15 14:33:55.682622: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-15 14:33:55.846123: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-15 14:33:55.846335: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [14]:
predictions = new_model.predict(comparing_df.OcrValue, batch_size=32)

2024-01-15 14:38:01.514038: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700




In [15]:
class Labeler:
    def __init__(self, labels):
        self.labels = labels
        self.labels_conf = get_labels_conf(self.labels)
        self.labels_to_category_mapping = dict(
            zip((self.labels["label"]), self.labels.index.get_level_values("cat"))
        )

    def convert_to_labels(self, predictions):
        level_predictions = []
        for level, conf in enumerate(self.labels_conf):
            preds = predictions[level]
            level_predictions.append(np.argmax(preds, axis=1) + conf["start"])
        level_predictions = np.stack(level_predictions, axis=1)
        return level_predictions

    def convert_to_probabilities(self, predictions):
        level_predictions = []
        for level, conf in enumerate(self.labels_conf):
            level_predictions.append(
                np.max(predictions[:, conf["start"] : conf["end"]], axis=1)
            )
        level_predictions = np.stack(level_predictions, axis=1)
        return level_predictions

    def convert_to_level_categories(self, predictions):
        labels = self.convert_to_labels(predictions)
        categories = np.vectorize(self.labels_to_category_mapping.get)(labels)
        return categories
    
    def convert_to_codes(self, predictions):
        level_categories = self.convert_to_level_categories(predictions)
        codes = ["".join(cat) for cat in level_categories]
        return codes


In [16]:
predicted_codes = Labeler(labels).convert_to_codes(predictions)


In [17]:
comparing_df["BERT_code"] = predicted_codes

In [18]:
bert_matrix = compute_confusion_matrix(comparing_df.Category_MasterProduct_client, comparing_df.BERT_code, 3)
bert_mixups = identify_largest_mixups(bert_matrix, proportional=True)
bert_mixups.head(10)

Unnamed: 0,Predicted category,True category,Count,Proportion
5968,"Sopas, caldos y purés",Escabeches,1,1.0
11564,Panadería industrial,Base pescado a granel,2,1.0
11134,Mantequilla,Base pasta a granel,1,1.0
16481,Conservas de Carne y patés,Pates y sobrasadas,11,0.785714
8754,Chocolates,Equino,7,0.7
11315,Pastelería y Repostería,Base arroz a granel,2,0.666667
28451,Verduras y hortalizas,Deporte,3,0.6
28449,Verduras y hortalizas,Deporte,0,0.6
22705,Otras especialidades regionales,Surtidos,3,0.6
8649,Chorizo,Ovino y caprino,10,0.526316


In [19]:
bert_mixups.groupby("True category")["Count"].sum().sort_values(ascending=False).head(10)

True category
Sin Clasificación                           755
Cafés y sucedáneos                          692
Verduras y hortalizas                       604
Galletas                                    470
Frutos secos y fruta seca                   420
Chocolates                                  408
Patatas Fritas , cortezas y otros fritos    344
Pastelería y bollería industrial            342
Aceites                                     306
Porcino                                     287
Name: Count, dtype: int64

In [20]:
predictions = new_model.predict(human_raw.OcrValue, batch_size=128)
human_raw["BERT_code"] = Labeler(labels).convert_to_codes(predictions)



In [21]:
without_train_data = human_raw[~human_raw.OcrValue.isin(comparing_df.OcrValue)]
comparing_raw.exact_category_score_from_codes(true_codes=without_train_data.Category_MasterProduct, pred_codes=without_train_data.BERT_code, level=3)

0.7089985060412802

In [22]:
bert_matrix = compute_confusion_matrix(code_true=without_train_data.Category_MasterProduct, code_pred=without_train_data.BERT_code, level=2)
bert_mixups = identify_largest_mixups(bert_matrix, proportional=True)
bert_mixups.query("Count > 20").head(20)

Unnamed: 0,Predicted category,True category,Count,Proportion
519,Casa-Hogar,Sector Bebé,134,0.369146
576,Alimentación seca,Mascotas,193,0.369025
1,Alimentación seca,Sin Clasificación,424,0.232839
158,Congelados,Platos cocinados y precocinados,1227,0.22301
619,Casa-Hogar,Bolsas de compra,269,0.214002
308,Congelados,Pescados y mariscos,276,0.190345
276,Alimentación seca,Panadería y pastelería,976,0.190328
444,Casa-Hogar,"Ferretería, bricolaje y automóvil",134,0.14823
570,Sector Bebé,Papeleria,48,0.134078
19,Casa-Hogar,Sin Clasificación,212,0.11642
