In [1]:
from concurrent.futures import ThreadPoolExecutor
import gc
import itertools
import json
from pathlib import Path

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.linear_model
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.semi_supervised import LabelPropagation
import sklearn.multioutput
import tensorflow_text as text  # tf registers ops on import
import tensorflow as tf
from tensorflow.keras.regularizers import L2
import tensorflow_hub as hub
import transformers

from retrain_bert import settings
from retrain_bert.preprocessor import load_labels, split_into_categories

2023-11-15 03:06:41.677182: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-15 03:06:41.677236: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-15 03:06:41.678648: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-15 03:06:41.800825: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    print("GPU is available.")
else:
    print("GPU is not available.")

GPU is available.


2023-11-14 12:20:26.792294: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-14 12:20:26.815044: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-14 12:20:26.815288: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [2]:
if Path('/code').exists():
    settings.PROJECT_DIR = Path('/code')
    settings.DATA_DIR = settings.PROJECT_DIR / "data"
    settings.CATEGORIES_PATH = settings.DATA_DIR / "/categories.csv"
settings.INFERENCE_MODEL_PATH = settings.PROJECT_DIR / "models/bert_finetuned.keras"
generated_train_data_path = settings.DATA_DIR / "train/generated_train.csv"
prediction_file_path = settings.DATA_DIR / "predictions.jsonl"

In [3]:
labels = load_labels(settings.DATA_DIR / "labels.csv")
uncategorized = pd.read_csv(settings.DATA_DIR / "raw/unverified classifications.csv", usecols=["OCR", "CategoryCode"]).rename(columns={"OCR": "OcrValue"}).dropna(subset=["OcrValue"])
if generated_train_data_path.exists():
    already_generated = pd.read_csv(generated_train_data_path)
    uncategorized = uncategorized[~uncategorized["OcrValue"].isin(already_generated["OcrValue"])]

In [8]:
custom_objects = {"KerasLayer": hub.KerasLayer}
with tf.keras.utils.custom_object_scope(custom_objects):
    model = tf.keras.models.load_model(settings.INFERENCE_MODEL_PATH)

2023-11-14 12:22:09.167839: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-14 12:22:09.168269: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-14 12:22:09.168578: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [None]:
# print(f"Before: {len(uncategorized)}")
# uncategorized = uncategorized[uncategorized.CategoryCode.isna()]
# print(f"After: {len(uncategorized)}")

In [9]:
def save_predictions(ocr, level1, level2, level3, level4, level5, file_path):
    with open(file_path, "a") as f:
        f.write(json.dumps({
            "ocr": ocr,
            "level_1": level1.tolist(),
            "level_2": level2.tolist(),
            "level_3": level3.tolist(),
            "level_4": level4.tolist(),
            "level_5": level5.tolist(),
        }) + "\n")

batch_size = 128

prediction_file_path.unlink(missing_ok=True)

# Set up the executor with the desired number of worker threads
with ThreadPoolExecutor() as executor:
    # Submit tasks to the executor for asynchronous write operations
    for batch_start in range(0, len(uncategorized), batch_size):
        ocrs = uncategorized["OcrValue"].iloc[batch_start: batch_start + batch_size]
        predictions = model.predict(ocrs, batch_size=batch_size)
        for ocr, level1, level2, level3, level4, level5 in zip(ocrs, *predictions):
            executor.submit(save_predictions, ocr, level1, level2, level3, level4, level5, prediction_file_path)


2023-11-14 12:25:44.387996: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700




KeyboardInterrupt: 

In [32]:
try:
    del uncategorized
    del already_generated
except NameError:
    pass
probas = {"level_1": [], "level_2": [], "level_3": [], "level_4": [], "level_5": []}
cats = {k: [] for k in probas.keys()}
ocrs = []
with open(prediction_file_path) as f:
    for line in f:
        data = json.loads(line)
        ocrs.append(data["ocr"])
        for level in probas:
            probas[level].append(np.max(data[level]))
            cats[level].append(np.argmax(data[level]))

probas = pd.DataFrame(probas)
cats = pd.DataFrame(cats)
ocrs = pd.DataFrame({"OcrValue": ocrs})

In [7]:
probas.head()

Unnamed: 0,level_1,level_2,level_3,level_4,level_5
0,0.884881,0.399181,0.384116,0.719063,0.892706
1,0.998188,0.996439,0.944249,0.520416,0.822916
2,1.0,0.999996,0.999992,0.999998,0.999987
3,1.0,1.0,0.999998,1.0,1.0
4,0.859141,0.965627,0.995257,0.891527,0.529264


In [9]:
((probas.level_1 > .5) & (probas.level_2 > .5) & (probas.level_3 > .5)).mean()

0.45309068978021316

In [11]:
(probas.level_1 > .5).mean()

0.854965979371059

In [12]:
cats

Unnamed: 0,level_1,level_2,level_3,level_4,level_5
0,3,3,3,0,0
1,1,5,0,2,0
2,3,8,16,0,0
3,3,8,16,0,0
4,1,6,0,0,0
...,...,...,...,...,...
6353787,1,4,0,1,19
6353788,1,8,15,0,19
6353789,1,2,2,3,19
6353790,1,2,4,0,19


In [13]:
labels_conf = []
level_start = 0
level_end = 0
for level in range(settings.DEEPEST_LEVEL):
    level_end = level_start + len(labels.loc[level + 1])
    labels_conf.append({
        "level": level + 1,
        "start": level_start,
        "end": level_end,
        "num_classes": len(labels.loc[level + 1])
    })
    level_start = level_end

In [18]:
for col, level_conf in zip(cats.columns, labels_conf):
    cats.loc[cats[col] == level_conf['num_classes'] - 1, col] = pd.NA
cats.isna().mean()

level_1    0.000088
level_2    0.002538
level_3    0.002643
level_4    0.018459
level_5    0.650560
dtype: float64

In [26]:
useful_threshold = 0.9

useful = cats.copy()
useful[~(probas > useful_threshold)] = pd.NA

useful.dropna(how="all", axis=0, inplace=True)

useful.isna().mean()

level_1    0.107274
level_2    0.311281
level_3    0.541555
level_4    0.641589
level_5    0.841767
dtype: float64

In [27]:
useful.level_1.value_counts()

level_1
0.0    1305046
1.0    1279068
2.0     419242
3.0     346342
Name: count, dtype: int64

In [28]:
useful

Unnamed: 0,level_1,level_2,level_3,level_4,level_5
1,1.0,5.0,0.0,,
2,3.0,8.0,16.0,0.0,0.0
3,3.0,8.0,16.0,0.0,0.0
4,,6.0,0.0,,
5,1.0,,,,
...,...,...,...,...,...
6353785,0.0,,,,
6353787,1.0,4.0,0.0,,
6353788,1.0,8.0,,,
6353789,1.0,,,,


In [29]:
na_mask = np.zeros(len(useful), dtype=bool)
for col in useful.columns:
    useful.loc[na_mask, col] = np.nan
    na_mask = useful[col].isna().values

In [30]:
useful = useful.dropna(subset=["level_3"], axis=0)
for col, level_conf in zip(useful.columns, labels_conf):
    useful[col].fillna(level_conf['num_classes'] - 1, inplace=True)

useful = useful.astype(int)
useful

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  useful[col].fillna(level_conf['num_classes'] - 1, inplace=True)


Unnamed: 0,level_1,level_2,level_3,level_4,level_5
1,1,5,0,33,19
2,3,8,16,0,0
3,3,8,16,0,0
7,1,5,0,33,19
8,0,3,0,0,19
...,...,...,...,...,...
6353774,1,5,0,0,19
6353775,3,7,15,0,19
6353777,1,5,1,33,19
6353778,1,4,0,33,19


In [33]:
gen_train = useful.merge(ocrs, left_index=True, right_index=True)
if generated_train_data_path.exists():
    old_gen_train = pd.read_csv(generated_train_data_path)
    gen_train = pd.concat([old_gen_train, gen_train], ignore_index=True).drop_duplicates(subset=["OcrValue"])
else:
    gen_train = gen_train.drop_duplicates(subset=["OcrValue"])
gen_train.to_csv(generated_train_data_path, index=False)