In [1]:
%load_ext autoreload
%autoreload 2

%pip install transformers

from google.colab import drive, auth
drive.mount('/content/drive', force_remount=True)

import random
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
tf.config.optimizer.set_jit(True)

from tqdm.auto import tqdm
from transformers import AdamWeightDecay
from transformers import (DistilBertTokenizerFast, BertTokenizer,
                          BartTokenizer, RobertaTokenizer,
                          T5Tokenizer)

import os
from pathlib import Path

import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/modules')
import pipeline, post_processing, tfmodels
import quad_search_fc, order_check_fc, md_pointwise_fc, exists_fc

print('Libraries Imported')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 14.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 82.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [2]:
# Check GPU Model and Status
# A100 > V100 > T4 > P100
pipeline.check_for_gpu_status()
gpu_strategy = tf.distribute.MirroredStrategy()

!nvidia-smi
!nvidia-smi --query-gpu=gpu_name, driver_version, memory.total --format=csv

Found GPU at: /device:GPU:0.
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Tue Jul 12 18:15:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    38W / 300W |    491MiB / 16160MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                     

In [3]:
# Setup models to analyze

quad_trans_model = 'cbert'
quad_model_name = 'cbert_quad_search'

order_trans_model = 'cbert'
order_model_name = 'cbert_order_check'

params = {'batch_size': 64,
          'code_cells_per_block': 8,
          'quad_seq_len': 512,
          'order_seq_len': 512}

In [4]:
# Initalize Model Name and Tokenizer with Custom Tokens
trans_options = {'bert': ['distilbert-base-uncased', DistilBertTokenizerFast],
                 'bert_multi': ['bert-base-multilingual-cased', BertTokenizer],
                 'bart': ['facebook/bart-base', BartTokenizer],
                 'cbert': ['microsoft/codebert-base', RobertaTokenizer],
                 't5': ['Salesforce/codet5-small', T5Tokenizer]}

custom_tokens = ['[DIVIDER]', '[EMPTY]', '[EMOJI]']

# Quad Search Model
quad_trans = trans_options[quad_trans_model] 
quad_trans_name = quad_trans[0]
quad_tokenizer_class = quad_trans[1]
quad_tokenizer = quad_tokenizer_class.from_pretrained(quad_trans_name)
quad_tokenizer = pipeline.add_custom_tokens_to_tokenizer(quad_tokenizer, custom_tokens + ['<c>'])

# Order Model
order_trans = trans_options[order_trans_model] 
order_trans_name = order_trans[0]
order_tokenizer_class = order_trans[1]
order_tokenizer = order_tokenizer_class.from_pretrained(order_trans_name)
order_tokenizer = pipeline.add_custom_tokens_to_tokenizer(order_tokenizer, custom_tokens + ['<c>', '<m>'])

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

Custom tokens have been added to the tokenizer
Custom tokens have been added to the tokenizer


In [5]:
# Unzip files from drive to disk
data_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/AI4Code.zip'
disk_path = '/content'
pipeline.unzip_files(data_path, disk_path)

Unzipping files:   0%|          | 0/139263 [00:00<?, ?it/s]


 Done unzipping data to disk path.


In [7]:
# Load in saved models

quad_model_path = f'/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/saved_models/{quad_model_name}'
order_model_path = f'/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/saved_models/{order_model_name}'

with gpu_strategy.scope():
  lr_scheduler = {'AdamWeightDecay': AdamWeightDecay, 'WarmupLinearDecay': tfmodels.WarmupLinearDecay}
  load_options = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
  
  quad_model = tf.keras.models.load_model(quad_model_path, custom_objects=lr_scheduler, options=load_options)
  order_model = tf.keras.models.load_model(order_model_path, custom_objects=lr_scheduler, options=load_options)
  print('Loaded in quad and order models')

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

In [8]:
# Load in excluded IDs and drop them from the main data filepath
excluded_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/excluded_ids.yaml'

excluded_keys = ['similars']
excluded_ids = pipeline.load_excluded_ids(excluded_ids_path)
excluded_ids = pipeline.remove_only_excluded_keys(excluded_ids, excluded_keys)

data_dir = Path(disk_path)
data_paths = list((data_dir / 'train').glob('*.json'))
data_paths = pipeline.remove_excluded_id_paths(data_paths, excluded_ids)

print(f"There are {len(data_paths)} files after removing outliers and similar files.")

Safe loaded in excluded IDs.
There are 130271 files after removing outliers and similar files.


In [9]:
# We should just load it all in to see what it all looks like
def read_notebook(path):
    return (pd.read_json(path, dtype={'cell_type': 'category', 'source': 'str'})
              .assign(id=path.stem)
              .rename_axis('cell_id'))

start_path = 90000
end_path = 91000

test_paths = data_paths[start_path:end_path]
notebooks = [read_notebook(path) for path in tqdm(test_paths, desc='Train NBs')]
df = (pd.concat(notebooks)
        .set_index('id', append=True)
        .swaplevel()
        .sort_index(level='id', sort_remaining=False))

df_orders = pd.read_csv(data_dir / 'train_orders.csv',
                        index_col='id',
                        squeeze=True).str.split()

doc_ids = df.index.unique(level=0)

Train NBs:   0%|          | 0/1000 [00:00<?, ?it/s]

## Assemble Predicted Order

### Group By Document Size

In [10]:
# Group By Document Size

# Outlier mean code cell count is 94 and median is 84
# Median code cell size sits at 24 mean and 21 median without outliers

base_ids = []
small_ids = []

for doc_id in tqdm(doc_ids, desc="Grouping Doc Ids By Document Size"):
  code_markdown_locs = df.loc[doc_id, 'cell_type']
  code_cell_count = code_markdown_locs.value_counts()['code']

  if code_cell_count >= 4:  # Can apply both quadrant searcher and order check here for beam search
    base_ids.append(doc_id)

  else: # Contains three code cells or less, goes straight to order_check
    small_ids.append(doc_id)

Grouping Doc Ids By Document Size:   0%|          | 0/1000 [00:00<?, ?it/s]

### Quadrant Searcher

In [11]:
# Creature Quadrant Searcher Features
rng = np.random.default_rng()
base_maps = {}
base_input_ids = []
base_attention_masks = []

for base_id in tqdm(base_ids, desc='Creating Quadrant Searcher Features'):
  md_code_groupings = []
  current_doc = df.loc[base_id]
  code_markdown_locs, text = current_doc['cell_type'], current_doc['source']
  cell_metadata = [code_markdown_locs.to_dict()]
  markdown_idxs = np.where(code_markdown_locs == 'markdown')[0]
  code_cell_count = code_markdown_locs.value_counts()['code']

  text = pipeline.preprocess_text(text, code_markdown_locs, disable_print=True)
  input_ids = pipeline.encode_text_for_input_ids(text, quad_tokenizer, disable_print=True)
  md_code_groupings = quad_search_fc.collect_md_code_groupings_no_labels(input_ids, cell_metadata, disable_print=True)
  features = quad_search_fc.create_quadrant_features(md_code_groupings, quad_tokenizer, disable_print=True)
  
  base_maps[base_id] = len(markdown_idxs)
  base_input_ids.append(features[0])
  base_attention_masks.append(features[1])

base_input_ids = np.concatenate(base_input_ids, axis=0, dtype=np.int32)
base_attention_masks = np.concatenate(base_attention_masks, axis=0, dtype=np.int32)

Creating Quadrant Searcher Features:   0%|          | 0/979 [00:00<?, ?it/s]

In [12]:
# Predictions (Estimated around 2hrs inference time for 20,000 ids)
base_features = tf.data.Dataset.from_tensor_slices((base_input_ids, base_attention_masks))
dummy_labels = tf.data.Dataset.from_tensor_slices(np.zeros((len(base_input_ids), 4), dtype=np.float32))
dataset = (tf.data.Dataset.zip((base_features, dummy_labels))
            .batch(params['batch_size']))

pred_steps = math.ceil(len(base_input_ids) / params['batch_size'])
quad_preds = quad_model.predict(dataset, steps=pred_steps, verbose=1)



In [13]:
# Process Quad Predictions
pred_start = 0
base_quad_preds = {}

for base_id in tqdm(base_ids, desc='Processing Quad Preds'):
  md_quad_preds = []
  md_count = base_maps[base_id]
  doc_quad_preds = quad_preds[pred_start:pred_start+md_count]

  for i in range(md_count):
    md_pred = doc_quad_preds[i]
    best_quad = np.argsort(md_pred)[-1]
    md_quad_preds.append(best_quad)
  
  pred_start += md_count
  base_quad_preds[base_id] = np.asarray(md_quad_preds, dtype=np.int32)

Processing Quad Preds:   0%|          | 0/979 [00:00<?, ?it/s]

### Order Check Blocks

In [39]:
# Create Base Order Check Features
possible_pos = {}
base_input_ids = []
base_attention_masks = []

for base_id in tqdm(base_ids, desc="Creating Base-ID Order Check Features"):
  current_doc = df.loc[base_id]
  code_markdown_locs, text = current_doc['cell_type'], current_doc['source']
  possible_pos[base_id] = []
  doc_quad_preds = base_quad_preds[base_id]

  # Create the Document's Quadrants
  cell_metadata = [code_markdown_locs.to_dict()]
  code_cell_count = code_markdown_locs.value_counts()['code']
  quadrant_splits = pipeline.create_quadrant_splits(code_cell_count)

  idx_map = []
  md_locs = []

  for md_num, md_quad in enumerate(doc_quad_preds):
    md_idx = md_num + code_cell_count
    quad_range = np.arange(quadrant_splits[md_quad], quadrant_splits[md_quad+1])
    quad_idx_map, md_loc = post_processing.create_idx_map_from_quadrant(md_idx, quad_range, code_cell_count)

    idx_map.append(quad_idx_map)
    md_locs.append(md_loc)
    possible_pos[base_id].append(post_processing.convert_range_to_doc_pos(quad_range))
    
  # Preprocess text and create the initial input ids with the tokenizer
  text = pipeline.preprocess_text(text, code_markdown_locs, disable_print=True)
  input_ids = pipeline.encode_text_for_input_ids(text, order_tokenizer, disable_print=True)
  input_ids =  order_check_fc.adjust_input_ids_for_ordering(input_ids, cell_metadata)
  
  # Setup the idx maps to collect the input ids for the features
  idx_map = np.concatenate(idx_map, axis=0 , dtype=np.int32)
  md_locs = np.concatenate(md_locs, axis=0 , dtype=np.int32)
  mapped_input_ids = order_check_fc.index_input_ids_to_map(input_ids, idx_map)
  features = order_check_fc.add_special_tokens_and_masks(mapped_input_ids,
                                                         md_locs,
                                                         order_tokenizer,
                                                         params['order_seq_len'],
                                                         disable_print=True)
  base_input_ids.append(features[0])
  base_attention_masks.append(features[1])

base_input_ids = np.concatenate(base_input_ids, axis=0, dtype=np.int32)
base_attention_masks = np.concatenate(base_attention_masks, axis=0, dtype=np.int32)

Creating Base-ID Order Check Features:   0%|          | 0/979 [00:00<?, ?it/s]

  result = getattr(asarray(obj), method)(*args, **kwds)


In [38]:
# Create Small Order Check Features
small_input_ids = []
small_attention_masks = []

for small_id in tqdm(small_ids, desc="Creating Small-ID Order Check Features"):
  current_doc = df.loc[small_id]
  code_markdown_locs, text = current_doc['cell_type'], current_doc['source']
  possible_pos[small_id] = []

  cell_metadata = [code_markdown_locs.to_dict()]
  code_cell_count = code_markdown_locs.value_counts()['code']

  # Preprocess text and create the initial input ids with the tokenizer
  text = pipeline.preprocess_text(text, code_markdown_locs, disable_print=True)
  input_ids = pipeline.encode_text_for_input_ids(text, order_tokenizer, disable_print=True)
  input_ids =  order_check_fc.adjust_input_ids_for_ordering(input_ids, cell_metadata)
  
  # Setup the idx maps to collect the input ids for the features
  idx_map, md_locs = order_check_fc.create_idx_map(cell_metadata, disable_print=True)
  mapped_input_ids = order_check_fc.index_input_ids_to_map(input_ids, idx_map)
  mapped_input_ids = np.concatenate(mapped_input_ids, dtype='object')
  features = order_check_fc.add_special_tokens_and_masks(mapped_input_ids,
                                                         md_locs,
                                                         order_tokenizer,
                                                         params['order_seq_len'],
                                                         disable_print=True)
  small_input_ids.append(features[0])
  small_attention_masks.append(features[1])

if len(small_ids) != 0:
  small_input_ids = np.concatenate(small_input_ids, axis=0, dtype=np.int32)
  small_attention_masks = np.concatenate(small_attention_masks, axis=0, dtype=np.int32)

Creating Small-ID Order Check Features:   0%|          | 0/21 [00:00<?, ?it/s]

  result = getattr(asarray(obj), method)(*args, **kwds)


In [41]:
# Run model for order predictions

if len(small_ids) != 0:
  all_input_ids = np.concatenate([base_input_ids, small_input_ids], axis=0, dtype=np.int32)
  all_attention_masks = np.concatenate([base_attention_masks, small_attention_masks], axis=0, dtype=np.int32)
  pred_len = len(base_input_ids) + len(small_input_ids)
else:
  all_input_ids = base_input_ids
  all_attention_masks = base_attention_masks
  pred_len = len(base_input_ids)

order_features = tf.data.Dataset.from_tensor_slices((all_input_ids, all_attention_masks))
dummy_labels = tf.data.Dataset.from_tensor_slices(np.zeros((pred_len, 1), dtype=np.float32))
dataset = (tf.data.Dataset.zip((order_features, dummy_labels))
            .batch(params['batch_size']))
pred_steps = math.ceil(len(all_input_ids) / params['batch_size'])

order_preds = order_model.predict(dataset, steps=pred_steps, verbose=1)
base_order_preds = order_preds[:len(base_input_ids)]
small_order_preds = order_preds[len(base_input_ids):]
print('Completed order predictions')

Completed order predictions


In [61]:
# Process Base Order Predictions
all_md_pos = {}
pred_start = 0

for base_id in tqdm(base_ids, desc="Processing Order Check Preds"):
  code_markdown_locs = df.loc[base_id, 'cell_type']

  code_cell_count = code_markdown_locs.value_counts()['code']
  md_cell_count = code_markdown_locs.value_counts()['markdown']
  all_md_pos[base_id] = []

  for md_num in range(md_cell_count):
    possible_md_pos = possible_pos[base_id][md_num]
    pred_end = pred_start + len(possible_md_pos)
    md_preds = base_order_preds[pred_start:pred_end]
    best_pred = np.argmax(md_preds)

    pred_start += len(possible_md_pos)
    all_md_pos[base_id].append(possible_md_pos[best_pred])

Processing Order Check Preds:   0%|          | 0/979 [00:00<?, ?it/s]

In [64]:
# Process Small Order Predictions
pred_start = 0
for small_id in tqdm(small_ids, desc="Processing Order Check Preds"):
  code_markdown_locs = df.loc[small_id, 'cell_type']

  code_cell_count = code_markdown_locs.value_counts()['code']
  md_cell_count = code_markdown_locs.value_counts()['markdown']
  md_pred_len = code_cell_count + 1
  all_md_pos[small_id] = []

  for md_num in range(md_cell_count):
    pred_end = pred_start + md_pred_len
    all_md_pos[small_id].append(np.argmax(small_order_preds[pred_start:pred_end]))
    pred_start += md_pred_len

Processing Order Check Preds:   0%|          | 0/21 [00:00<?, ?it/s]

### Assemble Final Doc Order

In [66]:
# Final Assembly
y_preds = {}

for doc_id in tqdm(doc_ids, desc="Creating Features"):
  current_doc = df.loc[doc_id]
  code_ids = current_doc[current_doc['cell_type'] == 'code'].index  # Code cells are ordered
  markdown_ids = current_doc[current_doc['cell_type'] == 'markdown'].index  # Markdown cells are unordered
  markdown_ranks = np.asarray(all_md_pos[doc_id], dtype=np.int32)

  cell_order = post_processing.assemble_doc_order(code_ids, markdown_ids, markdown_ranks)
  y_preds[doc_id] = cell_order

y_preds = pd.Series(y_preds)
ground_truth = df_orders.loc[doc_ids]
print('\nCollected both predictions and ground truths.')

Creating Features:   0%|          | 0/1000 [00:00<?, ?it/s]


Collected both predictions and ground truths.


In [67]:
# Calculate Kendall Tau Scores
kendall_tau_score = post_processing.calculate_kendall_tau(ground_truth, y_preds)
print(f"The Kendall Tau Scores from documents {start_path}-{end_path} is {kendall_tau_score:.4f} ")

The Kendall Tau Scores from documents 90000-91000 is 0.8365 


### Kendall Tau Analysis and ID Collection

In [None]:
# Collect highest Kendall-Tau's and Lowest Scores, let's get their doc ids
kendall_taus = []

for gt, y_pred in zip(ground_truth, y_preds):
  kendall_taus.append(post_processing.calculate_kendall_tau([gt], [y_pred]))

kendall_taus = np.asarray(kendall_taus)
print(f'The mean score was {kendall_taus.mean():.4f} and the median score was {np.median(kendall_taus):.4f}.')

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))
ax = sns.histplot(kendall_taus, ax=ax)
_ = ax.set(title='Kendall Tau Score Distribution')

In [None]:
# Collect Worst and Highest Ids
# Looking into this, the outputs share the same doc ids, unsure

highest_idx = []
lowest_idx = []

highest_kt = np.sort(kendall_taus)[::-1][:1000]
lowest_kt = np.sort(kendall_taus)[:1000]
print(f"Highest Scores are {highest_kt[:5]}")
print(f"\nLowest Scores are {lowest_kt[:5]}")

for i in range(len(highest_kt)):
  highest_idx.append(np.where(kendall_taus == highest_kt[i])[0])
  lowest_idx.append(np.where(kendall_taus == lowest_kt[i])[0])

highest_idx = np.concatenate(highest_idx)  # np.where operation will return all same indices that match
lowest_idx = np.concatenate(lowest_idx)

highest_doc_id = doc_ids[highest_idx]
lowest_doc_id = doc_ids[lowest_idx]

In [None]:
selected_id = lowest_doc_id[2]
ordered_df = post_processing.get_ordered_df(selected_id, df, df_orders)
display(ordered_df)

In [None]:
predicted_order = y_preds.loc[selected_id]
predicted_df = df.loc[selected_id].loc[predicted_order]
display(df.loc[selected_id].loc[predicted_order])

In [None]:
pred_text = df.loc[selected_id].loc[predicted_order]['source']
pred_metadata = df.loc[selected_id].loc[predicted_order]['cell_type']
pred_text = pipeline.preprocess_text(pred_text, pred_metadata, disable_print=True)
display(pred_text)

### Exists Blocks

In [None]:
exists_input_ids = []
exists_attention_masks = []
code_input_ids = []
code_attention_masks = []
exists_map = {}

for base_id in tqdm(base_ids, desc='Creating Exists Features'):
  md_code_groupings = []
  current_doc = df.loc[base_id]
  code_markdown_locs, text = current_doc['cell_type'], current_doc['source']
  cell_metadata = [code_markdown_locs.to_dict()]

  text = pipeline.preprocess_text(text, code_markdown_locs, disable_print=True)
  input_ids = pipeline.encode_text_for_input_ids(text, exists_tokenizer, disable_print=True)
  idx_map, code_vectors = exists_fc.create_idx_map(cell_metadata, disable_print=True)
  code_vector_input_ids = exists_fc.map_code_vector_input_ids(input_ids, code_vectors, exists_tokenizer)
  input_ids = exists_fc.adjust_input_ids_for_exist_blocks(input_ids, cell_metadata)
  features = exists_fc.create_features(idx_map, input_ids, code_vector_input_ids, exists_tokenizer, disable_print=True)

  exists_map[base_id] = code_markdown_locs.value_counts()['markdown']
  exists_input_ids.append(features[0])
  exists_attention_masks.append(features[1])
  code_input_ids.append(features[2])
  code_attention_masks.append(features[3])

exists_input_ids = np.concatenate(exists_input_ids, axis=0, dtype=np.int32)
exists_attention_masks = np.concatenate(exists_attention_masks, axis=0, dtype=np.int32)
code_input_ids = np.concatenate(code_input_ids, axis=0, dtype=np.int32)
code_attention_masks = np.concatenate(code_attention_masks, axis=0, dtype=np.int32)

Creating Exists Features:   0%|          | 0/393 [00:00<?, ?it/s]

In [None]:
# Predictions (Estimated around 2hrs inference time for 20,000 ids)

# Check against auto setting / sharding
base_features = tf.data.Dataset.from_tensor_slices((exists_input_ids,
                                                    exists_attention_masks,
                                                    code_input_ids,
                                                    code_attention_masks))
dummy_labels = tf.data.Dataset.from_tensor_slices(np.zeros((len(exists_input_ids), 1), dtype=np.float32))
dataset = (tf.data.Dataset.zip((base_features, dummy_labels))
            .batch(params['batch_size']))

pred_steps = math.ceil(len(exists_input_ids) / params['batch_size'])
exists_preds = exists_model.predict(dataset, steps=pred_steps, verbose=1)



KeyboardInterrupt: ignored

In [None]:
# Process Exists Predictions
pred_start = 0
base_quad_preds = {}

for base_id in tqdm(base_ids, desc='Processing Quad Preds'):
  md_quad_preds = []
  md_count = exists_map[base_id]
  pred_len = md_count * 4

  for i in range(md_count):
    md_preds = exists_preds[pred_start+i:pred_start+i+4]
    best_quads = np.argsort(md_preds, axis=0)[::-1][:params['beam_width']]
    md_preds = np.take(md_preds, best_quads, axis=0).reshape(2, 1)
    md_quad_preds.append([best_quads, md_preds])
  
  pred_start += pred_len
  base_quad_preds[base_id] = np.asarray(md_quad_preds, dtype=np.float32)