In [0]:
%load_ext autoreload
%autoreload 2
# Mode 2 reloads imported modules before each cell executes, so edits to local
# .py files are picked up without restarting the kernel.
# Learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
from topic_predictor import *

In [0]:
import os
#os.environ["TF_USE_LEGACY_KERAS"] = "1"

import pickle 
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, pipeline, AutoTokenizer
from sentence_transformers import SentenceTransformer
import model_config as cfg

# Load all artifacts into the shared model_config module.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only point
# cfg.model_path at artifacts produced by a trusted pipeline.

# Maps each model_config attribute to the pickle file (under cfg.model_path) that
# populates it. Insertion order is the load order.
PICKLED_ARTIFACTS = {
    "target_vocab": "target_vocab.pkl",
    "inv_target_vocab": "inv_target_vocab.pkl",
    "citation_feature_vocab": "citation_feature_vocab.pkl",
    "gold_to_label_mapping": "gold_to_id_mapping_dict.pkl",
    "gold_dict": "gold_citations_dict.pkl",
    "non_gold_dict": "non_gold_citations_dict.pkl",
}

for attr_name, file_name in PICKLED_ARTIFACTS.items():
    with open(os.path.join(cfg.model_path, file_name), "rb") as f:
        setattr(cfg, attr_name, pickle.load(f))

cfg.emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# `truncate=True` was dropped: it is not a tokenizer option, so it had no effect.
# Truncation is requested per call via `truncation=True` (as done where the
# tokenizer is invoked with max_length=512).
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.language_model_name)

print("✅ All model artifacts loaded into model_config")

In [0]:
from topic_predictor import *

# Loading the models
# Build the prediction model; its two size arguments come from the loaded target
# vocabulary and the citation-feature vocabulary (+2).
# NOTE(review): what the +2 accounts for is not visible here — presumably reserved
# padding/unknown ids; confirm against create_model.
pred_model = create_model(len(cfg.target_vocab), 
                          len(cfg.citation_feature_vocab)+2)
# NOTE(review): the load_weights call below is commented out, so pred_model keeps
# whatever weights create_model gave it. If create_model does not load trained
# weights itself, predictions from pred_model will not be meaningful — confirm.
#status = pred_model.load_weights("/Volumes/openalex/works/models/topic_classifier_v1/model_checkpoint/citation_part_only.keras",
#                        skip_mismatch=True)

print("✅ Model created.")
# Wrap the model call in a tf.function with XLA compilation requested and publish
# it on model_config.
cfg.xla_predict = tf.function(pred_model, jit_compile=True)

#pt_model = AutoModelForSequenceClassification.from_pretrained(language_model_name, output_hidden_states=True)
#pt_model.eval()

# Language model: loaded with hidden states exposed and frozen (trainable = False),
# then wrapped the same way as the prediction model.
language_model = TFAutoModelForSequenceClassification.from_pretrained(cfg.language_model_name, output_hidden_states=True)
language_model.trainable = False
cfg.xla_predict_lang_model = tf.function(language_model, jit_compile=True)

# # Sending a blank prediction through the model in order to get it "warmed up"
# _ = xla_predict(create_input_feature([[101, 102] + [0]*510, 
#                                       [1, 1] + [0]*510,
#                                       [1]+[0]*15, 
#                                       [1]+[0]*127,
#                                       np.zeros(384, dtype=np.float32)]))
print("✅ Model initialized")


# model.save("/dbfs/models/citation_part_only_full.keras")
# print("✅ Full model saved.")

In [0]:
# Show the layer-by-layer summary of the prediction model built in the previous cell.
pred_model.summary()

### Weights seem to be defined OK even though there are warnings?

In [0]:
# For every layer of the prediction model, report how many weight arrays it
# holds and the shape of each one.
for layer in pred_model.layers:
    weights = layer.get_weights()
    shapes = [w.shape for w in weights]
    print(f"{layer.name}: {len(weights)} weights, shapes: {shapes}")

### Test Harness

In [0]:
import requests
# Fetch one work record from the OpenAlex API to use as a test input.
open_req = "https://api.openalex.org/works/W4205779344"
# A timeout keeps the cell from hanging forever; raise_for_status surfaces HTTP
# errors here instead of as a confusing KeyError on the parsed body below.
http_resp = requests.get(open_req, timeout=30)
http_resp.raise_for_status()
resp = http_resp.json()
print(resp['id'])

# Tolerate a missing/null `primary_location`, `source` or `display_name`
# by falling back to an empty journal name.
source = (resp.get('primary_location') or {}).get('source') or {}
journal_display_name = source.get('display_name') or ""


# Record in the shape the predictor helpers expect; 'inverted': True marks the
# abstract as still being in inverted-index form.
input_json = {'title': resp['title'], 
               'abstract_inverted_index': resp['abstract_inverted_index'], 
               'journal_display_name': journal_display_name, 
               'referenced_works': resp['referenced_works'],
               'inverted': True}

display(input_json)

In [0]:
# No earlier visible cell imports `json`; it was only reachable if
# `from topic_predictor import *` happened to re-export it. Import it explicitly
# so this cell does not depend on that side effect.
import json

# Serialise the test record so it can be copied into other tools.
json_str = json.dumps(input_json)
print(json_str)

In [0]:
# Wrap the single test record in a one-row DataFrame, exposing the row index as
# a 'UID' column, then run it through transform_json and print the outcome.
input_df = (
    pd.DataFrame.from_dict([input_json])
    .reset_index()
    .rename(columns={'index': 'UID'})
)
result = transform_json(input_df)
print(result)

### PLAIN Test Bed - Needs to work first

In [0]:
def run_model_inference(model, record, ignore_citations_journal=False):
    """Run one record through the prediction path, printing every intermediate.

    Args:
        model: Callable prediction model. It is invoked with the tuple
            ``(citation_0, citation_1, journal_emb, lang_output)``.
        record: Mapping with ``title``, ``abstract_inverted_index`` and
            ``inverted`` keys. ``referenced_works``, ``journal_display_name``
            and ``UID`` are optional.
        ignore_citations_journal: When True, zero vectors replace the citation
            features (lengths 16 and 128) and the journal embedding (length 384).

    Returns:
        dict with ``UID``, the top-5 ``preds`` indices, their ``scores`` and the
        ``labels`` looked up in ``cfg.inv_target_vocab`` (``UNKNOWN_<i>`` when
        an index is missing).
    """
    # Step 1: Clean and merge title + abstract
    title = clean_title(record['title'])
    abstract = clean_abstract(record['abstract_inverted_index'], record['inverted'])
    merged_text = merge_title_and_abstract(title, abstract)
    print("🧹 Cleaned + Merged Text:\n", merged_text[:300], "...\n")

    # Step 2: Tokenize text
    tokenized = tokenize([merged_text])  # expects batch
    ids = tokenized[0][0]
    attention_mask = tokenized[1][0]
    print("🧪 Token IDs:", ids[:20])
    print("📏 Attention Mask:", attention_mask[:20])
    print("📚 Decoded Text from IDs:", cfg.tokenizer.decode(ids))

    # Step 3: Convert citation URLs to integers
    # `or []` also covers an explicit None value, not just a missing key.
    raw_refs = record.get("referenced_works") or []
    citation_ids = [int(x.split("https://openalex.org/W")[1]) for x in raw_refs]
    print("🔗 Raw Citation IDs:", citation_ids)

    if ignore_citations_journal:
        citation_0 = np.zeros(16, dtype=np.float32)
        citation_1 = np.zeros(128, dtype=np.float32)
    else:
        citation_0_ids, citation_1_ids = get_gold_citations_from_all_citations(citation_ids, cfg.gold_dict, cfg.non_gold_dict)
        print("🏅 Gold Citation IDs:", citation_0_ids)
        print("🥈 Non-Gold Citation IDs:", citation_1_ids)
        citation_0 = get_final_citations_feature(citation_0_ids, 16)
        citation_1 = get_final_citations_feature(citation_1_ids, 128)

    print("🧾 Citation_0 Features:", citation_0[:10])
    print("🧾 Citation_1 Features:", citation_1[:10])

    # Step 4: Get journal embedding
    if ignore_citations_journal:
        journal_emb = np.zeros(384, dtype=np.float32)
    else:
        journal_emb = get_journal_emb(record.get("journal_display_name", ""))

    print("📄 Journal Embedding (first 10):", journal_emb[:10])

    # Step 5: Prepare tensors
    tensor_inputs = create_input_feature([ids, attention_mask, citation_0, citation_1, journal_emb])
    all_rows = [tf.convert_to_tensor([tensor_inputs[i][0]]) for i in range(5)]

    # Step 6: Get language model output and make prediction
    # Tokenize THIS record's merged text. The previous code passed `text`, which is
    # not defined in this function, so it either raised NameError or silently used
    # an unrelated notebook-global string.
    lang_model_inputs = cfg.tokenizer(merged_text, max_length=512, truncation=True, padding='max_length', return_tensors='tf')
    lang_output = cfg.xla_predict_lang_model(**lang_model_inputs).hidden_states[-1]
    # lang_output = get_lang_model_output(all_rows[0], all_rows[1])
    print("🧠 Lang Model Output Shape:", lang_output.shape)

    preds = model((all_rows[2], all_rows[3], all_rows[4], lang_output))
    print("📈 Raw Prediction Tensor Shape:", preds.shape)

    # Step 7: Get top-k
    topk = tf.math.top_k(preds, k=5)
    indices = topk.indices.numpy().tolist()[0]
    scores = topk.values.numpy().tolist()[0]
    labels = [cfg.inv_target_vocab.get(i, f"UNKNOWN_{i}") for i in indices]

    print("🏷️ Predicted Label IDs:", indices)
    print("🔥 Scores:", scores)
    print("🏷️ Labels:", labels)

    # Step 8: Return enriched record
    return {
        "UID": record.get("UID", 0),
        "preds": indices,
        "scores": scores,
        "labels": labels,
    }


# Run the verbose single-record harness on the fetched test record, with citation
# and journal features enabled, and show the returned dict.
results = run_model_inference(pred_model, input_json, ignore_citations_journal=False)
display(results)


In [0]:
# Look up the label stored under id 2149 in the model config.
# NOTE(review): in the visible notebook `model` is first assigned in the raw-BERT
# cell at the very end, so on a fresh top-to-bottom run this presumably raises
# NameError unless `from topic_predictor import *` supplies a `model` — confirm
# and move the model load above this cell.
model.config.id2label[2149]

### Use `process_data_as_df` - the model output does not look correct - look more

In [0]:
# Rebuild the one-row input frame (row index exposed as 'UID') and run the
# DataFrame-based prediction entry point on it.
input_df = (
    pd.DataFrame.from_dict([input_json])
    .reset_index()
    .rename(columns={'index': 'UID'})
)
final_preds = process_data_as_df(input_df)

def map_preds_to_labels(preds_list, vocab_dict):
    """Translate each row of predicted indices into labels via ``vocab_dict``.

    Indices missing from ``vocab_dict`` become the placeholder ``UNKNOWN_<index>``.
    Returns a list with one list of labels per input row.
    """
    labelled_rows = []
    for row in preds_list:
        labelled_rows.append([vocab_dict.get(idx, f"UNKNOWN_{idx}") for idx in row])
    return labelled_rows

# Label the predicted indices with the pickled inverse target vocabulary ...
final_preds["pred_labels"] = map_preds_to_labels(final_preds["preds"], cfg.inv_target_vocab)
# ... and with the Hugging Face config's id2label, so the two index spaces can be
# compared side by side.
# NOTE(review): in the visible notebook `model` is only assigned in the final
# raw-BERT cell; on a fresh run this line presumably fails unless the star import
# provides `model` — confirm.
final_preds["pred_labels_auto"] = map_preds_to_labels(final_preds["preds"], model.config.id2label)


display(final_preds)

In [0]:
# Post-process the first row of the prediction frame and print the result.
# NOTE(review): postprocess_predictions is not defined in this notebook —
# presumably it comes from `from topic_predictor import *`; its output format is
# not visible here.
output = postprocess_predictions(final_preds.iloc[0])
print(output)

In [0]:
# Build a Hugging Face pipeline for the published OpenAlex title/abstract topic
# classifier (top_k=5 asks for the five best labels) and try it on a title-only input.
classifier_multi = pipeline(model="OpenAlex/bert-base-multilingual-cased-finetuned-openalex-topic-classification-title-abstract", top_k=5)
classifier_multi("""<TITLE>Supplemental Material: Estimating paleotidal constituents from Pliocene “tidal gauges”—an example from the paleo-Orinoco Delta, Trinidad""")

### Works as expected

In [0]:
# Two fixed sample inputs in the "<TITLE>...<ABSTRACT>..." text format: one with a
# title and an abstract, one with a title only. Later cells reuse both.
olfactory_input = """<TITLE>The Shape of the Olfactory Bulb Predicts Olfactory Function<ABSTRACT>The olfactory bulb (OB) plays a key role in the processing of olfactory information. A large body of research has shown that OB volumes correlate with olfactory function, which provides diagnostic and prognostic information in olfactory dysfunction. Still, the potential value of the OB shape remains unclear. Based on our clinical experience we hypothesized that the shape of the OB predicts olfactory function, and that it is linked to olfactory loss, age, and gender. The aim of this study was to produce a classification of OB shape in the human brain, scalable to clinical and research applications. Results from patients with the five most frequent causes of olfactory dysfunction (n = 192) as well as age/gender-matched healthy controls (n = 77) were included. Olfactory function was examined in great detail using the extended “Sniffin’ Sticks” test. A high-resolution structural T2-weighted MRI scan was obtained for all. The planimetric contours (surface in mm2) of OB were delineated manually, and then all surfaces were added and multiplied to obtain the OB volume in mm3. OB shapes were outlined manually and characterized on a selected slice through the posterior coronal plane tangential to the eyeballs. We looked at OB shapes in terms of convexity and defined two patterns/seven categories based on OB contours: convex"""

paleotidal_input = """<TITLE>Supplemental Material: Estimating paleotidal constituents from Pliocene “tidal gauges”—an example from the paleo-Orinoco Delta, Trinidad"""

# Classify both samples with the pipeline built in the previous cell.
print(classifier_multi(olfactory_input))
print(classifier_multi(paleotidal_input))

# Reference topics (id, display name) — presumably the expected labels for the
# olfactory sample; TODO confirm where these came from.
# 10971	Olfactory and Sensory Function Studies
# 11667	Advanced Chemical Sensor Technologies
# 14144	Neurological Disease Mechanisms and Treatments

In [0]:
from transformers import pipeline
import time

# Prepare inputs: alternate the two sample texts to build the benchmark batch.
X = 64
inputs = [olfactory_input, paleotidal_input] * (X // 2)
# Report against the real batch length: it equals X only when X is even.
n_inputs = len(inputs)

# Try different batch sizes
for bs in [32, 64]:
    print(f"\n🚀 Testing batch_size={bs}")

    # Re-initialize pipeline with new batch size
    # (this rebinds the notebook-level `classifier_multi`; after the loop it is
    # the pipeline built with the last batch size).
    classifier_multi = pipeline(
        model="OpenAlex/bert-base-multilingual-cased-finetuned-openalex-topic-classification-title-abstract",
        top_k=5,
        batch_size=bs
    )

    # Warm-up (important for GPU/XLA)
    _ = classifier_multi(inputs[:2])

    # Time batch inference with perf_counter: unlike time.time() it is a
    # high-resolution clock that never jumps with system clock adjustments.
    start = time.perf_counter()
    _ = classifier_multi(inputs)
    duration = time.perf_counter() - start

    print(f"⏱️  Total time: {duration:.2f} sec for {n_inputs} inputs")
    print(f"⚡ Avg time per input: {duration / n_inputs:.3f} sec")


In [0]:
# Look up the class id that the pipeline's model config assigns to this topic label.
label2id = classifier_multi.model.config.label2id
print(label2id["971: Olfactory Dysfunction in Health and Disease"])

In [0]:
# Print what index 1733 maps to in the Hugging Face model config and in the
# pickled inverse target vocabulary. The two labels should agree if both use the
# same index space — TODO confirm.
print(classifier_multi.model.config.id2label[1733])
print(cfg.inv_target_vocab[1733])

In [0]:
# Collect every (index, label) pair of the inverse target vocabulary whose label
# begins with the "971: " topic prefix.
matches = [
    entry
    for entry in cfg.inv_target_vocab.items()
    if entry[1].startswith("971: ")
]
print(matches)


## Use BERT Model Only for now...

In [0]:
from transformers import AutoConfig

# Load only the configuration of the published classifier, to inspect its
# label mapping.
auto_conf = AutoConfig.from_pretrained("OpenAlex/bert-base-multilingual-cased-finetuned-openalex-topic-classification-title-abstract")


In [0]:
# Dump the config's full id -> label mapping.
# NOTE(review): this prints every entry; consider showing only a slice if the
# mapping is large.
print(auto_conf.id2label)

In [0]:
# Label stored under index 972 in the pickled inverse target vocabulary.
cfg.inv_target_vocab[972]

In [0]:
# # /Volumes/openalex/works/models/topic_classifier_v1/full_model.keras
# pred_model.save("/dbfs/tmp/full_model.keras")
# print("✅ Full Keras model saved.")

#pred_model.save("/Volumes/openalex/works/models/topic_classifier_v1/tf_savedmodel", save_format="tf")
#print("✅ Saved model saved.")

In [0]:
from keras.models import load_model
# Reload the full Keras model saved under /Volumes.
# Original note: size 39,654,502 (checkpoint 39,650,086) — presumably parameter
# counts of the full model versus the checkpoint; TODO confirm.
test_model = load_model("/Volumes/openalex/works/models/topic_classifier_v1/full_model.keras")


In [0]:
# Show the layer-by-layer summary of the reloaded model.
test_model.summary()

In [0]:
# Report weight shapes only for layers whose name contains "output_layer".
for layer in test_model.layers:
    if "output_layer" not in layer.name:
        continue
    weights = layer.get_weights()
    print(f"{layer.name}: {[w.shape for w in weights]}")

## SUCCESS - raw BERT use (no hugging_face pipeline)

In [0]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np

# Fetch the tokenizer and the fine-tuned classifier by their hub name.
model_name = "OpenAlex/bert-base-multilingual-cased-finetuned-openalex-topic-classification-title-abstract"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# Sample to classify.
text = olfactory_input

# Encode to fixed-length (512) TensorFlow tensors, truncating longer inputs.
inputs = tokenizer(
    text,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

# Forward pass, then turn the [1, num_classes] logits into probabilities.
outputs = model(**inputs)
logits = outputs.logits
probs = tf.nn.softmax(logits, axis=1)

# Keep the five most probable classes of the single batch row.
topk = tf.math.top_k(probs, k=5)
indices, scores = (t.numpy()[0] for t in (topk.indices, topk.values))

# Translate class indices into their config labels.
labels = list(map(model.config.id2label.__getitem__, indices))

# Pair each label with its probability rounded to 4 decimals.
readable = [
    {"label": name, "score": round(float(prob), 4)}
    for name, prob in zip(labels, scores)
]

print(readable)
