In [0]:
from nltk import tokenize
from transformers import BertTokenizer, BertTokenizerFast
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import tensorflow as tf
from tensorflow import convert_to_tensor
from transformers import TFBertModel

# Artifact locations for the SDG classifier under /Volumes/openalex/works/models/sdg.
# Only SRC_SAVED_MODEL is read in this cell; SRC_H5 and SRC_KERAS are defined
# here but not used below.
SRC_H5 = "/Volumes/openalex/works/models/sdg/SDG-BERT-v1.1-AURORA_v5.h5"
SRC_KERAS = "/Volumes/openalex/works/models/sdg/SDG-BERT-v1.1-AURORA_v5.keras"
SRC_SAVED_MODEL = "/Volumes/openalex/works/models/sdg/saved_model"

# Load the SavedModel export (not the H5 file) and the tokenizer from their
# directories on the volume, then keep a handle on the 'serving_default'
# signature, which is what get_predictions() calls for inference.
model = tf.saved_model.load(SRC_SAVED_MODEL)
tokenizer = BertTokenizerFast.from_pretrained("/Volumes/openalex/works/models/sdg/tokenizer")
predict = model.signatures['serving_default']

In [0]:
# Print the signatures exposed by the loaded SavedModel.
print(model.signatures)

In [0]:
# Sample abstracts for trying the classifier by hand. The trailing comment on
# each closing line records the SDG the text is expected to match.
abstract1 = """
Climate change is intensifying heatwaves and flooding in low-income regions. 
We quantify national greenhouse-gas mitigation scenarios and model projected temperature anomalies through 2050. 
Our results show that rapid decarbonization, renewable energy deployment, and reforestation reduce extreme weather risk and improve public health outcomes. 
Policy implications include carbon pricing, grid-scale storage, and climate adaptation finance for vulnerable communities.
"""  # Likely SDG 13 (Climate action)

abstract2 = """
Access to safely managed drinking water remains unequal across rural districts. 
We evaluate a low-cost chlorination and filtration intervention using longitudinal water-quality tests and child health surveys. 
Households receiving the intervention saw significant reductions in diarrheal disease and time spent collecting water, especially among women and girls. 
Findings support scalable infrastructure and governance reforms for universal and equitable water services.
"""  # Likely SDG 6 (Clean water and sanitation)

In [0]:
# Display names keyed by "Goal <n>". The titles are listed in goal order, so the
# 1-based position in the list is the goal number.
# NOTE(review): Goal 15 is spelled "Life in Land" here; the UN title appears to be
# "Life on land". Left unchanged -- confirm whether consumers rely on this spelling.
goal_names = {
    f"Goal {number}": title
    for number, title in enumerate(
        [
            "No poverty",
            "Zero hunger",
            "Good health and well-being",
            "Quality Education",
            "Gender equality",
            "Clean water and sanitation",
            "Affordable and clean energy",
            "Decent work and economic growth",
            "Industry, innovation and infrastructure",
            "Reduced inequalities",
            "Sustainable cities and communities",
            "Responsible consumption and production",
            "Climate action",
            "Life below water",
            "Life in Land",
            "Peace, Justice and strong institutions",
            "Partnerships for the goals",
        ],
        start=1,
    )
}

def get_predictions(abstract: str):
    """Score one abstract against every SDG and return the results ranked.

    The text is lower-cased, tokenized to exactly 512 positions (truncated or
    padded), and passed to the module-level ``predict`` signature. Each value
    in the first row of the ``target_layer`` output is paired with
    "Goal <position>", counting from 1.

    Args:
        abstract: Text to classify.

    Returns:
        A list of dicts, each holding a float ``"prediction"`` and an ``"sdg"``
        dict (``@type``, ``id``, ``label``, ``code``, ``name``, ``type``),
        ordered from the highest prediction to the lowest.
    """
    encoded = tokenizer(
        abstract.lower(),
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="tf",
    )

    # The signature's keyword is `attention_masks` (plural), unlike the
    # tokenizer's `attention_mask` key.
    model_output = predict(
        input_ids=encoded["input_ids"],
        attention_masks=encoded["attention_mask"],
    )

    # One abstract per call, so only the first row of the batch is kept.
    # The values are used exactly as returned; no sigmoid is applied here.
    scores = model_output["target_layer"].numpy()[0]

    def _entry(sdg_number, score):
        # One response record in the established output shape.
        sdg_label = f"Goal {sdg_number}"
        return {
            "prediction": float(score),
            "sdg": {
                "@type": "sdg",
                "id": f"https://metadata.un.org/sdg/{sdg_number}",
                "label": sdg_label,
                "code": str(sdg_number),
                "name": goal_names[sdg_label],
                "type": "Goal",
            },
        }

    unranked = [_entry(position, score) for position, score in enumerate(scores, start=1)]
    # Highest prediction first.
    return sorted(unranked, key=lambda entry: entry["prediction"], reverse=True)

In [0]:
# Classify the drinking-water sample abstract; get_predictions() returns the
# goals sorted by descending prediction, so the slice shows the top five.
preds = get_predictions(abstract2)
preds[:5]  # top few

In [0]:
from transformers import BertTokenizer, BertTokenizerFast
# Alternative loading path: wrap the SavedModel's 'serving_default' endpoint as a Keras layer.
layer = tf.keras.layers.TFSMLayer(SRC_SAVED_MODEL, call_endpoint="serving_default")
# NOTE(review): this rebinds `tokenizer` to a directory under saved_model/, whereas the
# first cell loads it from .../sdg/tokenizer -- confirm which location is intended.
tokenizer = BertTokenizerFast.from_pretrained("/Volumes/openalex/works/models/sdg/saved_model/tokenizer")

 

In [0]:
import os
import tensorflow as tf
from transformers import TFBertModel
from transformers.models.bert import modeling_tf_bert as tfbert
from nltk import tokenize
from transformers import BertTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import tensorflow as tf
from tensorflow import convert_to_tensor
from transformers import TFBertModel


# Artifact locations; only SRC_H5 is read in this cell.
SRC_H5 = "/Volumes/openalex/works/models/sdg/SDG-BERT-v1.1-AURORA_v5.h5"
SRC_KERAS = "/Volumes/openalex/works/models/sdg/SDG-BERT-v1.1-AURORA_v5.keras"
SRC_SAVED_MODEL = "/Volumes/openalex/works/models/sdg/saved_model"

# Registry of the Hugging Face BERT layer classes handed to Keras while loading.
# Every class is registered twice: under its plain name and under the
# "Custom>"-prefixed alias that some saved files use.
custom_objects = {
    f"{prefix}{class_name}": getattr(tfbert, class_name)
    for prefix in ("", "Custom>")
    for class_name in (
        "TFBertMainLayer",
        "TFBertEmbeddings",
        "TFBertEncoder",
        "TFBertPooler",
    )
}

# Load the full model from the H5 file on the volume with the registry active.
with tf.keras.utils.custom_object_scope(custom_objects):
    model = load_model(SRC_H5)

In [0]:
# Export the loaded model in TensorFlow SavedModel format. The target is the same
# path as SRC_SAVED_MODEL, i.e. the directory the first cell loads from.
model.save("/Volumes/openalex/works/models/sdg/saved_model", save_format="tf")

In [0]:
%sh
# List the contents of the SavedModel export's variables/ directory.
ls -la /Volumes/openalex/works/models/sdg/saved_model/variables/

In [0]:
# Write only the model's weights to an H5 file under /tmp.
model.save_weights("/tmp/SDG-BERT-v1.1-AURORA_v5.weights.h5")

In [0]:
# Show the Keras summary of the currently loaded model.
model.summary()

In [0]:
import tensorflow as tf
from transformers.models.bert.modeling_tf_bert import (
    TFBertMainLayer, TFBertEmbeddings, TFBertEncoder, TFBertPooler
)

SRC_H5 = "/Volumes/openalex/works/models/sdg/SDG-BERT-v1.1-AURORA_v5.h5"

# BERT layer classes made available to Keras during loading. Each one is
# registered under its plain name and under the legacy "Custom>" alias.
custom_objects = {
    f"{prefix}{registered_name}": layer_class
    for prefix in ("", "Custom>")
    for registered_name, layer_class in (
        ("TFBertMainLayer", TFBertMainLayer),
        ("TFBertEmbeddings", TFBertEmbeddings),
        ("TFBertEncoder", TFBertEncoder),
        ("TFBertPooler", TFBertPooler),
    )
}

# Load the H5 model without compiling it. safe_mode=False is kept deliberately:
# it is needed for Keras 3 to accept this legacy graph.
with tf.keras.utils.custom_object_scope(custom_objects):
    model = tf.keras.models.load_model(SRC_H5, compile=False, safe_mode=False)


In [0]:
# Alternative attempt: load the same H5 path through the Hugging Face auto class,
# requesting hidden states in the output, then mark the model as non-trainable.
# Relies on SRC_H5 and the TFAutoModelForSequenceClassification import from earlier cells.
# NOTE(review): from_pretrained() is given a single .h5 file path here -- confirm
# that this is accepted rather than a model directory.
model = TFAutoModelForSequenceClassification.from_pretrained(SRC_H5, output_hidden_states=True)
model.trainable = False

In [0]:
# Alternative attempt: request legacy Keras and load the H5 path as a bare TFBertModel.
# (Despite the original heading, nothing in this cell clears custom objects.)
# NOTE(review): TF_USE_LEGACY_KERAS is set here after tensorflow/transformers were
# already imported in earlier cells -- confirm the setting still takes effect.
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"
from transformers import TFBertModel

model = TFBertModel.from_pretrained(SRC_H5)

# Earlier Keras-loader variant, kept commented out for reference:
# model = tf.keras.models.load_model(
#     SRC_H5,
#     custom_objects={'Custom>TFBertMainLayer': TFBertMainLayer},
#     compile=False,
#     safe_mode=False,   # <-- important for Keras 3 legacy graphs
# )

In [0]:
from pyspark.sql.types import *
import pyspark.sql.functions as F

def _with_doi_prefix(key, value):
    """Return the map value unchanged, except the "doi" entry, which gets "https://doi.org/" prepended."""
    prefixed = F.concat(F.lit("https://doi.org/"), value)
    return F.when(key == "doi", prefixed).otherwise(value)


# Works that have a DOI, reduced to the `ids` map with the DOI value rewritten.
df = (
    spark.table("openalex.works.openalex_works")
    .filter("ids.doi IS NOT NULL")
    .select("ids")
    .withColumn("ids", F.transform_values("ids", _with_doi_prefix))
)
display(df)

In [0]:
# Connection settings fetched from the "postgres-works" secret scope. Each pair is
# (dict field, secret key); only "username" differs from its secret key ("user").
secret = {
    field: dbutils.secrets.get(scope="postgres-works", key=secret_key)
    for field, secret_key in (
        ("username", "user"),
        ("password", "password"),
        ("host", "host"),
        ("dbname", "dbname"),
        ("port", "port"),
        ("engine", "engine"),
    )
}

# Read paper_id and predictions from mid.work_sdg, split into 512 partitions
# over the paper_id range given by the lower and upper bounds.
df = (
    spark.read.format("postgresql")
    .options(
        dbtable="(select paper_id, predictions from mid.work_sdg) as new_table",
        host=secret["host"],
        port=secret["port"],
        database=secret["dbname"],
        user=secret["username"],
        password=secret["password"],
        partitionColumn="paper_id",
        lowerBound="0",
        upperBound="4413863578",
        numPartitions="512",
    )
    .load()
)

# Overwrite the Delta table with what was just read.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("openalex.mid.work_sdg")
)

In [0]:
# Connection settings fetched from the "postgres-works" secret scope. Each pair is
# (dict field, secret key); only "username" differs from its secret key ("user").
secret = {
    field: dbutils.secrets.get(scope="postgres-works", key=secret_key)
    for field, secret_key in (
        ("username", "user"),
        ("password", "password"),
        ("host", "host"),
        ("dbname", "dbname"),
        ("port", "port"),
        ("engine", "engine"),
    )
}

# Read every column of mid.institution_ancestors_mv in 256 partitions.
# NOTE(review): the partition column is "author_id" with an upper bound of
# 10000109602 -- confirm that this view really has an author_id column and that
# these bounds were not carried over from a different table's load.
df = (
    spark.read.format("postgresql")
    .options(
        dbtable="(select * from mid.institution_ancestors_mv) as new_table",
        host=secret["host"],
        port=secret["port"],
        database=secret["dbname"],
        user=secret["username"],
        password=secret["password"],
        partitionColumn="author_id",
        lowerBound="0",
        upperBound="10000109602",
        numPartitions="256",
    )
    .load()
)

# Overwrite the Delta table with what was just read.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("openalex.mid.institution_ancestors")
)

In [0]:
%sql
-- Row count of the openalex.mid.author table.
SELECT count(*) FROM openalex.mid.author