# Mapping ML articles to ML tasks

The ML tasks are defined in the ML-Ontology (ml-ontology/ML_Ontology.ttl)

In [3]:
# Imports

import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

In [None]:
# Cell A: Rask diagnose av GPU/PyTorch
import torch

print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
print("torch cuda:", torch.version.cuda)
print("device count:", torch.cuda.device_count())

if torch.cuda.is_available():
    print("gpu name:", torch.cuda.get_device_name(0))
    print("capability:", torch.cuda.get_device_capability(0)) 


torch: 2.9.1+cu128
cuda available: True
torch cuda: 12.8
device count: 1
gpu name: Tesla P100-PCIE-16GB
capability: (6, 0)


In [10]:
ml_tasks_path = "../data/ml_tasks.csv"
articles_path = "../data/ml_articles_dataset.csv"

model_name = "sentence-transformers/all-mpnet-base-v2"
TOP_K = 5
THRESHOLD = 0.35
BATCH_SIZE = 512


In [None]:
# Load data
tasks = pd.read_csv(ml_tasks_path)
articles = pd.read_csv(articles_path)

# Task profile : label + description
tasks["task_text"] = (tasks["taskLabel"].fillna("") + ". " + tasks["description"].fillna("")).str.strip()

# Document text : abstract + title
articles["doc_text"] = (articles["title"].fillna("") + ". " + articles["clean_abs"].fillna("")).str.strip()





Unnamed: 0,doi,title
0,10.3390/asi6050076,Measuring Carbon in Cities and Their Buildings...
1,10.1016/j.resconrec.2023.107073,Predictive modeling for the quantity of recycl...
2,10.30638/eemj.2023.018,END-OF-LIFE VEHICLES ASSESSMENT OF THE AUTOMOB...
3,10.1115/DETC2023-114718,PREDICTING THE QUANTITY OF RECYCLED END-OF-LIF...
4,10.1007/978-3-031-69626-8_78,Machine Learning Integration in LCA: Addressin...


In [16]:
tasks[["task", "taskLabel"]].head()

Unnamed: 0,task,taskLabel
0,http://h-da.de/ml-ontology/action_recognition,action recognition
1,http://h-da.de/ml-ontology/anomaly_detection,anomaly detection
2,http://h-da.de/ml-ontology/association_rule_le...,association rule learning
3,http://h-da.de/ml-ontology/audio_classification,audio classification
4,http://h-da.de/ml-ontology/audio_regression,audio regression


In [18]:
articles[["doi", "title", "doc_text"]].head()

Unnamed: 0,doi,title,doc_text
0,10.3390/asi6050076,Measuring Carbon in Cities and Their Buildings...,Measuring Carbon in Cities and Their Buildings...
1,10.1016/j.resconrec.2023.107073,Predictive modeling for the quantity of recycl...,Predictive modeling for the quantity of recycl...
2,10.30638/eemj.2023.018,END-OF-LIFE VEHICLES ASSESSMENT OF THE AUTOMOB...,END-OF-LIFE VEHICLES ASSESSMENT OF THE AUTOMOB...
3,10.1115/DETC2023-114718,PREDICTING THE QUANTITY OF RECYCLED END-OF-LIF...,PREDICTING THE QUANTITY OF RECYCLED END-OF-LIF...
4,10.1007/978-3-031-69626-8_78,Machine Learning Integration in LCA: Addressin...,Machine Learning Integration in LCA: Addressin...


In [22]:
# Load embedding model

model = SentenceTransformer(model_name)

# Tokenizer + max length
tokenizer = model.tokenizer
max_len = model.get_max_seq_length()

print("max length :", max_len)

max length : 384


In [None]:
# Embedding + normalization
def get_embedding(text: str):
    if not isinstance(text, str):
        text = ""
    return model.encode(
        text,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,  # dot-cosine
    )


In [24]:
# Cell 5: Chunking-embedding for lange tekster (unngår truncation)
def embed_with_chunking(text, max_len=max_len):
    # Håndter NaN/ikke-streng
    if not isinstance(text, str):
        text = ""

    # Tokeniser til word pieces
    tokens = tokenizer.tokenize(text)

    # Kort nok -> embed direkte
    if len(tokens) <= max_len:
        return get_embedding(text)

    # Del opp i biter som passer maks-lengden
    chunks = [
        tokenizer.convert_tokens_to_string(tokens[i:i + max_len])
        for i in range(0, len(tokens), max_len)
    ]

    # Embed hver chunk
    emb_chunks = [get_embedding(chunk) for chunk in chunks]

    # Gjennomsnitt (beholder dimensjon)
    return np.mean(emb_chunks, axis=0)


In [25]:
# Cell 6: Batchet chunking-embedding (raskere enn 1 og 1)
def embed_texts_chunked(texts, batch_size=BATCH_SIZE):
    embs = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        for t in batch:
            embs.append(embed_with_chunking(t))
    return np.vstack(embs)


In [27]:
# Cell 7: Lag embeddings for tasks og artikler
task_emb = embed_texts_chunked(tasks["task_text"].tolist())
doc_emb  = embed_texts_chunked(articles["doc_text"].tolist())

task_emb.shape, doc_emb.shape


  0%|          | 0/1 [00:00<?, ?it/s]

AcceleratorError: CUDA error: no kernel image is available for execution on the device
Search for `cudaErrorNoKernelImageForDevice' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
