In [None]:
!pip install keybert
!pip install keybert[flair]
!pip install keybert[gensim]
!pip install keybert[spacy]
!pip install keybert[use]
!pip install keybert sentence-transformers flair spacy gensim tensorflow-hub transformers
!python -m spacy download en_core_web_md
!pip install sklearn rouge_score
!pip install datasets
!pip install datasets
!pip install rouge_score


In [None]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from flair.embeddings import TransformerDocumentEmbeddings
import spacy
import gensim.downloader as api
import tensorflow_hub as hub
import flair
import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, AutoModel, AutoTokenizer, T5ForConditionalGeneration, BartForConditionalGeneration
from transformers import pipeline
from datasets import load_dataset
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Fetch the Inspec keyphrase dataset from the Hugging Face Hub.
# trust_remote_code=True allows the dataset repository's own loading code to run locally.
ds = load_dataset(
    path="taln-ls2n/inspec",
    trust_remote_code=True,
)

In [None]:
# Column-wise view of the training split:
#   'abstract'   -> the documents keywords are extracted from
#   'keyphrases' -> the reference (ground-truth) keyphrases per document
texts = list(ds["train"]["abstract"])
ground_truth_keywords = list(ds["train"]["keyphrases"])


In [None]:
# Embedding back-ends, keyed by a short label. Insertion order matters:
# later cells iterate over this mapping.
embedding_models = dict([
    ("sentence-transformers", SentenceTransformer("all-MiniLM-L6-v2")),
    ("flair", TransformerDocumentEmbeddings("bert-base-uncased")),
    ("roberta", TransformerDocumentEmbeddings("roberta-base")),
    ("spacy", spacy.load("en_core_web_md")),
    ("gensim", api.load("glove-wiki-gigaword-100")),  # GloVe, 100 dimensions
    ("use", hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")),
])

# Hugging Face checkpoints, grouped by the Auto* class they are loaded with below.
bert_models = dict([
    ("bert-keyword-extractor", "yanekyuk/bert-keyword-extractor"),
])

generative_models = dict([
    ("vlt5-base-keywords", "Voicelab/vlt5-base-keywords"),
    ("t5", "t5-small"),
    ("bart", "facebook/bart-large-cnn"),
])

# One tokenizer per Hugging Face checkpoint (encoder-only entries first, then seq2seq).
tokenizers = {
    label: AutoTokenizer.from_pretrained(repo_id)
    for label, repo_id in list(bert_models.items()) + list(generative_models.items())
}

# Load the checkpoints themselves and register them in the same mapping as the
# embedding back-ends: AutoModel for the encoder-only group, AutoModelForSeq2SeqLM
# for the generative group.
embedding_models.update(dict(
    (label, AutoModel.from_pretrained(repo_id))
    for label, repo_id in bert_models.items()
))
embedding_models.update(dict(
    (label, AutoModelForSeq2SeqLM.from_pretrained(repo_id))
    for label, repo_id in generative_models.items()
))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
#function to extract embeddings
def get_embedding(model_name, text):
    """Return a single 1-D NumPy embedding of ``text`` from the named model.

    The model (and, for Hugging Face checkpoints, its tokenizer) is looked up
    in the module-level ``embedding_models`` / ``tokenizers`` mappings.

    Args:
        model_name: Key into ``embedding_models``. Supported values are
            "sentence-transformers", "flair", "roberta", "spacy", "gensim",
            "use", "bert-keyword-extractor", "vlt5-base-keywords", "t5"
            and "bart".
        text: The string to embed.

    Returns:
        A 1-D ``numpy.ndarray``. For "gensim" this is the mean of the vectors
        of the in-vocabulary lower-cased whitespace tokens, or a zero vector
        of the model's own dimensionality when no token is in the vocabulary.
        For Hugging Face checkpoints it is the mean over the token axis of
        the (encoder) ``last_hidden_state``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported values.
    """
    #Sentence Transformers
    if model_name == "sentence-transformers":
        return embedding_models[model_name].encode([text])[0]

    #Flair or RoBERTa-based models
    if model_name in ("flair", "roberta"):
        sentence = flair.data.Sentence(text)
        embedding_models[model_name].embed(sentence)
        return sentence.embedding.cpu().detach().numpy()

    #spaCy embeddings
    if model_name == "spacy":
        return embedding_models[model_name](text).vector

    #Gensim embeddings
    if model_name == "gensim":
        keyed_vectors = embedding_models[model_name]
        vectors = [
            keyed_vectors.get_vector(word)
            for word in text.lower().split()
            if word in keyed_vectors
        ]
        if vectors:
            return np.mean(vectors, axis=0)
        # Size the fallback from the model instead of hard-coding 100, so a
        # different set of word vectors still yields a compatible shape.
        return np.zeros(keyed_vectors.vector_size)

    #Universal Sentence Encoder (USE)
    if model_name == "use":
        return embedding_models[model_name]([text])[0].numpy()

    #Hugging Face checkpoints (encoder-only and seq2seq)
    if model_name in ("bert-keyword-extractor", "vlt5-base-keywords", "t5", "bart"):
        tokenizer = tokenizers[model_name]
        model = embedding_models[model_name]
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Seq2seq checkpoints are loaded with AutoModelForSeq2SeqLM: calling the
        # full model needs decoder inputs and its output exposes no
        # `last_hidden_state`. Embed with the encoder stack only.
        config = getattr(model, "config", None)
        if getattr(config, "is_encoder_decoder", False):
            model = model.get_encoder()

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)

        # (1, tokens, hidden) -> mean over tokens -> (hidden,)
        return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

    raise ValueError(f"Unsupported model: {model_name}")


In [None]:
#run KeyBERT extraction for embedding models
results = {}

#define KeyBERT parameters
keybert_params = {
    "keyphrase_ngram_range": (1, 3),  # candidate phrases of 1 to 3 words
    "top_n": 15,
    "stop_words": None,
    "use_mmr": True,
    # NOTE(review): with MMR, diversity=1.0 presumably removes the
    # document-relevance term after the first pick - confirm this is intended.
    "diversity": 1.0,
}


for model_name, model in embedding_models.items():
    print(f"Testing model: {model_name}")

    #skip generative models
    if model_name in generative_models:
        print(f"Skipping {model_name}, as it requires text generation.")
        continue

    # KeyBERT's documented way of using a Hugging Face checkpoint is a
    # feature-extraction pipeline; a bare AutoModel is not one of its backends.
    # NOTE(review): KeyBERT appears to fall back silently to its default
    # sentence-transformers model for unrecognised objects - confirm for the
    # installed version.
    if model_name in bert_models:
        model = pipeline(
            "feature-extraction",
            model=model,
            tokenizer=tokenizers[model_name],
        )

    kw_model = KeyBERT(model=model)

    # extract_keywords returns (phrase, score) pairs; keep the lower-cased phrase only
    extracted_keywords = [
        [kw[0].lower() for kw in kw_model.extract_keywords(text, **keybert_params)]
        for text in texts
    ]

    results[model_name] = extracted_keywords

In [None]:
#function for generative keyword extraction
def generate_keywords_with_model(model_name, text, task_prefix=None):
    """Generate text with a seq2seq model and return it as lower-cased tokens.

    The tokenizer and model are looked up by ``model_name`` in the
    module-level ``tokenizers`` / ``embedding_models`` mappings.

    Args:
        model_name: Key of the generative model to use.
        text: Source document.
        task_prefix: Optional string prepended to ``text`` before tokenizing.
            When ``None`` (default), "Keywords: " is used for
            "vlt5-base-keywords" (the prefix that model's card uses for
            keyword generation) and no prefix for any other model.

    Returns:
        List of lower-cased whitespace-separated tokens from the decoded
        output, with surrounding list separators (``,`` and ``;``) stripped
        and empty tokens dropped.
    """
    tokenizer = tokenizers[model_name]
    model = embedding_models[model_name]

    if task_prefix is None:
        task_prefix = "Keywords: " if model_name == "vlt5-base-keywords" else ""

    inputs = tokenizer(task_prefix + text, return_tensors="pt", truncation=True, padding=True)

    #define generation parameters
    # Deterministic beam search. temperature / top_k / top_p were removed: they
    # only take effect with do_sample=True, so they were ignored here and just
    # produced warnings.
    generation_params = {
        "max_length": 100,
        "num_beams": 5,
        "repetition_penalty": 1.0,
    }

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_params)

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()

    # Strip list separators glued to tokens ("learning," -> "learning").
    stripped = (token.strip(",;") for token in decoded.split())
    return [token for token in stripped if token]

In [None]:
# Produce one token list per document with every generative checkpoint and
# store it under the model's label next to the KeyBERT results.
for model_name in generative_models:
    print(f"Generating keywords with {model_name}...")
    results[model_name] = [
        generate_keywords_with_model(model_name, document) for document in texts
    ]

In [None]:
# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming, shared by all models
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# model label -> {metric name -> mean F-measure over all documents}
rouge_results = {}

for model_name, extracted in results.items():
    print(f"Calculating ROUGE for {model_name}...")

    # Each keyword list is flattened into one space-separated string and scored
    # as (reference, prediction); zip pairs documents by position.
    per_document = [
        scorer.score(" ".join(reference), " ".join(prediction))
        for reference, prediction in zip(ground_truth_keywords, extracted)
    ]

    # Average the F-measure of every metric across documents
    rouge_results[model_name] = {
        metric: np.mean([doc_scores[metric].fmeasure for doc_scores in per_document])
        for metric in ("rouge1", "rouge2", "rougeL")
    }

# Report: one header line per model followed by its three metrics
for model, scores in rouge_results.items():
    report = [f"\nROUGE Scores for {model}:"]
    report.extend(f"  {metric}: {score:.4f}" for metric, score in scores.items())
    print("\n".join(report))


In [None]:
#store cosine similarity results
# model label -> mean cosine similarity between reference and predicted keywords
cosine_similarities = {}

# The reference embeddings do not depend on the model being evaluated, so they
# are computed once here instead of once per model. Documents without
# reference keyphrases get None and are skipped below.
gt_embeddings = [
    get_embedding("sentence-transformers", " ".join(gt)) if gt else None
    for gt in ground_truth_keywords
]

for model_name, extracted in results.items():
    print(f"Calculating Cosine Similarity for {model_name}...")

    cos_sim = []

    for gt_embedding, pred in zip(gt_embeddings, extracted):
        # skip documents with no reference or no prediction
        if gt_embedding is None or not pred:
            continue

        #get embedding of the space-joined predicted keywords
        pred_embedding = get_embedding("sentence-transformers", " ".join(pred))

        #compute cosine similarity (1x1 matrix -> scalar)
        similarity = cosine_similarity([gt_embedding], [pred_embedding])[0][0]
        cos_sim.append(similarity)

    #compute average similarity; 0.0 when no document could be scored
    cosine_similarities[model_name] = np.mean(cos_sim) if cos_sim else 0.0

#print results
for model, sim in cosine_similarities.items():
    print(f"\nCosine Similarity for {model}: {sim:.4f}")
