In [33]:
import numpy as np
import pandas as pd

In [1]:
import transformers

In [222]:
def calculate_f1_score(actual_keyphrases, predicted_keyphrases):
    actual_set = set([s.lower() for s in actual_keyphrases])
    predicted_set = set(s.lower() for s in predicted_keyphrases)

    true_positives = len(actual_set.intersection(predicted_set))
    precision = true_positives / len(predicted_set) if predicted_set else 0
    recall = true_positives / len(actual_set) if actual_set else 0

    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)
    print(f"Precision: {precision} \t Reacll: {recall} \t F1 Score: {f1_score}")
    return f1_score


actual_keyphrases = ['keyphrase one', 'keyphrase two', 'keyphrase three']
predicted_keyphrases = ['keyphrase one', 'keyphrase four', 'keyphrase two']
f1_score = calculate_f1_score(actual_keyphrases, predicted_keyphrases)
print(f"F1 Score: {f1_score}")

Precision: 0.6666666666666666 	 Reacll: 0.6666666666666666 	 F1 Score: 0.6666666666666666
F1 Score: 0.6666666666666666


In [223]:
def convert_to_unigrams(keyphrases):
    """Convert a list of keyphrases to a set of unigrams."""
    return set(unigram for keyphrase in keyphrases for unigram in keyphrase.split())

def partial_f1_score(gold_keyphrases, extracted_keyphrases):
    """Calculate the F1 score for partial matches between two lists of keyphrases."""
    gold_unigrams = convert_to_unigrams(gold_keyphrases)
    extracted_unigrams = convert_to_unigrams(extracted_keyphrases)

    true_positives = len(gold_unigrams.intersection(extracted_unigrams))
    false_positives = len(extracted_unigrams - gold_unigrams)
    false_negatives = len(gold_unigrams - extracted_unigrams)

    if true_positives == 0:
        return 0  # Return 0 to avoid division by zero if there are no true positives

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return f1_score

# Example usage
gold_keyphrases = ['machine learning', 'deep learning', 'neural networks']
extracted_keyphrases = ['learning in machines', 'networks', 'deep neural learning']

f1_score = partial_f1_score(gold_keyphrases, extracted_keyphrases)
print(f"F1 Score for partial match: {f1_score}")


F1 Score for partial match: 0.7272727272727272


In [31]:
from datasets import load_dataset

dataset = load_dataset("taln-ls2n/inspec")

In [34]:
df = pd.DataFrame(dataset['train'])

In [247]:
test_df = pd.DataFrame(dataset['test'])
validation_df = pd.DataFrame(dataset['validation'])

In [36]:
def lowercase(my_list):
    return [i.lower() for i in my_list]

In [40]:
df['abstract'] = df['abstract'].apply(lambda x: x.lower())
df['keyphrases'] = df['keyphrases'].apply(lowercase)

In [41]:
df

Unnamed: 0,id,title,abstract,keyphrases,prmu
0,761,Towards a NMR implementation of a quantum latt...,recent theoretical results suggest that an arr...,"[nmr implementation, quantum lattice gas algor...","[P, P, P, P, P, P, P, M]"
1,724,Banking on SMA funds [separately managed accou...,from investment management to technology to ba...,"[separately managed accounts, investment manag...","[P, P, P, P, P, P]"
2,1371,Design methodology for diagnostic strategies f...,this paper presents a method for the construct...,"[design methodology, diagnostic strategies, in...","[P, P, P, P, P]"
3,1334,A shy invariant of graphs,moving from a well known result of p.l. hammer...,"[graph invariant, induced odd cycles, minimum ...","[P, P, P, P, P, P]"
4,1419,PacketVideo. One step ahead of the streaming w...,"go beyond the hype, however, and it's clear th...","[packetvideo, wireless devices, mpeg-4, wirele...","[P, P, P, P, P, P, P, R]"
...,...,...,...,...,...
995,1124,Data extraction from the Web based on pre-defi...,"with the development of the internet, the worl...","[data extraction, schema, internet, informatio...","[P, P, P, P, P, R, M, U, U]"
996,118,Sensorless control of induction motor drives,controlled induction motor drives without mech...,"[sensorless control, induction motor drives, r...","[P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, ..."
997,635,Detection and estimation of abrupt changes in ...,detection of change-points in normal means is ...,"[generalized likelihood ratio test statistic, ...","[P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, ..."
998,1260,A dataflow computer which accelerates executio...,"in the dataflow machine, it is important to av...","[dataflow computer, sequential programs, prece...","[P, P, P, P, P, P, P, P, P, R, M, M]"


In [2]:
# Model parameters
from transformers import (
    Text2TextGenerationPipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)


class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs):
        results = super().postprocess(
            model_outputs=model_outputs
        )
        return [[keyphrase.strip() for keyphrase in result.get("generated_text").split(self.keyphrase_sep_token) if keyphrase != ""] for result in results]


In [3]:
# Load pipeline
model_name = "ml6team/keyphrase-generation-t5-small-inspec"
generator = KeyphraseGenerationPipeline(model=model_name)

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

In [19]:
text = 'Recent theoretical results suggest that an array of quantum information processors communicating via classical channels can be used to solve fluid dynamics problems. Quantum lattice-gas algorithms (QLGA) running on such architectures have been shown to solve the diffusion equation and the nonlinear Burgers equations. In this report, we describe progress towards an ensemble nuclear magnetic resonance (NMR) implementation of a QLGA that solves the diffusion equation. The methods rely on NMR techniques to encode an initial mass density into an ensemble of two-qubit quantum information processors. Using standard pulse techniques, the mass density can then manipulated and evolved through the steps of the algorithm. We provide the experimental results of our first attempt to realize the NMR implementation. The results qualitatively follow the ideal simulation, but the observed implementation errors highlight the need for improved control'

keyphrases = generator(text)

print(keyphrases)

[['quantum information processors', 'fluid dynamics problems', 'diffusion equation', 'nonlinear']]




In [177]:
import spacy
from keybert import KeyBERT

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text
# text = "The quick brown fox jumps over the lazy dog. It barked and played in the garden."

# Process the text with spaCy for POS tagging
doc = nlp(text)

# Filter words based on POS tags
# categories = ['NOUN', 'PROPN', 'VERB', 'ADJ']
# filtered_text = " ".join([token.text for token in doc if token.pos_ in categories])

# Now, use KeyBERT to extract keywords from the filtered text
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=100)

# Print all extracted keywords and their scores
all_words = [word for word,_ in keywords if _>0]
keywords_score_dict = {}
for word, score in keywords:
    if score>0:
        print(f"{word}: {score}")
        keywords_score_dict[word]=score

qubit: 0.3605
quantum: 0.3381
nmr: 0.3305
lattice: 0.2493
diffusion: 0.2144
processors: 0.2031
dynamics: 0.2009
simulation: 0.187
density: 0.1866
ensemble: 0.1732
gas: 0.1584
algorithms: 0.153
implementation: 0.1522
pulse: 0.1501
magnetic: 0.1438
architectures: 0.142
qlga: 0.1381
resonance: 0.1371
algorithm: 0.1349
steps: 0.1305
encode: 0.1301
nonlinear: 0.1273
through: 0.1246
fluid: 0.1196
channels: 0.1072
via: 0.0968
nuclear: 0.0946
initial: 0.0865
communicating: 0.0802
burgers: 0.0801
classical: 0.0785
information: 0.0746
array: 0.0745
follow: 0.0654
techniques: 0.0615
first: 0.061
theoretical: 0.058
progress: 0.0565
qualitatively: 0.0538
running: 0.0533
been: 0.0529
recent: 0.05
have: 0.0479
observed: 0.0463
in: 0.046
equations: 0.0453
solves: 0.0452
an: 0.0447
mass: 0.0321
then: 0.0262
the: 0.0233
describe: 0.0229
evolved: 0.0168
improved: 0.0163
two: 0.0163
methods: 0.0126
suggest: 0.0104
shown: 0.0099
manipulated: 0.0071
experimental: 0.0067
on: 0.0052
control: 0.0046


In [178]:
sorted(all_words)

['algorithm',
 'algorithms',
 'an',
 'architectures',
 'array',
 'been',
 'burgers',
 'channels',
 'classical',
 'communicating',
 'control',
 'density',
 'describe',
 'diffusion',
 'dynamics',
 'encode',
 'ensemble',
 'equations',
 'evolved',
 'experimental',
 'first',
 'fluid',
 'follow',
 'gas',
 'have',
 'implementation',
 'improved',
 'in',
 'information',
 'initial',
 'lattice',
 'magnetic',
 'manipulated',
 'mass',
 'methods',
 'nmr',
 'nonlinear',
 'nuclear',
 'observed',
 'on',
 'processors',
 'progress',
 'pulse',
 'qlga',
 'qualitatively',
 'quantum',
 'qubit',
 'recent',
 'resonance',
 'running',
 'shown',
 'simulation',
 'solves',
 'steps',
 'suggest',
 'techniques',
 'the',
 'then',
 'theoretical',
 'through',
 'two',
 'via']

In [179]:
true_values = []
for i in df['keyphrases'][0]:
    true_values += i.split(" ")
true_values = list(set(true_values))
true_values

['magnetic',
 'information',
 'processors',
 'implementation',
 'nmr',
 'burgers',
 'algorithm',
 'nuclear',
 'diffusion',
 'gas',
 'quantum',
 'two-qubit',
 'dynamics',
 'nonlinear',
 'problems',
 'lattice',
 'information.processors',
 'equations',
 'fluid',
 'equation',
 'resonance']

In [180]:
calculate_f1_score(true_values, all_words)

0.4096385542168674

In [181]:
def get_sequential_tokens(doc, interested_pos, lower):
    # to get sequential tokens from the text
    sentences = []
    for sent in doc.sents:
        selected_words = []
        for token in sent:
            if token.pos_ in interested_pos:
                if lower:
                    selected_words.append(token.text.lower())
                else:
                    selected_words.append(token.text)
            else:
                selected_words.append("*")
        sentences.append(selected_words)
    return sentences

def get_n_grams(sentences):
    # to get n_grams outta sequential tokens
    n_grams = []
    for sent in sentences:
        temp_list = []
        for word in sent:
            if word == '*':
                if len(temp_list)!=0:
                    n_grams.append(temp_list)
                    temp_list = []
            else:
                temp_list.append(word)
    return n_grams

In [214]:
def find_longest_sequence_and_scores(n_grams, unigrams, keywords_score_dict):
    longest_sequences = []
    
    for n_gram in n_grams:
        current_sequence = []
        longest_sequence = []
        
        for word in n_gram:
            if word in unigrams:
                current_sequence.append(word)
                if len(current_sequence) > len(longest_sequence):
                    longest_sequence = current_sequence.copy()
            else:
                current_sequence = []
        
        if len(longest_sequence)>0:
            longest_sequences.append(longest_sequence)
    
    longest_sequences = [' '.join(i) for i in sorted(longest_sequences, key=len, reverse=True)]

    final_scores = {}
    for seq in longest_sequences:
        score = 0
        for word in seq.split(" "):
            score += keywords_score_dict[word]
        final_scores[seq] = score
    
    return longest_sequences, final_scores

In [215]:
def lemmed(my_list):
    lemmed_list = []
    for seq in my_list:
        lemmed_list.append(' '.join([token.lemma_ for token in nlp(seq)]))
    return lemmed_list

In [183]:
interested_pos = ['NOUN', 'PROPN', 'VERB', 'ADJ']

In [184]:
doc = nlp(df['abstract'][0].replace('-',' '))

In [185]:
sents = get_sequential_tokens(doc, interested_pos, lower=False)
n_grams = get_n_grams(sents)

In [186]:
n_grams

[['recent', 'theoretical', 'results', 'suggest'],
 ['array'],
 ['quantum', 'information', 'processors', 'communicating'],
 ['classical', 'channels'],
 ['used'],
 ['solve', 'fluid', 'dynamics', 'problems'],
 ['quantum', 'lattice', 'gas', 'algorithms'],
 ['qlga'],
 ['running'],
 ['such', 'architectures'],
 ['shown'],
 ['solve'],
 ['diffusion', 'equation'],
 ['nonlinear', 'burgers', 'equations'],
 ['report'],
 ['describe', 'progress'],
 ['ensemble', 'nuclear', 'magnetic', 'resonance'],
 ['nmr'],
 ['implementation'],
 ['qlga'],
 ['solves'],
 ['diffusion', 'equation'],
 ['methods', 'rely'],
 ['nmr', 'techniques'],
 ['encode'],
 ['initial', 'mass', 'density'],
 ['ensemble'],
 ['qubit', 'quantum', 'information', 'processors'],
 ['using', 'standard', 'pulse', 'techniques'],
 ['mass', 'density'],
 ['manipulated'],
 ['evolved'],
 ['steps'],
 ['algorithm'],
 ['provide'],
 ['experimental', 'results'],
 ['first', 'attempt'],
 ['realize'],
 ['nmr', 'implementation'],
 ['results'],
 ['follow'],
 ['id

In [187]:
sorted(all_words)

['algorithm',
 'algorithms',
 'an',
 'architectures',
 'array',
 'been',
 'burgers',
 'channels',
 'classical',
 'communicating',
 'control',
 'density',
 'describe',
 'diffusion',
 'dynamics',
 'encode',
 'ensemble',
 'equations',
 'evolved',
 'experimental',
 'first',
 'fluid',
 'follow',
 'gas',
 'have',
 'implementation',
 'improved',
 'in',
 'information',
 'initial',
 'lattice',
 'magnetic',
 'manipulated',
 'mass',
 'methods',
 'nmr',
 'nonlinear',
 'nuclear',
 'observed',
 'on',
 'processors',
 'progress',
 'pulse',
 'qlga',
 'qualitatively',
 'quantum',
 'qubit',
 'recent',
 'resonance',
 'running',
 'shown',
 'simulation',
 'solves',
 'steps',
 'suggest',
 'techniques',
 'the',
 'then',
 'theoretical',
 'through',
 'two',
 'via']

In [194]:
res, scores = find_longest_sequence_and_scores(n_grams, all_words, keywords_score_dict)
res

['quantum information processors communicating',
 'quantum lattice gas algorithms',
 'ensemble nuclear magnetic resonance',
 'qubit quantum information processors',
 'nonlinear burgers equations',
 'initial mass density',
 'recent theoretical',
 'classical channels',
 'fluid dynamics',
 'describe progress',
 'nmr techniques',
 'pulse techniques',
 'mass density',
 'nmr implementation',
 'observed implementation',
 'array',
 'qlga',
 'running',
 'architectures',
 'shown',
 'diffusion',
 'nmr',
 'implementation',
 'qlga',
 'solves',
 'diffusion',
 'methods',
 'encode',
 'ensemble',
 'manipulated',
 'evolved',
 'steps',
 'algorithm',
 'experimental',
 'first',
 'follow',
 'simulation']

In [195]:
scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
scores

{'qubit quantum information processors': 0.9763,
 'quantum lattice gas algorithms': 0.8988,
 'quantum information processors communicating': 0.696,
 'ensemble nuclear magnetic resonance': 0.5487,
 'nmr implementation': 0.4827,
 'nmr techniques': 0.392,
 'nmr': 0.3305,
 'fluid dynamics': 0.3205,
 'initial mass density': 0.30519999999999997,
 'nonlinear burgers equations': 0.2527,
 'mass density': 0.21869999999999998,
 'diffusion': 0.2144,
 'pulse techniques': 0.2116,
 'observed implementation': 0.1985,
 'simulation': 0.187,
 'classical channels': 0.1857,
 'ensemble': 0.1732,
 'implementation': 0.1522,
 'architectures': 0.142,
 'qlga': 0.1381,
 'algorithm': 0.1349,
 'steps': 0.1305,
 'encode': 0.1301,
 'recent theoretical': 0.10800000000000001,
 'describe progress': 0.0794,
 'array': 0.0745,
 'follow': 0.0654,
 'first': 0.061,
 'running': 0.0533,
 'solves': 0.0452,
 'evolved': 0.0168,
 'methods': 0.0126,
 'shown': 0.0099,
 'manipulated': 0.0071,
 'experimental': 0.0067}

In [196]:
final_res = list(scores.keys())[:10]

In [217]:
final_res

['qubit quantum information processors',
 'quantum lattice gas algorithms',
 'quantum information processors communicating',
 'ensemble nuclear magnetic resonance',
 'nmr implementation',
 'nmr techniques',
 'nmr',
 'fluid dynamics',
 'initial mass density',
 'nonlinear burgers equations']

In [198]:
df['keyphrases'][0]

['nmr implementation',
 'quantum lattice gas algorithm',
 'quantum information processors',
 'fluid dynamics problems',
 'diffusion equation',
 'nonlinear burgers equations',
 'nuclear magnetic resonance',
 'two-qubit quantum information.processors']

In [224]:
calculate_f1_score(lemmed(df['keyphrases'][0]),lemmed(final_res) )

Precision: 0.3 	 Reacll: 0.375 	 F1 Score: 0.33333333333333326


0.33333333333333326

In [225]:
partial_f1_score(lemmed(df['keyphrases'][0]),lemmed(final_res) )

0.7555555555555555

In [226]:
df

Unnamed: 0,id,title,abstract,keyphrases,prmu
0,761,Towards a NMR implementation of a quantum latt...,recent theoretical results suggest that an arr...,"[nmr implementation, quantum lattice gas algor...","[P, P, P, P, P, P, P, M]"
1,724,Banking on SMA funds [separately managed accou...,from investment management to technology to ba...,"[separately managed accounts, investment manag...","[P, P, P, P, P, P]"
2,1371,Design methodology for diagnostic strategies f...,this paper presents a method for the construct...,"[design methodology, diagnostic strategies, in...","[P, P, P, P, P]"
3,1334,A shy invariant of graphs,moving from a well known result of p.l. hammer...,"[graph invariant, induced odd cycles, minimum ...","[P, P, P, P, P, P]"
4,1419,PacketVideo. One step ahead of the streaming w...,"go beyond the hype, however, and it's clear th...","[packetvideo, wireless devices, mpeg-4, wirele...","[P, P, P, P, P, P, P, R]"
...,...,...,...,...,...
995,1124,Data extraction from the Web based on pre-defi...,"with the development of the internet, the worl...","[data extraction, schema, internet, informatio...","[P, P, P, P, P, R, M, U, U]"
996,118,Sensorless control of induction motor drives,controlled induction motor drives without mech...,"[sensorless control, induction motor drives, r...","[P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, ..."
997,635,Detection and estimation of abrupt changes in ...,detection of change-points in normal means is ...,"[generalized likelihood ratio test statistic, ...","[P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, ..."
998,1260,A dataflow computer which accelerates executio...,"in the dataflow machine, it is important to av...","[dataflow computer, sequential programs, prece...","[P, P, P, P, P, P, P, P, P, R, M, M]"


In [231]:
def extract(text, number=10):
    def get_sequential_tokens(doc, interested_pos, lower):
        # to get sequential tokens from the text
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                if token.pos_ in interested_pos:
                    if lower:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
                else:
                    selected_words.append("*")
            sentences.append(selected_words)
        return sentences
    
    def get_n_grams(sentences):
        # to get n_grams outta sequential tokens
        n_grams = []
        for sent in sentences:
            temp_list = []
            for word in sent:
                if word == '*':
                    if len(temp_list)!=0:
                        n_grams.append(temp_list)
                        temp_list = []
                else:
                    temp_list.append(word)
        return n_grams
        
    def find_longest_sequence_and_scores(n_grams, unigrams, keywords_score_dict):
        longest_sequences = []
        
        for n_gram in n_grams:
            current_sequence = []
            longest_sequence = []
            
            for word in n_gram:
                if word in unigrams:
                    current_sequence.append(word)
                    if len(current_sequence) > len(longest_sequence):
                        longest_sequence = current_sequence.copy()
                else:
                    current_sequence = []
            
            if len(longest_sequence)>0:
                longest_sequences.append(longest_sequence)
        
        longest_sequences = [' '.join(i) for i in sorted(longest_sequences, key=len, reverse=True)]
    
        final_scores = {}
        for seq in longest_sequences:
            score = 0
            for word in seq.split(" "):
                score += keywords_score_dict[word]
            final_scores[seq] = score
        
        return longest_sequences, final_scores
    
    def lemmed(my_list):
        lemmed_list = []
        for seq in my_list:
            lemmed_list.append(' '.join([token.lemma_ for token in nlp(seq)]))
        return lemmed_list

    
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    
    kw_model = KeyBERT()
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=100)
    
    all_words = [word for word,_ in keywords if _>0]
    keywords_score_dict = {}
    for word, score in keywords:
        if score>0:
            # print(f"{word}: {score}")
            keywords_score_dict[word]=score
    
    interested_pos = ['NOUN', 'PROPN', 'VERB', 'ADJ']
    sents = get_sequential_tokens(doc, interested_pos, lower=False)
    n_grams = get_n_grams(sents)

    res, scores = find_longest_sequence_and_scores(n_grams, all_words, keywords_score_dict)
    scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
    
    final_res = list(scores.keys())[:number]

    return lemmed(final_res)

In [233]:
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

# Assuming 'lemmed' and 'extract' are your functions for lemmatization and keyphrase extraction.
# And assuming 'df' is your DataFrame with columns 'keyphrases' and 'abstract'.

# Apply the 'lemmed' function to the 'keyphrases' column with a progress bar
df['true'] = df['keyphrases'].progress_apply(lemmed)

# Apply the 'extract' function to the 'abstract' column with a progress bar
df['preds'] = df['abstract'].progress_apply(extract)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [234]:
df.to_csv("t5.csv")

In [239]:
pred = list(df['preds'])
true_values = list(df['true'])

total_f1 = 0
partial_f1 = 0
for y,y_hat in zip(true_values, pred):
    total_f1 += calculate_f1_score(y,y_hat)
    partial_f1 += partial_f1_score(y,y_hat)
print(f"Exact F1: {total_f1/len(pred)}")
print(f"Partial F1:{partial_f1/len(pred)}")

Precision: 0.1 	 Reacll: 0.125 	 F1 Score: 0.11111111111111112
Precision: 0.4 	 Reacll: 0.6666666666666666 	 F1 Score: 0.5
Precision: 0.1 	 Reacll: 0.2 	 F1 Score: 0.13333333333333333
Precision: 0.2 	 Reacll: 0.3333333333333333 	 F1 Score: 0.25
Precision: 0.5 	 Reacll: 0.625 	 F1 Score: 0.5555555555555556
Precision: 0.2 	 Reacll: 0.16666666666666666 	 F1 Score: 0.1818181818181818
Precision: 0.1 	 Reacll: 0.2 	 F1 Score: 0.13333333333333333
Precision: 0.2 	 Reacll: 0.25 	 F1 Score: 0.22222222222222224
Precision: 0.4 	 Reacll: 0.2 	 F1 Score: 0.26666666666666666
Precision: 0.4 	 Reacll: 0.2857142857142857 	 F1 Score: 0.3333333333333333
Precision: 0.0 	 Reacll: 0.0 	 F1 Score: 0
Precision: 0.1 	 Reacll: 0.25 	 F1 Score: 0.14285714285714288
Precision: 0.2 	 Reacll: 0.125 	 F1 Score: 0.15384615384615385
Precision: 0.3 	 Reacll: 0.5 	 F1 Score: 0.37499999999999994
Precision: 0.1 	 Reacll: 0.25 	 F1 Score: 0.14285714285714288
Precision: 0.0 	 Reacll: 0.0 	 F1 Score: 0
Precision: 0.4 	 Reacll:

In [241]:
df['true'][0]

['nmr implementation',
 'quantum lattice gas algorithm',
 'quantum information processor',
 'fluid dynamic problem',
 'diffusion equation',
 'nonlinear burger equation',
 'nuclear magnetic resonance',
 'two - qubit quantum information.processor']

In [242]:
df['preds'][0]

['qubit quantum information processor',
 'quantum information processor communicate',
 'quantum lattice',
 'ensemble nuclear magnetic resonance',
 'nmr implementation',
 'nmr technique',
 'nmr',
 'fluid dynamic',
 'gas algorithm',
 'initial mass density']

In [244]:
calculate_f1_score(df['true'][0], df['preds'][0])

Precision: 0.1 	 Reacll: 0.125 	 F1 Score: 0.11111111111111112


0.11111111111111112

In [248]:
test_df

Unnamed: 0,id,title,abstract,keyphrases,prmu
0,2007,The creation of a high-fidelity finite element...,A detailed finite element model of the human k...,"[high-fidelity finite element model, kidney, t...","[P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, ..."
1,2042,Hybrid simulation of space plasmas: models wit...,"For pt.III. see Prikl. Mat. Informatika, MAKS ...","[hybrid simulation, space plasmas, massless fl...","[P, P, P, P, P, P, P, P, P, P, P, P, P]"
2,308,On-line Homework/Quiz/Exam applet: freely avai...,The Homework/Quiz/Exam applet is a freely avai...,"[freely available Java software, database conn...","[P, P, P, P, P, P, P, P, P, M, M, R, R, R, R, ..."
3,215,A conceptual framework for evaluation of infor...,The decision to acquire a new information tech...,"[information technology investments, technolog...","[P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, ..."
4,250,Aim for the enterprise: Microsoft Project 2002,"A long-time favorite of project managers, Micr...","[Microsoft Project 2002, Web-based collaborati...","[P, P, P, P, P, P, R]"
...,...,...,...,...,...
495,341,How should team captains order golfers on the ...,I used game theory to examine how team captain...,"[game theory, slate, golf, golfer ordering, Ry...","[P, P, U, R, R]"
496,219,Firewall card shields data,The SlotShield 3000 firewall on a PCI card sav...,"[SlotShield 3000 firewall, PCI card, security,...","[P, P, P, P]"
497,1967,Modeling daily realized futures volatility wit...,"Using singular spectrum analysis (SSA), we mod...","[daily realized futures volatility, singular s...","[P, P, P, P, P, P, P, P, P, P, M, U, M]"
498,2116,Optimization of the characteristics of computa...,The scalableness of resources is taken to mean...,"[computational processes, scalable resources, ...","[P, P, P, P, P, P, P, P, P, P, M]"
