## Function for preprocessing

In [1]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):

    # Tokenize and lower case
    tokens = word_tokenize(text.lower())

    # Remove punctuation and numbers and filter out stopwords, then lemmatize
    processed_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in string.punctuation and not re.search(r'\d', token) and token not in stop_words]
    
    # Allow only alphabetical values
    processed_tokens = [token for token in processed_tokens if token.isalpha()]

    # Create a list of sentences
    sentences = [processed_tokens]

    # Learn and apply bigram transformations
    bigram_phrases = Phrases(sentences, min_count=1, threshold=1)
    bigram = Phraser(bigram_phrases)
    bigram_tokens = bigram[sentences[0]]

    # Learn and apply trigram transformations
    trigram_phrases = Phrases(bigram_tokens, min_count=1, threshold=1)
    trigram = Phraser(trigram_phrases)
    trigram_tokens = trigram[bigram_tokens]

    return trigram_tokens


## Helper functions for the TF-IDF Calculation

In [2]:
from collections import defaultdict
import math

def compute_idf(documents):
    idf = defaultdict(int)
    total_docs = len(documents)

    # Calculate the amount of documents each word occurs in
    for doc in documents:
        for word in set(doc):
            idf[word] += 1
            
    # Calculate the idf values
    for word, val in idf.items():
        idf[word] = math.log(total_docs / float(val))
    
    return idf

In [3]:
from collections import Counter

def compute_tf(tokens):
    # Count the occurences of every token
    return Counter(tokens)

In [4]:
def compute_tfidf(tf, idf):
    tfidf = {}
    # Compute the tf-tdf values form the given tf and idf values by multiplying them
    for word, val in tf.items():
        tfidf[word] = val * idf[word]
    return tfidf

## Preparations for the preprocessing and TF-IDF calculations

In [5]:
import sqlite3

# Connect to the database
conn = sqlite3.connect('./databases/publications-database.db')

# Enable foreign key support
conn.execute("PRAGMA foreign_keys = ON")

# Create a cursor object to execute SQL commands
cursor = conn.cursor()

cursor.execute('''
    SELECT * FROM Publication
''')

# saving the 
publications = cursor.fetchall()

corpus_titles = []
corpus_abstracts = []

for publication in publications:
    publication_title = publication[1]
    publication_abstract = publication[2]

    # Adding the titles and the abstracts to the corpora
    corpus_titles.append(publication_title)
    corpus_abstracts.append(publication_abstract)

# Close the database connection
conn.close()

## Calculating the TF-IDF values

In [6]:
# Preprocessing the texts

abstracts_tokens = [preprocess_text(text) for text in corpus_abstracts]
titles_tokens = [preprocess_text(text) for text in corpus_titles]

In [7]:
# Computing the tf values for the titles and the absrtacts

abstracts_tf = [compute_tf(tokens) for tokens in abstracts_tokens]
titles_tf = [compute_tf(tokens) for tokens in titles_tokens]

In [8]:
# Computing the idf values for the titles and the absrtacts

abstracts_idf = compute_idf(abstracts_tokens)
titles_idf = compute_idf(titles_tokens)

In [9]:
# Computing the tf-idf values for the titles and the absrtacts

abstracts_tfidf = [compute_tfidf(tf, abstracts_idf) for tf in abstracts_tf]
titles_tfidf = [compute_tfidf(tf, titles_idf) for tf in titles_tf]

### Checking the results

In [10]:
print("Sorted tf-idf values of the first absrtact: ", dict(sorted(abstracts_tfidf[0].items(), key=lambda x: x[1], reverse=True)))
print("Amount of tf-idf values that exist for the first abstract: ", len(abstracts_tfidf[0]))

Sorted tf-idf values of the first absrtact:  {'shear_zone': 48.96310948486503, 'colloid': 28.247623835237583, 'transport': 16.38858162445567, 'groundwater_flow': 16.32103649495501, 'uranine': 16.32103649495501, 'hydraulic': 15.347987429262247, 'tracer_test': 14.934742133835119, 'cfm': 14.123811917618792, 'fairly': 12.162153411595337, 'migration': 11.191137780031937, 'modelling': 10.519673691959945, 'numerical': 10.01810574663061, 'planned': 9.883284845218608, 'grimsel': 8.160518247477505, 'darcy': 8.160518247477505, 'packer': 8.160518247477505, 'flow': 7.834326487746855, 'bentonite': 7.4673710669175595, 'sealed': 7.4673710669175595, 'gradient': 7.29931748192131, 'breakthrough': 6.551080335043404, 'project': 6.409382379752489, 'somewhat': 6.36875877824945, 'field': 6.367941982666102, 'fractured': 6.081076705797669, 'hole': 5.762622974679134, 'radionuclides': 5.595568890015969, 'facilitated': 5.521460917862246, 'dipole': 5.521460917862246, 'circular': 5.27014648958134, 'confidence': 5.16

In [11]:
print("Sorted tf-idf values of the 3050th title: ", dict(sorted(titles_tfidf[3050].items(), key=lambda x: x[1], reverse=True)))
print("Amount of tf-idf values that exist for the 3050th title: ", len(titles_tfidf[3050]))

Sorted tf-idf values of the 3050th title:  {'hypersurface': 8.160518247477505, 'random': 5.762622974679134, 'extended': 5.521460917862246, 'tracking': 5.387929525237724, 'object': 5.1647859739235145, 'model': 2.713780875811195}
Amount of tf-idf values that exist for the 3050th title:  6


## Helper functions and preparations needed for the ontology creation

In [12]:
from itertools import islice

def take_first_percentages(dictionary, percentages_to_take):
    """Returns the first (len(dictionary) * percentages_to_take) entries of the dictionary. The amount is rounded up.

    Args:
        dictionary: The input dictionary to cut.
        percentages_to_take (float): Should be a number from [0,1]. The percentages of how many entries should be returned.

    Returns:
        dictionary: A dictionary that contains the first (len(dictionary) * percentages_to_take) entries of the input dictionary.
    """
    num_entries = len(dictionary)
    num_to_take = math.ceil(num_entries * percentages_to_take)  # Calculate the number of entries to take, round up

    first_percent = dict(islice(dictionary.items(), num_to_take))
    return first_percent


In [13]:
# List of all chemical elements (removed lead)
chemical_elements = [
    "Hydrogen", "Helium", "Lithium", "Beryllium", "Boron", "Carbon", "Nitrogen",
    "Oxygen", "Fluorine", "Neon", "Sodium", "Magnesium", "Aluminum", "Silicon",
    "Phosphorus", "Sulfur", "Chlorine", "Argon", "Potassium", "Calcium", "Scandium",
    "Titanium", "Vanadium", "Chromium", "Manganese", "Iron", "Cobalt", "Nickel",
    "Copper", "Zinc", "Gallium", "Germanium", "Arsenic", "Selenium", "Bromine",
    "Krypton", "Rubidium", "Strontium", "Yttrium", "Zirconium", "Niobium", "Molybdenum",
    "Technetium", "Ruthenium", "Rhodium", "Palladium", "Silver", "Cadmium", "Indium",
    "Tin", "Antimony", "Tellurium", "Iodine", "Xenon", "Cesium", "Barium", "Lanthanum",
    "Cerium", "Praseodymium", "Neodymium", "Promethium", "Samarium", "Europium",
    "Gadolinium", "Terbium", "Dysprosium", "Holmium", "Erbium", "Thulium", "Ytterbium",
    "Lutetium", "Hafnium", "Tantalum", "Tungsten", "Rhenium", "Osmium", "Iridium",
    "Platinum", "Gold", "Mercury", "Thallium", "Bismuth", "Polonium",
    "Astatine", "Radon", "Francium", "Radium", "Actinium", "Thorium", "Protactinium",
    "Uranium", "Neptunium", "Plutonium", "Americium", "Curium", "Berkelium",
    "Californium", "Einsteinium", "Fermium", "Mendelevium", "Nobelium", "Lawrencium",
    "Rutherfordium", "Dubnium", "Seaborgium", "Bohrium", "Hassium", "Meitnerium",
    "Darmstadtium", "Roentgenium", "Copernicium", "Nihonium", "Flerovium",
    "Moscovium", "Livermorium", "Tennessine", "Oganesson"
]

In [14]:
import csv

chemistry_csv = 'databases/chemistry.csv'
computer_science_csv = 'databases/computer-science.csv'

chemistry_competencies_list = []
computer_science_competencies_list = []

# Reading all predefined competencies from the csv files
# Predefined chemistry competencies
with open(chemistry_csv, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        chemistry_competencies_list.append(row)
# Predefined computer science competencies
with open(computer_science_csv, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        computer_science_competencies_list.append(row)

# Saving competencies into lists and adding all chemical elements
chemistry_competencies = chemistry_competencies_list[0] + chemical_elements
computer_science_competencies = computer_science_competencies_list[0]

# Lowercase all competencies
chemistry_competencies = [element.lower() for element in chemistry_competencies]
computer_science_competencies = [element.lower() for element in computer_science_competencies]

# Function to return the competencies that are in a given text of a publication
def get_competencies(publication_text):
    competencies = []
    
    # Search for all predefined competencies, ensuring that they do not only occur as a part of a word 

    for competency in chemistry_competencies:
        if re.search(r'\b' + re.escape(competency) + r'\b', publication_text):
            competencies.append(competency)
    
    for competency in computer_science_competencies:
        if re.search(r'\b' + re.escape(competency) + r'\b', publication_text):
            competencies.append(competency)
    
    return competencies

## Functions to add instances and relations to the ontology

### Creating the ontology

In [15]:
from rdflib import Graph, Literal, Namespace, RDF, URIRef

# Creating an RDF graph and namespace
g = Graph()
n  = Namespace("urn:semantic_search:")

# Creating Classes of the ontology
class_author = URIRef(n + "Author")
class_publication = URIRef(n + "Publication")
class_competency = URIRef(n + "Competency")

# Creating Relations for the classes
written_by = URIRef(n + "Written_by")


### Classes

In [16]:
# function to add a new publication, it is checked if the publication already exists, returns the publication in the ontology
def add_publication(graph, publication_id, publication_title, publication_abstract, publication_year):
    new_publication = URIRef(n + f"Publication:{publication_id}") 
    # Check if publication already exists in the graph
    if (new_publication, RDF.type, class_publication) in graph:
        return new_publication
    else:
        # Add the publication, it's title and year of publication
        graph.add((new_publication, RDF.type, class_publication))
        graph.add((new_publication, URIRef(n + "publications:title"), Literal(publication_title)))
        graph.add((new_publication, URIRef(n + "publications:year"), Literal(publication_year)))
        return new_publication


In [17]:
# function to add a new author, it is checked if the author already exists, returns the author in the ontology
def add_author(graph, author_id, first_name, last_name):
    new_author = URIRef(n + f"Author:{author_id}") 
    # Check if author already exists in the graph
    if (new_author, RDF.type, class_author) in graph:
        return new_author
    else:
        # Add the author, his first and his last name
        graph.add((new_author, RDF.type, class_author))
        graph.add((new_author, URIRef(n + "authors:firstName"), Literal(first_name)))
        graph.add((new_author, URIRef(n + "authors:lastName"), Literal(last_name)))
        return new_author


In [18]:
# function to add a new competency, it is checked if the competency already exists, returns the competency in the ontology
def add_competency(graph, competency_name):
    competency_name = competency_name.replace(" ", "_")
    new_competency = URIRef(n + f"Competency:{competency_name}") 
    # Check if competency already exists in the graph
    if (new_competency, RDF.type, class_competency) in graph:
        return new_competency
    else:
        # Add the competency and it's name
        graph.add((new_competency, RDF.type, class_competency))
        graph.add((new_competency, URIRef(n + "competencies:name"), Literal(competency_name)))
        return new_competency


### Relations

In [19]:
# function to add a written_by relation
def add_written_by(graph, author_id, publication_id):
    author = URIRef(n + f"Author:{author_id}") 
    publication = URIRef(n + f"Publication:{publication_id}")
    
    # Ensure that both the author and publication exist in the graph
    if (author, RDF.type, class_author) in graph and (publication, RDF.type, class_publication) in graph:
        # Add the written_by relation
        graph.add((publication, written_by, author))
        return True
    else:
        raise IOError("Author or publication do not exist")

In [20]:
# function to add a derived_from relation
def add_extraction(graph, publication_id, competency_name, certainty):
    publication = URIRef(n + f"Publication:{publication_id}")
    competency = URIRef(n + f"Competency:{competency_name}")

    # Create a derived_from relation, that has the certainty saved
    new_dynamic_derived_from = URIRef(n + f"Extraction:{publication_id}-{competency_name}-{certainty}")

    assert 0 <= certainty <= 1

    # Ensure that both the publication and competency exist in the graph before creating the relationship
    if (publication, RDF.type, class_publication) in graph and (competency, RDF.type, class_competency) in graph:
        
        # Searching for a potentially existing extraction with of the same competency from the same publication
        query_str = f"""
            SELECT ?predicate WHERE {{
                <{competency}> ?predicate <{publication}> .
                FILTER(STRSTARTS(STR(?predicate), "{n}Extraction:{publication_id}-{competency_name}-"))
            }}
            """

        old_dynamic_derived_from = list(graph.query(query_str))

        # Check if extraction already exists in the graph
        if old_dynamic_derived_from:
            old_dynamic_derived_from = old_dynamic_derived_from[0][0]
            old_certainty = float(str(old_dynamic_derived_from).rsplit('-', 1)[-1])  # Extract the old certainty from the URI

            # Check if the new certainty of the extraction is higher than the old one
            if old_certainty < int(certainty):
                # The extraction is updated with the new higher certainty
                graph.remove((competency, old_dynamic_derived_from, publication))
                graph.add((competency, new_dynamic_derived_from, publication))
                return new_dynamic_derived_from
            else:
                # The old certainty is higher and therefore the extraction is unchanged
                return old_dynamic_derived_from
        else:
            # Add new extraction to the graph
            graph.add((competency, new_dynamic_derived_from, publication))
            return new_dynamic_derived_from
    else:
        raise IOError("Publication or Competency do not exist")


### Adding the publications, authors and competencies to the ontology

### All Publications

In [21]:
import sqlite3
import re
from tqdm import tqdm

ALLOWED_COMPETENCY_PATTERN = r'^[A-Za-z_]+$'
DEFAULT_TITLE_CERTAINTY = 0.7
DEFAULT_ABSTRACT_CERTAINTY = 0.2

# Creating an RDF graph and namespace
g = Graph()

# Preparation for result checking
csv_comps = []
tf_idf_comps_abstracts = []
tf_idf_comps_titles = []

# Creating the ontology from the SQL database of publications
# Connect to the database
conn = sqlite3.connect('databases/publications-database.db')

# Enable foreign key support
conn.execute("PRAGMA foreign_keys = ON")

# Create a cursor object to execute SQL commands
cursor = conn.cursor()

cursor.execute('''
    SELECT * FROM Publication
''')

# Load all publicaitons
publications = cursor.fetchall()

for publication in tqdm(publications):
    # Assign all properties of the publication to readable variables
    publication_id = publication[0]
    publication_title = publication[1]
    publication_abstract = publication[2]
    publication_year = publication[3]

    # Add the publication to the ontology
    publication_onto = add_publication(g, publication_id, publication_title, publication_abstract, publication_year)

    # Handling the authors of the publication

    cursor.execute('''
        SELECT Author.author_id, first_name, last_name FROM
                    Author JOIN written_by ON written_by.author_id = Author.author_id
                   WHERE written_by.publication_id = ?
                   ''', [publication_id])
    
    # Load all authors of the current publication
    authors = cursor.fetchall()

    for author in authors:
        author_id = author[0]
        author_first_name = author[1]
        author_last_name = author[2]

        # Add the author to the ontology
        author_onto = add_author(g, author_id, author_first_name, author_last_name)
        # Add the written_by relation to the ontology
        written_by_onto = add_written_by(g, author_id, publication_id)

    # Handling all the competencies the publications contains:

    # Handling the predifined competencies for the titles and the abstracts separately
    
    # Load all predefined competencies in the abstract of the current publication
    competencies_csv_abstracts = get_competencies(publication_abstract)
    csv_comps = csv_comps + competencies_csv_abstracts # For result checking afterwards
    for abstract_competency in competencies_csv_abstracts:
        abstract_competency = abstract_competency.replace(" ", "_") # For uniform naming, bi- and trigrams are separated by underscores
        # Add the competency to the ontology
        competency_onto = add_competency(g, abstract_competency)
        # Add the extraction to the ontology, use the default certainty for absrtacts
        has_competency_onto = add_extraction(g, publication_id, abstract_competency, DEFAULT_ABSTRACT_CERTAINTY)

    # Load all predefined competencies in the title of the current publication
    competencies_csv_titles = get_competencies(publication_title)
    csv_comps = csv_comps + competencies_csv_titles # For result checking afterwards
    for title_competency in competencies_csv_titles:
        title_competency = title_competency.replace(" ", "_") # For uniform naming, bi- and trigrams are separated by underscores
        # Add the competency to the ontology
        competency_onto = add_competency(g, title_competency)
        # Add the extraction to the ontology, use the default certainty for titles
        has_competency_onto = add_extraction(g, publication_id, title_competency, DEFAULT_TITLE_CERTAINTY)


    # Handling the tf-idf competencies

    # Sorting the competencies from the abstracts by the TF-IDF value in descending order
    sorted_tf_idf_abstracts_dict = dict(sorted(abstracts_tfidf[publication_id].items(), key=lambda x: x[1], reverse=True))
    # The top 0.05 competencies from absrtacts are chosen
    tf_idf_competencies_abstract = take_first_percentages(sorted_tf_idf_abstracts_dict, 0.05) 

    # Sorting the competencies from the titles by the TF-IDF value in descending order
    sorted_tf_idf_titles_dict = dict(sorted(titles_tfidf[publication_id].items(), key=lambda x: x[1], reverse=True))
    # The top 0.1 competencies from titles are chosen
    tf_idf_competencies_titles = take_first_percentages(sorted_tf_idf_titles_dict, 0.1)

    # For result checking afterwards
    tf_idf_comps_abstracts = tf_idf_comps_abstracts + list(tf_idf_competencies_abstract.keys())
    tf_idf_comps_titles = tf_idf_comps_titles + list(tf_idf_competencies_titles.keys())

    # Add the competencies from the titles
    for competency in tf_idf_competencies_titles:
        competency = competency.replace(" ", "_")
        if re.match(ALLOWED_COMPETENCY_PATTERN, competency) is not None:
            # Add the competency to the ontology
            added_competency = add_competency(g, competency)
            # Add the extraction to the ontology, using certainty 1 for competencies from titles
            added_extraction = add_extraction(g, publication_id, competency, 1)
    
    # The tf-idf competencies from the abstracts are scaled

    # Determining the min and the max tf-idf value
    max_tf_idf_value = max(tf_idf_competencies_abstract.values())
    min_tf_idf_value = min(tf_idf_competencies_abstract.values())
    
    # Adding the competencies from the abstracts
    for competency in tf_idf_competencies_abstract:
        competence_tf_idf_value = tf_idf_competencies_abstract[competency]

        # Normalizing the TF-IDF values with the scaling, mapped into the space [0.5,1]
        if min_tf_idf_value != max_tf_idf_value:
            competency_certainty = (((competence_tf_idf_value - min_tf_idf_value) / (max_tf_idf_value - min_tf_idf_value)) * 0.5) + 0.5 
        else:
            # If there are no different tf-idf values all competences get certainty 1
            competency_certainty = 1
        
        competency = competency.replace(" ", "_") # For uniform naming, bi- and trigrams are separated by underscores

        # Only allow correctly formatted competencies to avoid errors from special characters
        if re.match(ALLOWED_COMPETENCY_PATTERN, competency) is not None:
            # Add the competency to the ontology
            added_competency = add_competency(g, competency)
            # Add the extraction to the ontology
            added_extraction = add_extraction(g, publication_id, competency, competency_certainty)


# Save the ontology to an XML file
g.serialize(destination='../ontology/semantic-search-ontology.rdf', format='xml')

# Close the database connection
conn.close()

100%|██████████| 3500/3500 [18:49<00:00,  3.10it/s]


## Result checking

In [22]:
from rdflib import Graph, Namespace

# Load the RDF graph from the ontology file
graph = Graph()
graph.parse('../ontology/semantic-search-ontology.rdf', format='xml')
n  = Namespace("urn:semantic_search:")

### Checking the extracted competencies

In [23]:
competencies_query = """
    PREFIX n: <urn:semantic_search:>
    SELECT ?competency_name WHERE {
        ?competency a n:Competency ;
                    n:competencies:name ?competency_name .
    }
"""

# Execute the SPARQL query
results = graph.query(competencies_query)

# Extract and print competency names from the query results
competency_names = [str(result[0]).replace("_", " ") for result in results]

extracted_elements = {}

for element in chemical_elements:
    if element.lower() in competency_names:
        if element in extracted_elements.keys():
            extracted_elements += 1
        else: extracted_elements[element] = 1

print("Extracted predefined from the chemical elements sorted by their occurances: ", dict(sorted(extracted_elements.items(), key=lambda x: x[1], reverse=True)))

Extracted predefined from the chemical elements sorted by their occurances:  {'Hydrogen': 1, 'Helium': 1, 'Lithium': 1, 'Beryllium': 1, 'Boron': 1, 'Carbon': 1, 'Nitrogen': 1, 'Oxygen': 1, 'Fluorine': 1, 'Neon': 1, 'Sodium': 1, 'Magnesium': 1, 'Aluminum': 1, 'Silicon': 1, 'Phosphorus': 1, 'Sulfur': 1, 'Chlorine': 1, 'Argon': 1, 'Potassium': 1, 'Calcium': 1, 'Titanium': 1, 'Vanadium': 1, 'Chromium': 1, 'Manganese': 1, 'Iron': 1, 'Cobalt': 1, 'Nickel': 1, 'Copper': 1, 'Zinc': 1, 'Gallium': 1, 'Germanium': 1, 'Arsenic': 1, 'Selenium': 1, 'Bromine': 1, 'Krypton': 1, 'Strontium': 1, 'Yttrium': 1, 'Zirconium': 1, 'Niobium': 1, 'Molybdenum': 1, 'Technetium': 1, 'Ruthenium': 1, 'Rhodium': 1, 'Palladium': 1, 'Silver': 1, 'Cadmium': 1, 'Indium': 1, 'Tin': 1, 'Antimony': 1, 'Iodine': 1, 'Cesium': 1, 'Europium': 1, 'Gadolinium': 1, 'Dysprosium': 1, 'Holmium': 1, 'Erbium': 1, 'Ytterbium': 1, 'Hafnium': 1, 'Tantalum': 1, 'Tungsten': 1, 'Platinum': 1, 'Gold': 1, 'Mercury': 1, 'Bismuth': 1, 'Polonium'

In [24]:
csv_dic = {}
for j in csv_comps:
    if j in csv_dic:
      csv_dic[j] +=1
    else:
      csv_dic[j] =1
print("Extracted predefined competencies sorted by their occurances: ", dict(sorted(csv_dic.items(), key=lambda x: x[1], reverse=True)))

Extracted predefined competencies sorted by their occurances:  {'energy': 538, 'interface': 260, 'temperature': 257, 'light': 245, 'mass': 239, 'software': 228, 'water': 224, 'algorithm': 207, 'frequency': 184, 'accuracy': 167, 'density': 157, 'gas': 156, 'reduction': 154, 'quantum': 152, 'electron': 145, 'heat': 144, 'volume': 141, 'hydrogen': 140, 'element': 137, 'uncertainty': 135, 'methodology': 133, 'pressure': 128, 'metal': 121, 'procedure': 117, 'spectroscopy': 116, 'group': 113, 'concentration': 113, 'carbon': 106, 'liquid': 105, 'solid': 101, 'radiation': 97, 'yield': 96, 'diffusion': 96, 'oxygen': 96, 'precision': 92, 'nuclear': 90, 'resonance': 89, 'computing': 88, 'primary': 80, 'deposition': 76, 'variable': 76, 'computer': 76, 'equilibrium': 73, 'acid': 72, 'period': 71, 'computation': 69, 'absorption': 65, 'crystal': 65, 'dispersion': 64, 'machine learning': 63, 'chemistry': 63, 'silicon': 63, 'extraction': 61, 'ion': 61, 'oxidation': 59, 'benchmark': 58, 'electrochemistr

In [25]:
tf_idf_dic_titles = {}
for j in tf_idf_comps_titles:
    if j in tf_idf_dic_titles:
      tf_idf_dic_titles[j] +=1
    else:
      tf_idf_dic_titles[j] =1
print("Extracted tf-idf competencies from the titles sorted by their occurances: ", dict(sorted(tf_idf_dic_titles.items(), key=lambda x: x[1], reverse=True)))



In [26]:
tf_idf_dic = {}
for j in tf_idf_comps_abstracts:
    if j in tf_idf_dic:
      tf_idf_dic[j] +=1
    else:
      tf_idf_dic[j] =1
print("Extracted tf-idf competencies from the abstracts sorted by their occurances: ", dict(sorted(tf_idf_dic.items(), key=lambda x: x[1], reverse=True)))

