In [1]:
# import necessary packages
import pandas as pd
import numpy as np
import re
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as en_STOP_WORDS
from spacy.lang.nl.stop_words import STOP_WORDS as nl_STOP_WORDS
import en_core_web_sm
import nl_core_news_sm
import collections

ensp = en_core_web_sm.load()
nlsp = nl_core_news_sm.load()
ensp_singlewords = en_core_web_sm.load(disable=['parser', 'ner']) # just keep tagger for lemmatization for English
nlsp_singlewords = nl_core_news_sm.load(disable=['parser', 'ner']) # just keep tagger for lemmatization for Dutch

en_stopwords = ensp.Defaults.stop_words
nl_stopwords = nlsp.Defaults.stop_words

In [2]:
# read in and preprocess data
data = pd.read_csv("data.csv")
df = data.copy()

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,project_id,english_keywords,dutch_keywords,abstract_id,english_abstract,dutch_abstract,data_provider_id,data_provider_name,Unnamed: 8
0,0,6fa0f7de-4502-4995-92ae-5467e49df1b3,Ion channels;Positive allosteric modulators;Cr...,,148096734,Nicotinic acetylcholine receptors (nAChRs) are...,Nicotine-acetylcholinereceptoren (nAChR's) zij...,54937891,KULeuven,
1,1,50a0f20c-331a-41d9-a1ce-24cf21657a48,Entrepreneurial orientation;Strategy;Environme...,,148089590,"In today's intense global competition, and a r...","In today's intense global competition, and a r...",52804525,KULeuven,
2,2,0ecff36a-9401-431f-a1e9-185c61e89749,breast cancer;,,147893558,Increased body mass index (BMI) has been recog...,Een verhoogd â€˜body mass index' (BMI) is een ...,54933510,KULeuven,
3,3,43aba49c-eb93-4905-863d-1452b5f9fe80,affording;,,147604762,Interactive alignment (i.e. copying behaviour ...,Eerder onderzoek heeft aangetoond dat kopieged...,54926224,KULeuven,
4,4,716b1c6d-e7c2-43d9-994d-b2d9aeccdd24,metrology;,,147604718,In this project research into the topographic ...,In dit project zal onderzoek naar de topografi...,54926170,KULeuven,


In [4]:
# preprocesses the keywords
for index, row in enumerate(dataframe["english_keywords"]):
    if pd.isna(dataframe.iloc[index]["english_keywords"]) is False:
        dataframe.at[index,"english_keywords"] = re.findall(r"[\w]+", dataframe.iloc[index]["english_keywords"].lower())

In [57]:
# Preprocessing the abstract WITH lemmatization
def preprocess_abstract(text, langTag):
    """function preprocess_abstract : preprocesses a string (e.g. an abstract)

   Args:
       text (string): a string
       langTag (string): the language of the words in wordList and the synonyms (e.g. 'eng' for English and 'nld' for Dutch)

   Returns:
       list: {dictionary of terms with their frequency relative to the total amount of words}
   """
    abstract = ensp(text.translate(str.maketrans('', '', string.punctuation)))
    tokens = [word.lemma_ for word in abstract if not word.lemma_ in en_stopwords and len(word) > 1]
    return tokens

In [58]:
# calculates the term frequency
def term_frequency(tokens):
    """function term_frequency : calculates the frequencies of terms from a list of words

   Args:
       tokens ([list of strings]): a list of strings (e.g. single words from an abstract)

   Returns:
       list: {dictionary of terms with their frequency relative to the total amount of words}
   """
    counter = collections.Counter()
    tf_dict = counter.copy()
    for word in tokens:
        counter[word] += 1
    for word in counter:
        tf_dict[word] = counter.get(word)/len(tokens)
    return tf_dict

In [19]:
idf_dict = collections.Counter()
for index, row in enumerate(df["english_keywords"]):
    if df.isna().iloc[index]["english_keywords"] == False:
        for word in df.iloc[index]["english_keywords"]:
            lemma = " ".join([word.lemma_ for word in ensp_singlewords(word)])
            idf_dict[lemma] += 1

In [22]:
for term in tf_dict:
    if term in idf_dict:
        print(term)

disease
brain
cognitive
develop
medical
structure
protein
function
development
new
drug
human
design
high
positive
allosteric
modulator
potential


In [61]:
for index, row in df.iterrows():
    #print(row["english_abstract"])
    possible_keywords = preprocess_abstract(row["english_abstract"], None)
    #print(possible_keywords)
    keyword_frequencies = term_frequency(possible_keywords)
    df[row["Possible Keywords"]] = keyword_frequencies

KeyError: 'Possible Keywords'

In [62]:
df.head()

Unnamed: 0.1,Unnamed: 0,project_id,english_keywords,dutch_keywords,abstract_id,english_abstract,dutch_abstract,data_provider_id,data_provider_name,Unnamed: 8
0,0,6fa0f7de-4502-4995-92ae-5467e49df1b3,"[ion, channels, positive, allosteric, modulato...",,148096734,Nicotinic acetylcholine receptors (nAChRs) are...,Nicotine-acetylcholinereceptoren (nAChR's) zij...,54937891,KULeuven,
1,1,50a0f20c-331a-41d9-a1ce-24cf21657a48,"[entrepreneurial, orientation, strategy, envir...",,148089590,"In today's intense global competition, and a r...","In today's intense global competition, and a r...",52804525,KULeuven,
2,2,0ecff36a-9401-431f-a1e9-185c61e89749,"[breast, cancer]",,147893558,Increased body mass index (BMI) has been recog...,Een verhoogd â€˜body mass index' (BMI) is een ...,54933510,KULeuven,
3,3,43aba49c-eb93-4905-863d-1452b5f9fe80,[affording],,147604762,Interactive alignment (i.e. copying behaviour ...,Eerder onderzoek heeft aangetoond dat kopieged...,54926224,KULeuven,
4,4,716b1c6d-e7c2-43d9-994d-b2d9aeccdd24,[metrology],,147604718,In this project research into the topographic ...,In dit project zal onderzoek naar de topografi...,54926170,KULeuven,
