In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import pandas as pd
import gc
import numpy as np
from ast import literal_eval
import rake
from itertools import combinations

import spacy
from scipy import spatial

import re
from inflection import singularize, pluralize
from dimcli.utils import *
from dimcli.utils.networkviz import NetworkViz # custom version of pyvis - colab-compatible

In [4]:
import networkx as nx

In [5]:
# embeddings = spacy.load('en_core_web_lg')

# List1 = embeddings('researchers').vector
# List2 = embeddings('researcher').vector
# result = 1 - spatial.distance.cosine(List1, List2)

In [6]:
def compare_lists(list1, reference_list):
    if len(list1)==0 or len(reference_list)==0:
        return None
    else:
        counter = 0
        for word in np.unique(list1):
            if word.lower() in np.unique(pd.Series(reference_list).str.lower()):
                counter=counter +1
        return counter/min(len(list1),len(reference_list))
                

In [7]:
def dimcli_keywords(abstract, nb_of_keywords = 10):
    res = dsl.query(f"""extract_concepts("{abstract}", return_scores=true)""")
    dimdf=pd.DataFrame(res['extracted_concepts'])
    if len(dimdf)>0:
        return list(dimdf['concept'].str.lower()[0:min(len(dimdf),nb_of_keywords)])
    else:
        return []

In [8]:
def dimcli_keywords_score(abstract, nb_of_keywords = 10):
    res = dsl.query(f"""extract_concepts("{abstract}", return_scores=true)""")
    dimdf=pd.DataFrame(res['extracted_concepts'])
    kws=[]
    for i in range(min(nb_of_keywords,len(dimdf))):
        kws.append((dimdf['concept'][i].lower(),dimdf['relevance'][i]))
    return kws



In [9]:
def rake_10(text):
    rake_object = rake.Rake("smartstopwords.txt", min_char_length=2, max_words_length=4)
    keywords = rake_object.run(text)

    keywords[0:10]
    kws=[]
    threecount=0
    twocount=0
    for (kw,score) in keywords:
        if len(kw.split())==3 and threecount<6:
            threecount+=1
            kws.append(kw)
        if len(kw.split())<3 and twocount<6:
            kws.append(kw)
            twocount+=1
    return kws

In [10]:
def rake_5_score_2(text):
    rake_object = rake.Rake("smartstopwords.txt", min_char_length=2, max_words_length=4)
    keywords = rake_object.run(text)

    keywords[0:10]
    kws=[]
    
    twocount=0
    for (kw,score) in keywords:
        if len(kw.split())<3 and twocount<6:
            kws.append((kw,score))
            twocount+=1
    return kws
def rake_5_score_3(text):
    rake_object = rake.Rake("smartstopwords.txt", min_char_length=2, max_words_length=4)
    keywords = rake_object.run(text)

    kws=[]
    
    threecount=0
    for (kw,score) in keywords:
        if len(kw.split())==3 and threecount<6:
            kws.append((kw,score))
            threecount+=1
    return kws

In [11]:
def select_with_diversity(ranking, lam, n=10):
    
    df = pd.DataFrame.from_records(ranking, columns=['kw','score'])
    df = df.sort_values(by=['score'], ascending=False)
    df.index= range(len(df))
    df['score'] = round(df['score']/df['score'][0],5)
    top=df.iloc[0,0]
    selected=[top]
    records = df.iloc[1:,:].to_records(index=False)
    rest = list(records)

    while len(selected)<n and len(rest)>0:
        top_score=-1
        for r in rest:
            (kw,score)=(r[0],r[1])
            max_sim=-1
            for s in selected:
                List1 = embeddings(str(s)).vector
                List2 = embeddings(str(kw)).vector
                result = 1 - spatial.distance.cosine(List1, List2)
                if result>max_sim:
                    max_sim=result
            current_score = lam*score - (1-lam)*max_sim
            if current_score>top_score:
                top_score = current_score
                candidate = r
        
        rest.remove(candidate)
        selected.append(candidate[0])
    return selected

In [12]:
def select_without_diversity(ranking, n=10):
    
    df = pd.DataFrame.from_records(ranking, columns=['kw','score'])
    df = df.sort_values(by=['score'], ascending=False)
    df.index= range(len(df))

    sorted_kws=list(df.iloc[:,0])
    selected=[]

    for kw in sorted_kws:    
        if len(selected) > n:
            break
        if kw.isnumeric():
            continue
        kw_removed= False
        for selected_kw in selected:           
            if kw in selected_kw:
                kw_removed = True
                break
        
        if not kw_removed:
            selected.append(kw)
    
    return selected


In [13]:
def replace_similars(reference_list,list_to_change):
    l1 = reference_list
    l2 = list_to_change
    kws1 = list(map(list, zip(*l1)))[0]
    for i in range(len(l2)):
        if singularize(l2[i][0]) in kws1:
            l2[i] = (singularize(l2[i][0]), l2[i][1])
    
        if  pluralize(l2[i][0]) in kws1:
            l2[i] = (pluralize(l2[i][0]), l2[i][1])
    return l2

a =[('paste',4), ('cards', 5), ('mice traps',9), ('mouse model',6)]
b =[('paste',8), ('card', 5), ('mouse traps',9), ('mouse models',6), ('cards',12)]
replace_similars(a,b)

[('paste', 8),
 ('cards', 5),
 ('mouse traps', 9),
 ('mouse model', 6),
 ('cards', 12)]

In [14]:
def ensemble(l1,l2,l3,l4):
    l2 = replace_similars(l1,l2)
    l3 = replace_similars(l1,l3)
    l4 = replace_similars(l1,l4)
    
    
    ranking=[]
    try:
        s1=list(map(list, zip(*l1)))[1]
        s1=list(2*np.array(s1)/np.mean(s1))
    except IndexError:
        s1=[]
    try:
        s2=list(map(list, zip(*l2)))[1]
        s2=list(2*np.array(s2)/np.mean(s2))
    except IndexError:
        s2=[]
    try:
        s3=list(map(list, zip(*l3)))[1]
        s3=list(np.array(s3)/np.mean(s3))
    except IndexError:
        s3=[]
    try:
        s4=list(map(list, zip(*l4)))[1]
        s4=list(np.array(s4)/np.mean(s4))
    except IndexError:
        s4=[]
    score_list=s1+s2+s3+s4
    all_kws=l1+l2+l3+l4
    kw_list=list(map(list, zip(*all_kws)))[0]
    
    for kw in np.unique(kw_list):        
        indices = [i for i, x in enumerate(kw_list) if x == kw ]
        score=0
        for ind in indices:
            score= score + score_list[ind]
        ranking.append((kw,score))
    return ranking        

In [15]:
# !pip install dimcli plotly -U --quiet

import dimcli
from dimcli.utils import *
import json
import sys
import pandas as pd
import plotly.express as px
if not 'google.colab' in sys.modules:
  # make js dependecies local / needed by html exports
  from plotly.offline import init_notebook_mode
  init_notebook_mode(connected=True)
#

print("==\nLogging in..")
# https://digital-science.github.io/dimcli/getting-started.html#authentication
ENDPOINT = "https://app.dimensions.ai"
if 'google.colab' in sys.modules:
  import getpass
  KEY = getpass.getpass(prompt='API Key: ')
  dimcli.login(key=KEY, endpoint=ENDPOINT)
else:
  KEY = "4F9C976CA7EA4CBB90505B0A4989DCF8"
  dimcli.login(key=KEY, endpoint=ENDPOINT)
dsl = dimcli.Dsl()

==
Logging in..
[2mDimcli - Dimensions API Client (v0.9.6)[0m
[2mConnected to: <https://app.dimensions.ai/api/dsl> - DSL v2.0[0m
[2mMethod: manual login[0m


In [16]:
#if you want to free up memory
#del df_countvect
# del df_tfidfvect
gc.collect()

0

## Filter df (bad abstracts) and do tf-idf

In [17]:


nlp = spacy.load("en_core_web_sm")
def singularize_text(text):
    
    # Load English tokenizer, tagger,
    # parser, NER and word vectors
    
    #     nlp = spacy.load("en_core_web_sm")

    # Process whole documents
    doc = nlp(text)

    # Token and Tag
    
    for token in doc:
        if token.pos_ == 'NOUN':
            text=re.sub(' ' + str(token) + '*([\n,.?!;\s])', ' ' + singularize(str(token)) + '\\1', text)
            
            
    return text
text= '''
During a stoppage in Sunday's Manchester derby win, substitute Fernandinho spoke with Kevin de Bruyne, so Pep Guardiola was asked if the Brazilian midfielder is now taking on more of a coaching role at the moment: "These things come from himself. Fernandinho and Kevin de Bruyne have been in all the meetings for the last six years so they know exactly what’s happening in every meeting before we go in. I love it when they talk to each other and find a solution."

On whether he sees Fernandinho like his former assistant coach Mikel Arteta but on the field: "In terms of both respect on and off the field, yes, definitely. I’m pretty sure he’s thinking about the club. He has everything to be a manager.

"I would love him to [stay]. I've no complaints when he has to play on a cold night against Peterborough [in the FA Cup] on a bad pitch. At the end of the season we have to decide what is best."'''
t= time.time()

print(singularize_text(text))
t2= time.time()
print(t2-t)
len(text)


During a stoppage in Sunday's Manchester derby win, substitute Fernandinho spoke with Kevin de Bruyne, so Pep Guardiola was asked if the Brazilian midfielder is now taking on more of a coaching role at the moment: "These thing come from himself. Fernandinho and Kevin de Bruyne have been in all the meeting for the last six year so they know exactly what’s happening in every meeting before we go in. I love it when they talk to each other and find a solution."

On whether he sees Fernandinho like his former assistant coach Mikel Arteta but on the field: "In term of both respect on and off the field, yes, definitely. I’m pretty sure he’s thinking about the club. He has everything to be a manager.

"I would love him to [stay]. I've no complaint when he has to play on a cold night against Peterborough [in the FA Cup] on a bad pitch. At the end of the season we have to decide what is best."
0.06781840324401855


902

In [18]:


def remove_multiple_strings(cur_string, replace_list, replacement = ' '):
    for cur_word in replace_list:
        cur_string = cur_string.replace(cur_word, replacement)
    return cur_string


In [39]:
pd.read_csv('abstractsandkeywords20192020.csv', index_col=0)

Unnamed: 0,proj_id,proj_title,abstract,titiabs,keywords
0,d6e5686b-f561-4207-9c85-d72920542b70,Use of beetroot juice to protect against posto...,p Postoperative ileus POI is a transient impa...,Use of beetroot juice to protect against posto...,"['colorectal surgery', 'ileus', 'beetroot juice']"
1,c8bd48f5-3b1b-4965-b797-ffbbb29c4691,Model Predictive Control of Fourth Generation ...,p Decarbonisation of the energy system result...,Model Predictive Control of Fourth Generation ...,"['thermal networks', 'model predictive control']"
2,4c0dd72c-9abc-4ff9-b541-e01d5259da7a,H2020 CARLA The European Photonics CAReer LAun...,Europe faces a situation in which the photonic...,H2020 CARLA The European Photonics CAReer LAun...,"['data storage', 'biophotonics', 'european uni..."
3,fcae1028-f9fe-48e4-aa44-b3c6a302d80d,Renewable Energy SOlutions for URban communiti...,p align left CHALLENGE p p align left The inc...,Renewable Energy SOlutions for URban communiti...,"['storage', 'renewable energy', 'flexibility',..."
4,808ddd2c-6465-43ba-ba61-0332bd2e9ff2,Lower Bounds for partial differential operator...,p align left The theory of partial differenti...,Lower Bounds for partial differential operator...,['differential operators']
...,...,...,...,...,...
7926,9ca0bf64-7a26-4844-82e3-cb7951563860,Large area capacitive MEMS for flexible electr...,p The purpose of this PhD is to design proces...,Large area capacitive MEMS for flexible electr...,"['mems', 'transducer', 'actuator', 'sensor', '..."
7927,502d2c4e-c127-4f83-8866-89f01d5397b6,Mycorrhizal inoculation for a more profitable ...,p Cassava is a crop with a lot of potentials ...,Mycorrhizal inoculation for a more profitable ...,"['cassava', 'amf', 'water use efficiency', 'st..."
7928,79c5adcf-65f5-4b96-a61e-3ebd388b1111,Cancer cells on the move the role of cellular ...,p In the 21st century cancer is expected to g...,Cancer cells on the move the role of cellular ...,['cell adhesion and migration mechanisms in ca...
7929,58666921-f7cb-422e-b606-d145e9b0905e,FABulous Farmers employ Functional Agrobiodive...,Main research question goal br p FABulous Farm...,FABulous Farmers employ Functional Agrobiodive...,[]


In [41]:
pd.read_csv('publicationsabstract.csv', index_col=0)

Unnamed: 0,pub_id,pub_title,abstract,titiabs
0,fb4c28d9-7796-471e-9d3c-d7b39ba1e81b,National identity predicts public health suppo...,,National identity predicts public health suppo...
1,258dbc7e-59dd-4029-b916-372e72c37a1e,,,. .
2,45766acb-ea4c-4956-a3d6-f55ee89971de,,,. .
3,bc2df0a6-f4ec-4dd9-a4e3-6a378bda17b1,Computational Analysis of 3D Cellular Forces w...,The field of tissue engineering as well as the...,Computational Analysis of 3D Cellular Forces w...
4,99e123b8-df3c-4dad-a1fb-e97d5cbbb4ad,Wat kost onderzoek en ontwikkeling van een med...,The cost of research and development R amp D f...,Wat kost onderzoek en ontwikkeling van een med...
...,...,...,...,...
3301,a0bb99a9-8590-49d1-b8e0-5a4d6d202393,,,. .
3302,10e7a759-b680-438f-a594-b3ffab4cbe4d,i Plasmodium malariae i after successful trea...,p We described a case of Plasmodium malariae ...,i Plasmodium malariae i after successful trea...
3303,e76ff76e-d27f-4c68-86ce-c75e219cd2f6,,,. .
3304,5669453a-e375-4b47-96c0-0f19b924a209,Oxidative Stress and Inflammation in Cardiovas...,High oxidative stress Th1 Th17 immune response...,Oxidative Stress and Inflammation in Cardiovas...


In [19]:
df=pd.read_csv('abstractsandkeywords20192020.csv', index_col=0)
filtered_df = df[df['abstract'].notnull()]
filtered_df.index=range(len(filtered_df))


rows_to_delete = []
for i in range(len(filtered_df)):
    if len(filtered_df['abstract'][i])<200:
        
        rows_to_delete.append(i)
filtered_df = filtered_df.drop(rows_to_delete) 
filtered_df.index=range(len(filtered_df))

filtered_df['titiabs']=filtered_df['titiabs'].str.lower()
replace_list = [' p ',' align left ', ' li li ', ' ul li ', ' ol li ', ' li ul ', ' li ol ']
filtered_df['titiabs']=filtered_df['titiabs'].apply(lambda x: remove_multiple_strings(x, replace_list, ' '))
print(len(filtered_df))
# filtered_df['titiabs1']=filtered_df['titiabs'].apply(lambda x: singularize_text(x))

filtered_df

7542


Unnamed: 0,proj_id,proj_title,abstract,titiabs,keywords
0,d6e5686b-f561-4207-9c85-d72920542b70,Use of beetroot juice to protect against posto...,p Postoperative ileus POI is a transient impa...,use of beetroot juice to protect against posto...,"['colorectal surgery', 'ileus', 'beetroot juice']"
1,c8bd48f5-3b1b-4965-b797-ffbbb29c4691,Model Predictive Control of Fourth Generation ...,p Decarbonisation of the energy system result...,model predictive control of fourth generation ...,"['thermal networks', 'model predictive control']"
2,4c0dd72c-9abc-4ff9-b541-e01d5259da7a,H2020 CARLA The European Photonics CAReer LAun...,Europe faces a situation in which the photonic...,h2020 carla the european photonics career laun...,"['data storage', 'biophotonics', 'european uni..."
3,fcae1028-f9fe-48e4-aa44-b3c6a302d80d,Renewable Energy SOlutions for URban communiti...,p align left CHALLENGE p p align left The inc...,renewable energy solutions for urban communiti...,"['storage', 'renewable energy', 'flexibility',..."
4,808ddd2c-6465-43ba-ba61-0332bd2e9ff2,Lower Bounds for partial differential operator...,p align left The theory of partial differenti...,lower bounds for partial differential operator...,['differential operators']
...,...,...,...,...,...
7537,9ca0bf64-7a26-4844-82e3-cb7951563860,Large area capacitive MEMS for flexible electr...,p The purpose of this PhD is to design proces...,large area capacitive mems for flexible electr...,"['mems', 'transducer', 'actuator', 'sensor', '..."
7538,502d2c4e-c127-4f83-8866-89f01d5397b6,Mycorrhizal inoculation for a more profitable ...,p Cassava is a crop with a lot of potentials ...,mycorrhizal inoculation for a more profitable ...,"['cassava', 'amf', 'water use efficiency', 'st..."
7539,79c5adcf-65f5-4b96-a61e-3ebd388b1111,Cancer cells on the move the role of cellular ...,p In the 21st century cancer is expected to g...,cancer cells on the move the role of cellular ...,['cell adhesion and migration mechanisms in ca...
7540,58666921-f7cb-422e-b606-d145e9b0905e,FABulous Farmers employ Functional Agrobiodive...,Main research question goal br p FABulous Farm...,fabulous farmers employ functional agrobiodive...,[]


In [20]:
train =filtered_df['titiabs']
# instantiate the vectorizer object

tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english', ngram_range = (1,2), min_df=2)
# convert the documents into a matrix

tfidf_wm = tfidfvectorizer.fit_transform(train)
#retrieve the terms found in the corpora
# if we take same parameters on both Classes(CountVectorizer and TfidfVectorizer) , it will give same output of get_feature_names() methods)
#count_tokens = tfidfvectorizer.get_feature_names() # no difference

tfidf_tokens = tfidfvectorizer.get_feature_names_out()

df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),index = filtered_df.index, columns = tfidf_tokens)

print("\nTD-IDF Vectorizer\n")
print(df_tfidfvect)


TD-IDF Vectorizer

       00  000  000 000  000 belgian  000 bof  000 bp  000 cases  000 deaths  \
0     0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   
1     0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   
2     0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   
3     0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   
4     0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   
...   ...  ...      ...          ...      ...     ...        ...         ...   
7537  0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   
7538  0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   
7539  0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   
7540  0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   
7541  0.0  0.0      0.0          0.0      0.0     0.0        0.0         0.0   

      000 different

## Generate keywords without score (rake, tf-idf, dimcli)

In [139]:
extracted=[]
# trunc_df=filtered_df.iloc[0:100,]
for i in range( len(trunc_df)):
    l=list(df_tfidfvect.iloc[i,:].sort_values(ascending=False)[0:10].index)
    extracted=extracted+ [l]
extracted
trunc_df['tfidf']=extracted
trunc_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,proj_id,proj_title,abstract,titiabs,keywords,tfidf + score,dimcli keywords + score,rake 2 + score,rake 3 + score,tfidf
0,d6e5686b-f561-4207-9c85-d72920542b70,Use of beetroot juice to protect against posto...,p Postoperative ileus POI is a transient impa...,use of beetroot juice to protect against posto...,"['colorectal surgery', 'ileus', 'beetroot juice']","[(beetroot juice, 0.3319857430680271), (beetro...","[(beetroot juice, 0.694), (postoperative ileus...","[(beetroot juice, 5.666666666666667), (colorec...","[(increased healthcare cost, 9.5), (oxidative ...","[beetroot juice, beetroot, juice, colorectal s..."
1,c8bd48f5-3b1b-4965-b797-ffbbb29c4691,Model Predictive Control of Fourth Generation ...,p Decarbonisation of the energy system result...,model predictive control of fourth generation ...,"['thermal networks', 'model predictive control']","[(networks, 0.30954756508561154), (dh, 0.26856...","[(model predictive control, 0.802), (predictiv...","[(dh networks, 5.75), (heating sector, 5.0), (...","[(energy system results, 9.333333333333334), (...","[networks, dh, 4gdh, dh networks, mpc, generat..."
2,4c0dd72c-9abc-4ff9-b541-e01d5259da7a,H2020 CARLA The European Photonics CAReer LAun...,Europe faces a situation in which the photonic...,h2020 carla the european photonics career laun...,"['data storage', 'biophotonics', 'european uni...","[(carla, 0.34687848630296225), (photonics, 0.2...","[(photonics industry, 0.149), (path, 0.115), (...","[(h2020 carla, 4.0), (europe faces, 4.0), (pho...",[],"[carla, photonics, european photonics, career ..."
3,fcae1028-f9fe-48e4-aa44-b3c6a302d80d,Renewable Energy SOlutions for URban communiti...,p align left CHALLENGE p p align left The inc...,renewable energy solutions for urban communiti...,"['storage', 'renewable energy', 'flexibility',...","[(align left, 0.3881504963491191), (align, 0.3...","[(renewable energy solutions, 0.709), (energy ...","[(renewable energy, 5.444444444444445), (align...","[(citizens amp children, 10.0), (circular econ...","[align left, align, left, energy, circular, dc..."
4,808ddd2c-6465-43ba-ba61-0332bd2e9ff2,Lower Bounds for partial differential operator...,p align left The theory of partial differenti...,lower bounds for partial differential operator...,['differential operators'],"[(differential operators, 0.3287138808680321),...","[(partial differential operators, 0.915), (com...","[(lower bounds, 4.5), (align left, 4.0), (impo...","[(partial differential operators, 9.4222222222...","[differential operators, partial differential,..."
...,...,...,...,...,...,...,...,...,...,...
95,14d30255-a8c7-4779-bbf4-7ba8df72e6bd,Single cell immune profiling to improve patien...,p Cancer immunotherapy using immune checkpoin...,single cell immune profiling to improve patien...,['single-cell transcriptome'],"[(icb, 0.4244016516670835), (single cell, 0.17...","[(cancer immunotherapy, 0.689), (immunotherapy...","[(advanced head, 4.5), (liquid biopsies, 4.0),...","[(improve patient stratification, 9.0), (durab...","[icb, single cell, immunotherapy, cancer immun..."
96,df6e0dfc-8b07-44ef-a451-999184a4ef4a,Impact of clinical guidance amp point of care ...,Most antibiotics are prescribed in ambulatory ...,impact of clinical guidance amp point of care ...,"['experimental study', 'infections']","[(crp, 0.2869971223784021), (point care, 0.245...","[(care test, 0.668), (crp point, 0.646), (ill ...","[(crp point, 5.2), (care test, 4.9285714285714...","[(validated clinical algorithm, 9.666666666666...","[crp, point care, aron, care, children, care c..."
97,223002ab-29ac-4d1b-9f31-1ab50efb1265,Exploiting microbiomes to control Phytophthora...,p Phytophthora cryptogea is an important oomy...,exploiting microbiomes to control phytophthora...,"['lettuce', 'beneficial microbiota', 'phytopht...","[(cryptogea, 0.4509118411892602), (lettuce, 0....","[(phytophthora cryptogea, 0.819), (cryptogea, ...","[(lettuce cultivation, 5.125), (hydroponic cul...","[(alternative control measurements, 9.0), (acc...","[cryptogea, lettuce, lettuce cultivation, hydr..."
98,fafe5398-cac9-4ce6-9812-f4e1b55a2477,Contract agreement between The Ministry of Tra...,The objective of this agreement is to establis...,contract agreement between the ministry of tra...,"['master', 'mobility', 'transport']","[(transportation sciences, 0.22073084723927083...","[(employees, 0.046), (program, 0.034), (mot, 0...","[(uh imob, 4.0), (distance learning, 4.0), (af...","[(federal democratic republic, 9.0), (transpor...","[transportation sciences, program distance, sc..."


In [468]:
a='6'
print(a.isnumeric())
float(a)

True


6.0

In [141]:
dim_kw=[]
rake_kw=[]
for i in range(len(trunc_df)):
    abstract=trunc_df['titiabs'][i]
    
    print(i)
    dim_kw.append(dimcli_keywords(abstract,10))
    rake_kw.append(rake_10(abstract))
trunc_df['dimcli keywords']=dim_kw
trunc_df['rake']=rake_kw

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Generate keywords plus score (rake2, rake3, tf-idf, dimcli)

In [21]:
extracted=[]
trunc_df=filtered_df.iloc[0:100,]
for i in range( len(trunc_df)):
    s=df_tfidfvect.iloc[i,:].sort_values(ascending=False)[0:12]
    tfidf10=[]
    for kw in s.index:
        tfidf10.append((kw,s[kw]))
    extracted=extracted+ [tfidf10]
trunc_df['tfidf + score']=extracted
trunc_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,proj_id,proj_title,abstract,titiabs,keywords,tfidf + score
0,d6e5686b-f561-4207-9c85-d72920542b70,Use of beetroot juice to protect against posto...,p Postoperative ileus POI is a transient impa...,use of beetroot juice to protect against posto...,"['colorectal surgery', 'ileus', 'beetroot juice']","[(beetroot juice, 0.3616689256201938), (beetro..."
1,c8bd48f5-3b1b-4965-b797-ffbbb29c4691,Model Predictive Control of Fourth Generation ...,p Decarbonisation of the energy system result...,model predictive control of fourth generation ...,"['thermal networks', 'model predictive control']","[(networks, 0.3989342237716239), (dh, 0.346113..."
2,4c0dd72c-9abc-4ff9-b541-e01d5259da7a,H2020 CARLA The European Photonics CAReer LAun...,Europe faces a situation in which the photonic...,h2020 carla the european photonics career laun...,"['data storage', 'biophotonics', 'european uni...","[(photonics, 0.4291152312482264), (launch, 0.3..."
3,fcae1028-f9fe-48e4-aa44-b3c6a302d80d,Renewable Energy SOlutions for URban communiti...,p align left CHALLENGE p p align left The inc...,renewable energy solutions for urban communiti...,"['storage', 'renewable energy', 'flexibility',...","[(energy, 0.3042056601368213), (circular, 0.25..."
4,808ddd2c-6465-43ba-ba61-0332bd2e9ff2,Lower Bounds for partial differential operator...,p align left The theory of partial differenti...,lower bounds for partial differential operator...,['differential operators'],"[(differential operators, 0.3964515844619687),..."
...,...,...,...,...,...,...
95,14d30255-a8c7-4779-bbf4-7ba8df72e6bd,Single cell immune profiling to improve patien...,p Cancer immunotherapy using immune checkpoin...,single cell immune profiling to improve patien...,['single-cell transcriptome'],"[(icb, 0.49370777467155125), (single cell, 0.1..."
96,df6e0dfc-8b07-44ef-a451-999184a4ef4a,Impact of clinical guidance amp point of care ...,Most antibiotics are prescribed in ambulatory ...,impact of clinical guidance amp point of care ...,"['experimental study', 'infections']","[(crp, 0.3508187355130432), (point care, 0.300..."
97,223002ab-29ac-4d1b-9f31-1ab50efb1265,Exploiting microbiomes to control Phytophthora...,p Phytophthora cryptogea is an important oomy...,exploiting microbiomes to control phytophthora...,"['lettuce', 'beneficial microbiota', 'phytopht...","[(lettuce, 0.5086114191164591), (hydroponic, 0..."
98,fafe5398-cac9-4ce6-9812-f4e1b55a2477,Contract agreement between The Ministry of Tra...,The objective of this agreement is to establis...,contract agreement between the ministry of tra...,"['master', 'mobility', 'transport']","[(ethiopia, 0.3274710443569126), (mot, 0.31924..."


In [22]:
dim_kw=[]
rake_kw2=[]
rake_kw3=[]
for i in range(len(trunc_df)):
    abstract=trunc_df['titiabs'][i]
    abstract1=trunc_df['titiabs'][i]
    print(i)
    dim_kw.append(dimcli_keywords_score(abstract, 12))
    rake_kw2.append(rake_5_score_2(abstract1))
    rake_kw3.append(rake_5_score_3(abstract1))
trunc_df['dimcli keywords + score']=dim_kw
trunc_df['rake 2 + score']=rake_kw2
trunc_df['rake 3 + score']=rake_kw3

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Put keywords together (ensemble of rake, tf-idf and dimcli)

In [24]:
trunc_df10=trunc_df.iloc[:10,:]

In [23]:
ens_kw=[]
ens_kw1=[]

for i in range(len(trunc_df)):
    l1=trunc_df['tfidf + score'][i]
    l2=trunc_df['dimcli keywords + score'][i]
    l3=trunc_df['rake 2 + score'][i]
    l4=trunc_df['rake 3 + score'][i]
    
    ranking=ensemble(l1,l2,l3,l4)

    final_keywords1=select_without_diversity(ranking, n=10)
    print(i)

    print(final_keywords1)
    

    ens_kw1.append(final_keywords1)
trunc_df['ensemble1 kws']= ens_kw1


0
['beetroot juice', 'colorectal surgery', 'postoperative ileus', 'protective effect', 'poi', 'randomized phase ii trial', 'pathogenesis of poi', 'laparoscopic colorectal surgery', 'ischemia-reperfusion injury', 'following colorectal', 'use beetroot']
1
['mpc', 'predictive control', 'networks', 'heating networks', 'model predictive control', 'dh', 'linear systems', 'dh network', 'generation district heating networks', 'energy systems', 'fourth generation']
2
['photonics industry', 'path', 'source', 'instrument', 'launch', 'europe faces', 'h2020', 'career', 'situation', 'prepared professionals', 'professionals support']
3
['grid', 'energy solutions', 'energy systems', 'circular economy policies', 'renewable energy', 'dc', 'renewable energy solutions', 'based circular', 'materials', 'self-sufficient energy system', 'second life batteries']
4
['differential operators', 'lie groups', 'lower bounds', 'partial differential operators', 'principal symbol', 'compact lie group', 'problem of solv

62
['membrane', 'protein structure', 'physiological conditions', 'membrane proteins', 'plasma membrane', 'membrane protein structures', 'lmrp', 'development of microdevices', 'novel microdevice', 'sub-nanometer resolution', 'screening']
63
['hemp', 'fiber production', 'sustainable intensification', 'industrial hemp', 'fiber characteristics', 'ecosystem services', 'microbial communities', 'microbes', 'community', 'sustainable intensive agriculture', 'hemp fiber']
64
['hydrogels', 'heart', 'stem cells', 'heart tissue', 'shear-thinning hydrogels', 'stem cell delivery', 'injectable hydrogels', 'hydrogel network', 'injectability', 'heart tissue regeneration', 'promising candidate']
65
['organic electronics', 'polymers', 'conjugated polymers', 'structural defects', 'commercialization', 'stille cross coupling', 'type conjugated polymers', 'push pull', 'building blocks', 'pull type', 'leverage commercialization']
66
['organic photovoltaics', 'transparent organic photovoltaics', 'organic molecu



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Check the co-occurences of keywords

In [20]:
df=pd.read_csv('abstractsandkeywords20102022.csv', index_col=0)
df = df[df['abstract'].notnull()]
df.index=range(len(df))
df['titiabs']=df['titiabs'].str.lower()
df

Unnamed: 0,proj_id,proj_title,abstract,titiabs,keywords
0,fb878181-1b81-4ca9-a59a-253f8685c5af,RRF Project FARI the Artificial Intelligence A...,FARI is the Artificial Intelligence for the Co...,rrf project fari the artificial intelligence a...,[]
1,d6e5686b-f561-4207-9c85-d72920542b70,Use of beetroot juice to protect against posto...,p Postoperative ileus POI is a transient impa...,use of beetroot juice to protect against posto...,"['colorectal surgery', 'ileus', 'beetroot juice']"
2,1d529337-ac2e-453b-91ee-fe0f4c493dc6,Adaptation of the SCIROCCO Tool and the 12 dim...,p Introduction p p The EU project Scirocco Ex...,adaptation of the scirocco tool and the 12 dim...,"['health policy', 'integrated care', 'health s..."
3,449ba8a5-3e75-44fe-b52e-5eaebf2f6077,LemnaPro,LemnaPro is een landbouw LA traject met als d...,lemnapro. lemnapro. lemnapro is een landbouw ...,[]
4,9f320fbb-0712-4b71-9d3b-1e2ce7178444,Integrated control of Phytophthora cryptogea i...,Phytophthora cryptogea or root rot is a common...,integrated control of phytophthora cryptogea i...,[]
...,...,...,...,...,...
35156,88ebd608-13be-4e43-b3f1-3a748917316c,Integration of imaging techniques for the quan...,Main research question goal br em In this stud...,integration of imaging techniques for the quan...,[]
35157,d74ab72b-0d9e-4709-be12-a462c67828d1,ReNu2Farm,The project aims at increasing recycling rates...,renu2farm. renu2farm. the project aims at incr...,[]
35158,358139c1-2ff4-4e16-ad54-ff7594000b73,Integrated control of the leaf thrips Thrips t...,During this project a realistic IPM strategy w...,integrated control of the leaf thrips thrips t...,[]
35159,42e3b5fa-a6f2-47e0-9b8b-1c33d23a366a,Sustainable control of cabbage fly,The main objective of this project is the impl...,sustainable control of cabbage fly. sustainabl...,[]


In [24]:
import nltk
def remove_punctuation(string): 
    # initializing punctuations string
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    # Removing punctuations in string
    # Using loop + punctuation string
    for ele in string:
        if ele in punc:
            string = string.replace(ele, "")
    return string
 


In [25]:
def keywords_in_doc_simple(docs, keywords):
    cooc=np.zeros((len(docs), len(keywords)))
    for i in range(len(docs)):
        for j in range( len(keywords)):
            abstract_without_punctuation = remove_punctuation(docs[i])
            if ' ' + keywords[j] + ' ' in abstract_without_punctuation:
                cooc[i,j] = 1
               

    return pd.DataFrame(cooc, columns=[keywords])

In [26]:
def mean_linkage_strength(cooc_df):
    keywords = cooc_df.columns.values.tolist()
    freq_indices=[]
    for i in range(len(cooc_df.sum(axis=0))):
        if cooc_df.sum(axis=0)[i]>5:
            freq_indices.append(i)
            
    comb_keywords=list(combinations(keywords,2))
    comb_indices=list(combinations(range(len(keywords)),2))
    similarities=[]
    for i in range(len(comb_indices)):
        indices=comb_indices[i]

        if indices[0] in freq_indices and indices[1] in freq_indices:
            cooccurrences=sum((cooc_df.iloc[:, indices[0]]+cooc_df.iloc[:, indices[1]])==2)
            similarity=cooccurrences/min(cooc_df.sum(axis=0)[indices[0]],cooc_df.sum(axis=0)[indices[1]])
            
            similarities.append(similarity)
    selected_elements=[]       
    for index in freq_indices:
        selected_elements.append(keywords[index])
    return np.mean(similarities), len(freq_indices), selected_elements

In [27]:
def pairwise_linkage_strength(cooc_df):
    keywords = cooc_df.columns.values.tolist()
    freq_indices=[]
    for i in range(len(cooc_df.sum(axis=0))):
        if cooc_df.sum(axis=0)[i]>5:
            freq_indices.append(i)
            
    comb_keywords=list(combinations(keywords,2))
    comb_indices=list(combinations(range(len(keywords)),2))
    similarities=[]
    for i in range(len(comb_indices)):
        indices=comb_indices[i]

        if indices[0] in freq_indices and indices[1] in freq_indices:
            cooccurrences=sum((cooc_df.iloc[:, indices[0]]+cooc_df.iloc[:, indices[1]])==2)
            similarity=(cooccurrences-1)/(min(cooc_df.sum(axis=0)[indices[0]],cooc_df.sum(axis=0)[indices[1]])-1)
            
            similarities.append((keywords[indices[0]], keywords[indices[1]], similarity))
    return similarities
# pairwise_linkage_strength(cooc_df)

In [28]:
def make_cooc_df(docs, keywords):
    voc = keywords
    countvectorizer = CountVectorizer(vocabulary = keywords, analyzer= 'word', stop_words='english', ngram_range = (1,4))
    count_wm = countvectorizer.fit_transform(docs)
    count_tokens = countvectorizer.get_feature_names_out()
    df_countvect = pd.DataFrame(data = count_wm.toarray(), columns = count_tokens)
    cooc_df=df_countvect.astype(bool).astype(int)
    return cooc_df

In [29]:
def make_cooc_df_plus(docs, keywords):

    voc = []
    for kw in keywords:

        if singularize(kw) == pluralize(kw) == kw:
            voc = voc + [kw, 'wordthatdontexist']
        elif singularize(kw) == kw:
            voc = voc + [singularize(kw), pluralize(kw)]
        elif pluralize(kw) == kw:
            voc = voc + [pluralize(kw), singularize(kw)]
        else:
            voc = voc + [kw, singularize(kw)]


    countvectorizer = CountVectorizer(vocabulary = voc, analyzer= 'word', stop_words='english', ngram_range = (1,4))
    count_wm = countvectorizer.fit_transform(docs)
    count_tokens = countvectorizer.get_feature_names_out()

    df_countvect = pd.DataFrame(data = count_wm.toarray(), columns = count_tokens)
    for i in range(int(df_countvect.shape[1]/2)):

        df_countvect.iloc[:,2*i] = df_countvect.iloc[:,2*i] + df_countvect.iloc[:,2*i+1]
    cooc_df = df_countvect.iloc[:,::2].astype(bool).astype(int)
    return cooc_df

## Build co-occurence network and visualise it

In [30]:
def visualize_network(G, cooc_df, cut_off):
    
    
    mls = mean_linkage_strength(cooc_df)[0]
    nb_of_groups = len(groups_of_graph(G))
    
    viznet = NetworkViz(notebook=True, width="100%", height="800px")
    viznet.toggle_hide_edges_on_drag(True)
    viznet.barnes_hut()
    viznet.repulsion(300)
    viznet.heading = f" Number of groups= {nb_of_groups}, MLS= {mls}, Min_edge strength = {cut_off}."


    # reuse plotly color palette
    palette = px.colors.diverging.Temps  # 7 colors

    viznet.from_nx(G)


    # update visual features 

    for node in viznet.nodes:
        freq = G.nodes[node['label']]['frequency']
        score_bucket = G.nodes[node['label']]['score_bucket'] # get from original network

        node['size'] = 20
        node['color'] = palette[2*score_bucket]  # get color based on score_bucket (1 or 2)
        node['borderWidthSelected'] = 5
        node['title'] = f"<h4>Concept: '{node['label']}'</h4><hr>Frequency: {freq}",
        # print(node)
    for edge in viznet.edges:
        # get value from main Network weight
        edge['value'] = G.edges[edge['from'], edge['to']]['weight']
        edge['title'] = G.edges[edge['from'], edge['to']]['weight']
        edge['color'] = palette[2]
        # print(edge)

    return viznet.show("concepts_network.html")

def make_network(cooc_df, MIN_EDGE_WEIGHT = 0.1):
    G = nx.Graph() # networkX instance

    #
    # TIP play with these parameters in ordeto generate different types of networks
    #

    

    EDGES_SET = pairwise_linkage_strength(cooc_df)


    #
    # build nodes from concepts, including score_avg and frequency
    # -- NOTE: score_bucket indicates if the concepts is above or below the mean_score
    # -- this value is used in the visualization below to color-code nodes
    #

    CONCEPTS_SET = cooc_df.sum()

    for index in CONCEPTS_SET.index:
        document_freq = CONCEPTS_SET[index]
        if document_freq < 5:
            continue
        
        G.add_node(index, frequency=int(document_freq), score_bucket = 0)
    

    #
    # build edges, based on concepts co-occurrence within pubs
    # -- calculate a 'weight' based on how often two concepts co-occur
    #
    



    for (a, b, weight) in EDGES_SET: 
         G.add_edge(a, b, weight=weight)


    

    #
    # this extra step is useful to remove low-weight connections
    #

    print(f".. cleaning up edges with weight < {MIN_EDGE_WEIGHT}...")

    for a, b, w in list(G.edges(data='weight')):
        if w < MIN_EDGE_WEIGHT:
            G.remove_edge(a, b)
    

#     print(f".. removing isolated nodes...")

#     G.remove_nodes_from(list(nx.isolates(G)))
    print("Nodes:", len(G.nodes()), "Edges:", len(G.edges()))
    return G
    

In [31]:
def has_k_connected_sub_graph(G, min_size = 3, k = 2):
    k_components = nx.k_components(G)
    
    try:
        list_of_k_components = k_components[k]
    except:
        list_of_k_components = []
    for k_component  in list_of_k_components:
        if len(k_component)>=min_size:
            return True
    return False

def groups_of_graph(G):
    groups = []
    large_components  =[]
    nb_of_nodes = len(G.nodes)
    minimum_group_size = int(0.25*nb_of_nodes)+1
    S = [G.subgraph(c).copy() for c in nx.connected_components(G)]
    for sub_G in S:
        if len(sub_G)>=8:
            large_components.append(sub_G)
        if len(sub_G)==1:
            continue
        elif len(sub_G)==2>=minimum_group_size:
            groups.append(list(sub_G.nodes))
        elif has_k_connected_sub_graph(sub_G, min_size = max(3, minimum_group_size), k=2):
            groups.append(list(sub_G.nodes))
            
    for G in large_components:
        groups_in_component = []
        S = [G.subgraph(c).copy() for c in nx.biconnected_components(G)]
        one_group_too_much = False
        for sub_G in S:
            if has_k_connected_sub_graph(sub_G, min_size = max(4, minimum_group_size), k=3):                
                groups_in_component.append(list(sub_G.nodes))
                
        if len(groups_in_component)>1:
            groups.remove(list(G.nodes))
            groups=groups+groups_in_component
                 
    return groups


In [32]:
def graph_into_groups(G, UPPER_LIMIT_MIN_EDGE_WEIGHT = 0.3):
    H = G.copy()
    initial_number_of_edges = len(H.edges)
    edges_deleted = 0
    cut_off=0.05
    while len(groups_of_graph(H)) <= 1 and edges_deleted<(1/3*initial_number_of_edges):
        MIN_EDGE_WEIGHT = min(list(map(list, zip(*list(H.edges(data='weight')))))[2]) #gets smallest weight of all edges
        if MIN_EDGE_WEIGHT>UPPER_LIMIT_MIN_EDGE_WEIGHT: 
            break
        for a, b, w in list(H.edges(data='weight')):
            if w == MIN_EDGE_WEIGHT:
                H.remove_edge(a, b)
                edges_deleted = edges_deleted +1
                print('Edge removed between {} and {}.'.format(a,b))
                cut_off=MIN_EDGE_WEIGHT
    
    for node in list(H.nodes):
        for group in groups_of_graph(H):
            if node in group:
                H.nodes[node]['score_bucket'] = groups_of_graph(H).index(group)+1
    
    return H, cut_off



   
    

In [401]:
docs = df['titiabs']

actual_keywords= ['extreme conditions', 'chemical attack', 'durability', 'healing agents', 'self-healing', 'concrete' ]
keywords = np.unique(actual_keywords+['retina', 'visual system', 'neuron', 'glia', 'extracellular vesicle protein', 'propagation', 'neuroinflammation', 'neurodegeneration'])
cooc_df = make_cooc_df(docs, keywords)
G=make_network(cooc_df, MIN_EDGE_WEIGHT = 0.05)
G, cut_off = graph_into_groups(G, UPPER_LIMIT_MIN_EDGE_WEIGHT = 0.3)
visualize_network(G, cooc_df, cut_off)


.. cleaning up edges with weight < 0.05...
Nodes: 10 Edges: 9


In [33]:
groups_df = pd.DataFrame([], columns=['Title', 'Keywords', 'Frequent keywords', 'Nb of freq keywords', 'MLS', 'Nb of groups'])
for i in range(0,50):

    docs = df['titiabs']
    title = trunc_df['proj_title'][i]
    keywords= list(np.unique(trunc_df['ensemble1 kws'][i]+literal_eval(trunc_df['keywords'][i])))
    indices_to_delete=[]
    for i in range(len(keywords)):
        if singularize(keywords[i]) in keywords[:i] + keywords[i+1:]:
            indices_to_delete.append(i)
    for index in sorted(indices_to_delete, reverse=True):
        del keywords[index] 
    cooc_df = make_cooc_df(docs, keywords)
    mls = mean_linkage_strength(cooc_df)[0]
    G = make_network(cooc_df, MIN_EDGE_WEIGHT = 0.05)
    G, cut_off = graph_into_groups(G, UPPER_LIMIT_MIN_EDGE_WEIGHT = 0.3)
    nb_of_groups = len(groups_of_graph(G))
    freq_key = list(G.nodes)
    df2 = pd.DataFrame([[title, keywords, freq_key, len(freq_key), mls, nb_of_groups]], columns=['Title', 'Keywords', 'Frequent keywords', 'Nb of freq keywords', 'MLS', 'Nb of groups'])
    groups_df = groups_df.append(df2)
    print(i)


Mean of empty slice.


invalid value encountered in double_scalars


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 1 Edges: 0
11
.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 7
Edge removed between energy systems and networks.
Edge removed between energy systems and mpc.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 9 Edges: 11
Edge removed between h2020 and horizon 2020.
Edge removed between horizon 2020 and situation.
Edge removed between instrument and path.
Edge removed between career and situation.
14



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 19
Edge removed between grid and materials.
Edge removed between flexibility and storage.
Edge removed between flexibility and materials.
Edge removed between flexibility and renewable energy.
Edge removed between dc and grid.
Edge removed between dc and materials.
Edge removed between dc and renewable energy.
13



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


Mean of empty slice.


invalid value encountered in double_scalars


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 1 Edges: 0
10
.. cleaning up edges with weight < 0.05...
Nodes: 9 Edges: 30
Edge removed between inflammation and intestine.
Edge removed between ibd and novel therapeutic strategies.
Edge removed between inflammation and novel therapeutic strategies.
Edge removed between inflammatory bowel disease ibd and novel therapeutic strategies.
Edge removed between ibd and monocyte.
Edge removed between inflammatory bowel disease ibd and monocyte.
Edge removed between monocyte and novel therapeutic strategies.
Edge removed between intestine and patients.
Edge removed between intestinal inflammation and monocyte.
Edge removed between intestinal inflammation and myeloid cells.
Edge removed between intestinal inflammation and novel therapeutic strategies.



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



12
.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 11
Edge removed between animals and ethics.
Edge removed between doctoral research and society.
Edge removed between animal welfare and society.
Edge removed between ethics and law.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 6 Edges: 7
Edge removed between building and professionals.
Edge removed between concept and professionals.
Edge removed between building and vlaio.
Edge removed between concept and vlaio.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 19
Edge removed between measures and socio economic.
Edge removed between economic impact and measures.
Edge removed between economic impact and spread.
Edge removed between covid 19 and socio economic.
Edge removed between socio economic and spread.
Edge removed between economic impact and health impact.
Edge removed between covid 19 and spread.
13



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 6 Edges: 4
Edge removed between farmers and insect.
Edge removed between circular economy and insect.
15



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 9 Edges: 12
Edge removed between fatigue and recovery.
Edge removed between outcomes and pathophysiology.
Edge removed between outcomes and recovery.
Edge removed between fatigue and inflammation.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 3
Edge removed between 4d and composite materials.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 4 Edges: 1
Edge removed between robotics and transmissions.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 4 Edges: 3
Edge removed between blockchain and law.
Edge removed between blockchain and legal framework.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 4 Edges: 5
Edge removed between lasers and thz.
Edge removed between ghz and lasers.
Edge removed between ghz and wavelength.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 6 Edges: 4
Edge removed between eggs and parasite.
Edge removed between eggs and stool.
Edge removed between parasite and stool.
Edge removed between quantification and stool.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 10
Edge removed between concept and microbiology.
Edge removed between bacteria and concept.
Edge removed between bacteria and humans animals.
Edge removed between concept and humans animals.
Edge removed between humans animals and microbiology.
13



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 5
Edge removed between fluorescence and vub.
Edge removed between applications and vub.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 10 Edges: 18
Edge removed between adults and cells.
Edge removed between immune checkpoint inhibitors and therapy resistance.
Edge removed between immunotherapy and therapy resistance.
Edge removed between leukemia and therapy resistance.
Edge removed between immunotherapy and nanobody.
Edge removed between aml and immunotherapy.
Edge removed between aml and nanobody.



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



12
.. cleaning up edges with weight < 0.05...
Nodes: 8 Edges: 10
Edge removed between museum and people.
Edge removed between new way and organizations.
Edge removed between people and social inclusion.
Edge removed between organizations and people.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 2
Edge removed between cargo and culture.
10



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 10
Edge removed between properties and structural health monitoring.
Edge removed between robotics and structural health monitoring.
Edge removed between self healing and structural health monitoring.
Edge removed between properties and robotics.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 3 Edges: 2
Edge removed between cvd and gene therapy.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 20
Edge removed between innovation and region.
Edge removed between innovation and view.
Edge removed between economic development and region.
Edge removed between economic development and view.
Edge removed between region and support.
Edge removed between brussels capital region and economic development.
Edge removed between brussels capital region and view.
10



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 6
Edge removed between inspection and optimization.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 10 Edges: 30
Edge removed between optimisation and reasoning.
Edge removed between optimization and reasoning.
Edge removed between br and optimisation.
Edge removed between machine learning and reasoning.
Edge removed between optimisation and optimization.
Edge removed between br and optimization.
Edge removed between machine learning and optimization.
Edge removed between environment br and learning.
Edge removed between environment br and machine learning.
Edge removed between environment br and optimisation.
Edge removed between environment br and paradigm.



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



13
.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 5
Edge removed between ambition and smes.
Edge removed between entrepreneurs and entrepreneurship.
10



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 4 Edges: 1
Edge removed between receptors and reward.
14



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 10
Edge removed between breast cancer and images.
Edge removed between breast cancer and deep learning.
Edge removed between deep learning and images.
13



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 4
Edge removed between cooperation and data protection.
Edge removed between criminal and legal protection.
Edge removed between data protection and legal protection.
Edge removed between legal protection and tax.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 8
Edge removed between health information and multilingual.
Edge removed between covid 19 and healthcare sector.
Edge removed between health information and healthcare sector.
10



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 4
Edge removed between future and horizon 2020.
Edge removed between battery and future.
14



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 1
Edge removed between climate resilient and low carbon.
13



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 4 Edges: 1
Edge removed between antibiotics and compounds.
14



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 2
Edge removed between building and electricity.
13



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 4 Edges: 1
Edge removed between member states and specifications.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 6 Edges: 10
Edge removed between liver disease and qol.
Edge removed between nafld and qol.
Edge removed between non alcoholic and qol.
Edge removed between liver cirrhosis and qol.
15



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 6 Edges: 6
Edge removed between covid 19 and team.
Edge removed between prof and team.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 6 Edges: 1
Edge removed between autoimmune diseases and immunology.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 6 Edges: 4
Edge removed between approach and chapter.
Edge removed between approach and numbers.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 8 Edges: 16
Edge removed between automated and society.
Edge removed between automation and society.
Edge removed between automated and computation.
Edge removed between computation and future.
Edge removed between computation and technology.
Edge removed between society and technology.



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



11
.. cleaning up edges with weight < 0.05...
Nodes: 8 Edges: 12
Edge removed between drivers and water.
Edge removed between climate change and si.
Edge removed between si and water.
Edge removed between influence and water.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 6
Edge removed between environments and users.
Edge removed between palliative and users.
15



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 3 Edges: 1
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 4 Edges: 2
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 5
Edge removed between financing and supervision.
Edge removed between months and resources.
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 5 Edges: 2
Edge removed between esa and experiment.
Edge removed between esa and flight.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 4 Edges: 1
Edge removed between equations and solver.
12



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 2 Edges: 1
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



.. cleaning up edges with weight < 0.05...
Nodes: 3 Edges: 2
11



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [34]:
groups_df.index= range(len(groups_df))

In [37]:
groups_df.sort_values(by=['Nb of groups'])

Unnamed: 0,Title,Keywords,Frequent keywords,Nb of freq keywords,MLS,Nb of groups
0,Use of beetroot juice to protect against posto...,"[beetroot juice, colorectal surgery, following...",[protective effect],1,,0
47,FWO travel grant for a short stay abroad in th...,"[computational issues, equations, evolution eq...","[equations, solver, travel grant, united states]",4,0.181159,0
46,Diamond NV color center spin resonance magneto...,"[balloon, carbon materials, diamant, dlr, esa,...","[carbon materials, esa, experiment, flight, re...",5,0.110173,0
42,Designing person centred palliative environments,"[actual users, architecture, designer, environ...","[architecture, designer, environments, palliat...",7,0.128909,0
39,The Blue Thread A Redactional and Narrative Cr...,"[approach, blue thread, book of numbers, chapt...","[approach, chapter, flow, narrative, numbers, ...",6,0.10351,0
38,Research in translational immunomodulation,"[autoimmune diseases, cd4 43, cell subsets, ef...","[autoimmune diseases, cd4 43, cell subsets, et...",6,0.123935,0
37,DSI COVID 19 team,"[covid 19, dsi, epidemiology and public health...","[covid 19, hens, mathematical model, models, p...",6,0.096161,0
35,Study on common specifications for road markin...,"[assistance systems, common specification, dri...","[member states, specific attention, specificat...",4,0.047544,0
33,Exploring the three dimensional structure of a...,"[amps, antibiotics, antimicrobial peptides, ci...","[antibiotics, compounds, computational chemist...",4,0.107692,0
32,H2020 BAT4EVER Building a Low Carbon Climate R...,"[batteries, building low, calendar life, carbo...","[batteries, climate resilient, future generati...",5,0.085595,0


In [426]:
trunc_df

Unnamed: 0,proj_id,proj_title,abstract,titiabs,keywords,tfidf + score,dimcli keywords + score,rake 2 + score,rake 3 + score,ensemble1 kws
0,d6e5686b-f561-4207-9c85-d72920542b70,Use of beetroot juice to protect against posto...,p Postoperative ileus POI is a transient impa...,use of beetroot juice to protect against posto...,"['colorectal surgery', 'ileus', 'beetroot juice']","[(beetroot juice, 0.3616689256201938), (beetro...","[(beetroot juice, 0.694), (postoperative ileus...","[(beetroot juice, 5.666666666666667), (colorec...","[(increased healthcare cost, 9.5), (oxidative ...","[beetroot juice, colorectal surgery, postopera..."
1,c8bd48f5-3b1b-4965-b797-ffbbb29c4691,Model Predictive Control of Fourth Generation ...,p Decarbonisation of the energy system result...,model predictive control of fourth generation ...,"['thermal networks', 'model predictive control']","[(networks, 0.3989342237716239), (dh, 0.346113...","[(model predictive control, 0.802), (predictiv...","[(dh networks, 5.75), (heating sector, 5.0), (...","[(energy system results, 9.333333333333334), (...","[mpc, predictive control, networks, model pred..."
2,4c0dd72c-9abc-4ff9-b541-e01d5259da7a,H2020 CARLA The European Photonics CAReer LAun...,Europe faces a situation in which the photonic...,h2020 carla the european photonics career laun...,"['data storage', 'biophotonics', 'european uni...","[(photonics, 0.4291152312482264), (launch, 0.3...","[(photonics industry, 0.149), (path, 0.115), (...","[(h2020 carla, 4.0), (europe faces, 4.0), (pho...",[],"[photonics industry, path, source, instrument,..."
3,fcae1028-f9fe-48e4-aa44-b3c6a302d80d,Renewable Energy SOlutions for URban communiti...,p align left CHALLENGE p p align left The inc...,renewable energy solutions for urban communiti...,"['storage', 'renewable energy', 'flexibility',...","[(energy, 0.3042056601368213), (circular, 0.25...","[(energy systems, 0.756), (renewable energy so...","[(renewable energy, 5.444444444444445), (dc ba...","[(citizens amp children, 10.0), (circular econ...","[grid, energy solutions, energy systems, circu..."
4,808ddd2c-6465-43ba-ba61-0332bd2e9ff2,Lower Bounds for partial differential operator...,p align left The theory of partial differenti...,lower bounds for partial differential operator...,['differential operators'],"[(differential operators, 0.3964515844619687),...","[(partial differential operators, 0.915), (com...","[(lower bounds, 4.5), (important branches, 4.0...","[(partial differential operators, 9.4222222222...","[differential operators, lower bounds, lie gro..."
5,e467c2ca-8fe7-438c-a5d5-6dc88f7b079f,The role of myeloid cells in the resolution of...,p Inflammatory bowel disease IBD is character...,the role of myeloid cells in the resolution of...,"['myeloid cells', 'intestine', 'inflammation']","[(intestinal inflammation, 0.30230517967878295...","[(intestinal inflammation, 0.721), (inflammati...","[(pro resolving, 5.5), (altered monocyte, 4.66...","[(gastrointestinal tract leading, 9.0), (blood...","[intestinal inflammation, ibd, monocyte, pro r..."
6,96cbc9ba-9882-4051-9d0b-7a2055640eef,Project Animal Welfare Animal Ethics and Anima...,p This project is a collaboration between Ghe...,project animal welfare animal ethics and anima...,"['animal welfare', 'animal ethics', 'animal law']","[(animals, 0.35315070684207384), (li, 0.338245...","[(animal ethics, 0.539), (animal law, 0.532), ...","[(li li, 5.5), (doctoral research, 5.166666666...","[(questions ul li, 8.75), (effect li ul, 8.75)...","[welfare, ethics, law, animals, li, 34, li li,..."
7,5a265c63-d895-4f03-8df6-8c9d86be7418,VLAIO development project CircuWallPanels,This project aims to develop the concept of th...,vlaio development project circuwallpanels. vla...,"['circuwallpanels', 'research by design']","[(building, 0.3056753906923478), (development ...","[(concept, 0.034), (ecological philosophy, 0.0...","[(project aims, 5.333333333333334), (ecologica...","[(building physics requirements, 8.0)]","[building, ecological philosophy, development ..."
8,f53687a7-060e-420e-897c-3cfa400ee5d6,Balancing socio economic and public health imp...,Given the uncertainty about the further develo...,balancing socio economic and public health imp...,"['psychological analysis', 'socio-economic ana...","[(socio economic, 0.24131538533038993), (socio...","[(economic impact, 0.204), (impact, 0.178), (a...","[(spatial heterogeneity, 4.8), (mitigation sop...","[(require geostatistical methods, 9.5), (balan...","[health impact, socio economic, economic impac..."
9,134444be-528b-4bce-bfbc-c51f28c2fe35,Introsect,Research in the insect sector is not standing ...,introsect. introsect. research in the insect s...,"['insect farming', 'mealworms', 'black soldier...","[(insect, 0.42197349200436807), (breeding, 0.2...","[(insect, 0.786), (farmers, 0.771), (breeding,...","[(insect sector, 4.4), (insect breeding, 4.4),...","[(introduce insect farming, 8.4), (effective i...","[insect, insect breeding, farmers, agricultura..."


In [452]:
for i in range(len(filtered_df)):
    if ' li ' in filtered_df['titiabs'][i]:
        print(filtered_df['titiabs'][i])
    

project animal welfare animal ethics and animal law. project animal welfare animal ethics and animal law.  this project is a collaboration between ghent university and ku leuven p the research involves supervising a doctorate about the role of the animal in society with as common thread the 34 one welfare 34 concept in which the interaction between animal and human welfare and the environment are central 34 one welfare 34 is considered a broadening of the 34 one health 34 theme with which it also partly overlaps in recent months it has become even clearer that the health and welfare of humans cannot be separated from that of animals and the environment there is only 34 one well being 34 animals play an important role in this but how we treat them can differ these differences are usually not determined by the animal itself but by humans p more specifically the doctoral research departs from the following questions ul li how do we deal with animals in society li li what regulations are c

trailer on barge. trailer on barge.  the vil project trailer on barge shuttle aims to provide an innovative alternative in congestion sensitive areas in flanders by shuttling empty and or loaded trailers by barge over relatively short distances by means of a ro ro roll on roll off concept p the transport sector today is being confronted with a rising transport demand meeting this demand is a difficult task due to multiple factors driver shortage increasing congestion on our roads which raises the amount of hours lost in traffic and the total amount of time it takes to complete a transport order in other words on the suppliers side it is necessary to make optimal use of the current transport capacity p barge as an alternative p more often inland shipping is seen as an alternative transport mode because there is still capacity available there are already efficient solutions being offered for bulk and import export containers but for piece goods it is more difficult currently this cargo i

In [447]:
trunc_df['titiabs'][6]

'project animal welfare animal ethics and animal law. project animal welfare animal ethics and animal law.  this project is a collaboration between ghent university and ku leuven p the research involves supervising a doctorate about the role of the animal in society with as common thread the 34 one welfare 34 concept in which the interaction between animal and human welfare and the environment are central 34 one welfare 34 is considered a broadening of the 34 one health 34 theme with which it also partly overlaps in recent months it has become even clearer that the health and welfare of humans cannot be separated from that of animals and the environment there is only 34 one well being 34 animals play an important role in this but how we treat them can differ these differences are usually not determined by the animal itself but by humans p more specifically the doctoral research departs from the following questions ul li how do we deal with animals in society li li what regulations are 

In [38]:
i=24
docs = df['titiabs']
keywords= np.unique(trunc_df['ensemble1 kws'][i]+literal_eval(trunc_df['keywords'][i]))
cooc_df = make_cooc_df(docs, keywords)
G=make_network(cooc_df, MIN_EDGE_WEIGHT = 0.05)
G, cut_off = graph_into_groups(G, UPPER_LIMIT_MIN_EDGE_WEIGHT = 0.3)
visualize_network(G, cooc_df, cut_off)

.. cleaning up edges with weight < 0.05...
Nodes: 7 Edges: 6
Edge removed between inspection and optimization.


In [36]:
G=make_network(cooc_df, MIN_EDGE_WEIGHT = 0.05)
G, cut_off = graph_into_groups(G, UPPER_LIMIT_MIN_EDGE_WEIGHT = 0.3)
visualize_network(G, cooc_df, cut_off)

.. cleaning up edges with weight < 0.05...
Nodes: 8 Edges: 13
Edge removed between doctoral research and law.
Edge removed between doctoral research and society.
Edge removed between animal welfare and society.
Edge removed between society and welfare.


## Compare keyword extracters

In [143]:
eval_df =  trunc_df.iloc[:,:5]

trunc_df['keywords'] = trunc_df.apply(lambda x: literal_eval(x['keywords']),axis=1)
eval_df['dimcli_score'] = trunc_df.apply(lambda x: compare_lists(x['keywords'], x['dimcli keywords']),axis=1)
eval_df['tfidf_score'] = trunc_df.apply(lambda x: compare_lists(x['keywords'], x['tfidf']),axis=1)
eval_df['rake_score'] = trunc_df.apply(lambda x: compare_lists(x['keywords'], x['rake']),axis=1)


eval_df['tfidf vs dimcli'] = trunc_df.apply(lambda x: compare_lists(x['dimcli keywords'], x['tfidf']),axis=1)
eval_df['nb of keywords'] = trunc_df.apply(lambda x: len(x['keywords']),axis=1)
eval_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0,proj_id,proj_title,abstract,titiabs,keywords,tfidf + score,dimcli_score,tfidf_score,rake_score,tfidf vs dimcli,nb of keywords
0,d6e5686b-f561-4207-9c85-d72920542b70,Use of beetroot juice to protect against posto...,p Postoperative ileus POI is a transient impa...,use of beetroot juice to protect against posto...,"[colorectal surgery, ileus, beetroot juice]","[(beetroot juice, 0.3319857430680271), (beetro...",0.666667,1.000000,0.666667,0.5,3
1,c8bd48f5-3b1b-4965-b797-ffbbb29c4691,Model Predictive Control of Fourth Generation ...,p Decarbonisation of the energy system result...,model predictive control of fourth generation ...,"[thermal networks, model predictive control]","[(networks, 0.30954756508561154), (dh, 0.26856...",0.500000,0.000000,0.500000,0.1,2
2,4c0dd72c-9abc-4ff9-b541-e01d5259da7a,H2020 CARLA The European Photonics CAReer LAun...,Europe faces a situation in which the photonic...,h2020 carla the european photonics career laun...,"[data storage, biophotonics, european union (e...","[(carla, 0.34687848630296225), (photonics, 0.2...",0.000000,0.000000,0.000000,0.1,4
3,fcae1028-f9fe-48e4-aa44-b3c6a302d80d,Renewable Energy SOlutions for URban communiti...,p align left CHALLENGE p p align left The inc...,renewable energy solutions for urban communiti...,"[storage, renewable energy, flexibility, dc ba...","[(align left, 0.3881504963491191), (align, 0.3...",0.000000,0.000000,0.250000,0.1,4
4,808ddd2c-6465-43ba-ba61-0332bd2e9ff2,Lower Bounds for partial differential operator...,p align left The theory of partial differenti...,lower bounds for partial differential operator...,[differential operators],"[(differential operators, 0.3287138808680321),...",1.000000,1.000000,0.000000,0.3,1
...,...,...,...,...,...,...,...,...,...,...,...
95,14d30255-a8c7-4779-bbf4-7ba8df72e6bd,Single cell immune profiling to improve patien...,p Cancer immunotherapy using immune checkpoin...,single cell immune profiling to improve patien...,[single-cell transcriptome],"[(icb, 0.4244016516670835), (single cell, 0.17...",0.000000,0.000000,0.000000,0.3,1
96,df6e0dfc-8b07-44ef-a451-999184a4ef4a,Impact of clinical guidance amp point of care ...,Most antibiotics are prescribed in ambulatory ...,impact of clinical guidance amp point of care ...,"[experimental study, infections]","[(crp, 0.2869971223784021), (point care, 0.245...",0.000000,0.000000,0.000000,0.1,2
97,223002ab-29ac-4d1b-9f31-1ab50efb1265,Exploiting microbiomes to control Phytophthora...,p Phytophthora cryptogea is an important oomy...,exploiting microbiomes to control phytophthora...,"[lettuce, beneficial microbiota, phytophthora ...","[(cryptogea, 0.4509118411892602), (lettuce, 0....",0.333333,0.666667,0.333333,0.4,3
98,fafe5398-cac9-4ce6-9812-f4e1b55a2477,Contract agreement between The Ministry of Tra...,The objective of this agreement is to establis...,contract agreement between the ministry of tra...,"[master, mobility, transport]","[(transportation sciences, 0.22073084723927083...",0.333333,0.333333,0.000000,0.2,3


In [151]:
eval_df.loc[eval_df['nb of keywords'] >2].mean(axis=0)

dimcli_score       0.140535
tfidf_score        0.145473
rake_score         0.074074
tfidf vs dimcli    0.203704
nb of keywords     4.740741
dtype: float64

In [144]:
eval_df.mean(axis=0)

dimcli_score       0.190321
tfidf_score        0.208877
rake_score         0.073883
tfidf vs dimcli    0.224111
nb of keywords     2.770000
dtype: float64