In [1]:
# Import required packages
import numpy as np
import pandas as pd
import csv
import string
import re
from pprint import pprint

!pip install stop_words
# Define function for tokenize and lemmatizing
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords



## Read dataset

In [2]:
# rename the header from 'Summary' to 'Abstract'
df = pd.read_csv('abstract.csv', index_col = False)
df.rename(columns = {'Summary':'Abstract'}, inplace = True)
df.head()

Unnamed: 0.1,Unnamed: 0,Abstract
0,0,The article discuss the dental informatics as ...
1,1,The goal of the Collaboration Spotting project...
2,2,The article talks about how dental informatics...
3,3,This journal review points out the articles ab...
4,4,The article talks about dental informatics as ...


In [3]:
df.shape

(129, 2)

In [4]:
# cleaning data

#lowercase and removing special characters and commonly occurring words
#df['Abstract'] = df['Abstract'].apply(lambda x: " ".join(x for x in str(x).split() if not x.isdigit() and not x.isspace()))
#df['Abstract'] = df['Abstract'].str.replace(r'[^\w\s]','')
#df['Abstract'] = df['Abstract'].str.replace(r'\d+', '')
#df['Abstract'] = df['Abstract'].str.lower()

stop = stopwords.words('english')
df['Abstract']= df['Abstract'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


In [5]:
# create lists of abstract and combine list to string

# create list of documents
abstract_set = [abstract for abstract in df['Abstract']]

# Remove new line characters
abstract_set = [re.sub(r'\s+', ' ', sent) for sent in abstract_set]

# Remove distracting single quotes
abstract_set = [re.sub("\'", "", sent) for sent in abstract_set]

# combine list to string 
abstract_string = ' '.join([(item) for item in abstract_set])


## new dataset with only 'Title'

In [6]:
# create new dataset with only Titles
titles = pd.read_csv('article_info.csv')
header = ['Title']
titles.to_csv('title.csv', columns = header)

In [7]:
df1 = pd.read_csv('title.csv', index_col = False)

In [8]:
# cleaning data

#lowercase and removing special characters and commonly occurring words
#df1['Title'] = df1['Title'].apply(lambda x: " ".join(x for x in str(x).split() if not x.isdigit() and not x.isspace()))
#df1['Title'] = df1['Title'].str.replace(r'[^\w\s]','')
#df1['Title'] = df1['Title'].str.replace(r'\d+', '')
#df1['Title'] = df1['Title'].str.lower()

stop = stopwords.words('english')
df1['Title']= df1['Title'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [9]:
# create lists of abstract and combine list to string

# create list of documents
title_set = [title for title in df1['Title']]

# Remove new line characters
title_set = [re.sub(r'\s+', ' ', sent) for sent in title_set]

# Remove distracting single quotes
title_set = [re.sub("\'", "", sent) for sent in title_set]

# combine list to string 
title_string = ' '.join([(item) for item in title_set])

# Import required packages for keyBERT

In [1]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.5.0.tar.gz (19 kB)
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 2.4 MB/s eta 0:00:01
Collecting rich>=10.4.0
  Downloading rich-11.0.0-py3-none-any.whl (215 kB)
[K     |████████████████████████████████| 215 kB 11.0 MB/s eta 0:00:01
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 13.1 MB/s eta 0:00:01
Collecting torchvision
  Downloading torchvision-0.11.2-cp38-cp38-macosx_10_9_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 3.0 MB/s eta 0:00:01
Collecting torch>=1.6.0
  Downloading torch-1.10.1-cp38-none-macosx_10_9_x86_64.whl (147.1 MB)
[K     |████████████████████████████████| 147.1 MB 47 kB/s  eta 0:00:012
[?25hBuilding wheels for collected packages: keybert, sentence-transformers
  Building wheel for keybert (setup.py) 

In [10]:
from keybert import KeyBERT

In [11]:
kw_model = KeyBERT(model = 'distilbert-base-nli-mean-tokens')

# Abstracts

## Top 50 keywords (1-grams) from the abstracts

In [12]:
kw_model.extract_keywords(abstract_string, 
                          top_n = 50,
                          keyphrase_ngram_range=(1, 1),
                          use_mmr = True,
                          diversity = 0.2,
                          stop_words=None)



[('dentists', 0.4951),
 ('bioinformatics', 0.4191),
 ('dentistry', 0.4807),
 ('dentist', 0.4555),
 ('cancer', 0.3524),
 ('orthodontic', 0.3766),
 ('dental', 0.3956),
 ('antibiotics', 0.331),
 ('radiology', 0.3236),
 ('diabetes', 0.2828),
 ('clinics', 0.3189),
 ('biomedical', 0.3259),
 ('genomics', 0.3159),
 ('physicians', 0.3131),
 ('harvard', 0.2607),
 ('orthodontist', 0.3557),
 ('wisconsin', 0.2525),
 ('proteomics', 0.3062),
 ('librarian', 0.2605),
 ('orthognathic', 0.2789),
 ('clinicians', 0.3142),
 ('patents', 0.2713),
 ('therapy', 0.2876),
 ('videoconferencing', 0.2328),
 ('researchers', 0.2571),
 ('medicine', 0.3034),
 ('pathology', 0.2563),
 ('exam', 0.2536),
 ('doctors', 0.3066),
 ('clinician', 0.311),
 ('techcnological', 0.2522),
 ('nurse', 0.2501),
 ('clinic', 0.2955),
 ('biomaterials', 0.2832),
 ('smartphones', 0.1877),
 ('universities', 0.2361),
 ('2007', 0.2117),
 ('tumor', 0.2699),
 ('faculty', 0.2635),
 ('radiography', 0.2661),
 ('1107', 0.2052),
 ('doctor', 0.2776),
 ('

## Top 50 keywords (2-grams) from the abstracts 

In [75]:
# with low diversity

kw_model.extract_keywords(abstract_string, 
                          top_n = 50,
                          keyphrase_ngram_range=(1, 2),
                          use_mmr = True,
                          diversity = 0.2,
                          stop_words=None)



[('bioinformatics dental', 0.6829),
 ('dentistry questionnaire', 0.6347),
 ('dental research', 0.6441),
 ('cancer patients', 0.536),
 ('dentists interview', 0.6036),
 ('informatics dentistry', 0.6253),
 ('dentistry years', 0.5956),
 ('dentistry guide', 0.6176),
 ('dentistry survey', 0.623),
 ('dental educators', 0.6057),
 ('visualization dentists', 0.6005),
 ('dental biomaterials', 0.6077),
 ('dental informatics', 0.6077),
 ('school dentistry', 0.5853),
 ('bioinformatics courses', 0.5409),
 ('revolution dental', 0.5493),
 ('dentistry reusing', 0.5801),
 ('delivering dental', 0.5818),
 ('dental radiology', 0.5759),
 ('clinical dentistry', 0.5854),
 ('dental practinoers', 0.5885),
 ('updated dental', 0.5687),
 ('dental postgraduate', 0.5836),
 ('dentists learning', 0.5845),
 ('workflow dental', 0.5839),
 ('dental academy', 0.5833),
 ('dental medicine', 0.5866),
 ('challenges dentists', 0.5658),
 ('hygienist dentists', 0.5741),
 ('dental faculty', 0.5865),
 ('dental inforamtics', 0.5862),

## Top 50 keywords (3-grams) from the abstracts 

In [14]:
kw_model.extract_keywords(abstract_string, 
                          top_n = 50,
                          keyphrase_ngram_range=(1, 3),
                          use_mmr = True,
                          diversity = 0.2,
                          stop_words=None)



[('dental informatics research', 0.7422),
 ('revolution bioinformatics dental', 0.7283),
 ('updated dental providers', 0.7065),
 ('education continuing dental', 0.7024),
 ('dental research students', 0.7137),
 ('dental informaticians researchers', 0.7228),
 ('bioinformatics dental informatics', 0.7271),
 ('dental education explore', 0.7101),
 ('dentistry questionnaire survey', 0.6949),
 ('techcnological changes dentistry', 0.6869),
 ('dental informatics evolving', 0.7046),
 ('research tools dentistry', 0.6969),
 ('dental clinical research', 0.7143),
 ('development dental students', 0.6998),
 ('developing dental informatics', 0.7081),
 ('dentistry reusing research', 0.6837),
 ('oral cancer genomics', 0.6438),
 ('dental informatics biomedical', 0.7051),
 ('dentists interview transcripts', 0.6622),
 ('engineering dental research', 0.6981),
 ('dental research summarizing', 0.7014),
 ('healthcare dental research', 0.7005),
 ('technologies adoption dental', 0.6793),
 ('dental informatics app

In [71]:
# with higher diversity

kw_model.extract_keywords(abstract_string, 
                          top_n = 50,
                          keyphrase_ngram_range=(1, 2),
                          use_mmr = True,
                          diversity = 0.7,
                          stop_words=None)



[('bioinformatics dental', 0.6829),
 ('latest initiatives', 0.2287),
 ('500 patients', 0.1676),
 ('smoking history', 0.2985),
 ('diabetes increased', 0.2853),
 ('university texas', 0.2336),
 ('cancer patients', 0.536),
 ('102 provoders', 0.2775),
 ('school administrators', 0.2444),
 ('broadband technologies', 0.2557),
 ('pilot testing', 0.3068),
 ('clinics chinese', 0.2848),
 ('trends iowa', 0.3084),
 ('clinic july', 0.3333),
 ('canal therapy', 0.4501),
 ('statistics lessons', 0.3123),
 ('revolution bioinformatics', 0.4779),
 ('researchers successfully', 0.2416),
 ('harvard school', 0.3148),
 ('jordanian students', 0.2413),
 ('completed orthodontic', 0.4358),
 ('twenty dental', 0.3019),
 ('health data', 0.3356),
 ('forest naïve', -0.0131),
 ('2017 data', 0.2536),
 ('years interviewing', 0.3386),
 ('research oral', 0.4341),
 ('smartphones positive', 0.1634),
 ('genomic studies', 0.4186),
 ('different dentists', 0.4403),
 ('scandinavian practitioner', 0.1798),
 ('mining methods', 0.2144)

# Titles

## Top 50 keywords (1-gram) from the titles

In [13]:
kw_model.extract_keywords(title_string, 
                          top_n = 50,
                          keyphrase_ngram_range=(1, 1), 
                          use_mmr = True,
                          diversity = 0.2,
                          stop_words=None)



[('dentistry', 0.4966),
 ('bioinformatics', 0.4207),
 ('dentists', 0.4747),
 ('dentist', 0.4431),
 ('dental', 0.3994),
 ('medicine', 0.3559),
 ('cancer', 0.3197),
 ('clinics', 0.35),
 ('orthodontic', 0.3427),
 ('biomedical', 0.3325),
 ('genomics', 0.3213),
 ('diabetes', 0.2778),
 ('clinic', 0.331),
 ('healthcare', 0.3178),
 ('biomaterials', 0.2984),
 ('pathology', 0.2553),
 ('innovation', 0.2509),
 ('therapy', 0.2641),
 ('radiography', 0.2299),
 ('academics', 0.2208),
 ('informatics', 0.2403),
 ('digidontics', 0.2299),
 ('medical', 0.2799),
 ('dentofacial', 0.2274),
 ('tumor', 0.2516),
 ('myanmar', 0.1738),
 ('hospital', 0.259),
 ('saudi', 0.1729),
 ('broadband', 0.177),
 ('engineering', 0.1972),
 ('chinas', 0.1454),
 ('decades', 0.1456),
 ('pubmed', 0.1807),
 ('universities', 0.1967),
 ('hygienists', 0.1893),
 ('epidemiology', 0.1934),
 ('online', 0.1723),
 ('mapping', 0.1834),
 ('legal', 0.1622),
 ('clinical', 0.2017),
 ('postgraduates', 0.1755),
 ('jordan', 0.1697),
 ('software', 0.

## Top 50 keywords (2-grams) from the titles

In [76]:
kw_model.extract_keywords(title_string, 
                          top_n = 50,
                          keyphrase_ngram_range=(1, 2), 
                          use_mmr = True,
                          diversity = 0.2,
                          stop_words=None)



[('innovation dental', 0.7059),
 ('medicine informatics', 0.594),
 ('genomics dental', 0.6494),
 ('revolution dental', 0.6286),
 ('dental informatics', 0.6256),
 ('computerization dentistry', 0.5866),
 ('clinical dentistry', 0.5965),
 ('dental research', 0.6064),
 ('dentists ready', 0.5718),
 ('technology dental', 0.5883),
 ('research dental', 0.6011),
 ('comprehensive dental', 0.5764),
 ('dentistry course', 0.5907),
 ('improving dental', 0.5588),
 ('exploring dental', 0.5845),
 ('dental radiography', 0.5686),
 ('dental postgraduates', 0.5815),
 ('research dentist', 0.5811),
 ('cornerstone dental', 0.5614),
 ('dental education', 0.5799),
 ('repurposing dental', 0.5706),
 ('oral genomics', 0.5494),
 ('dental researchers', 0.5826),
 ('dentists clinical', 0.5704),
 ('online dental', 0.5426),
 ('data dentistry', 0.5632),
 ('evolution dental', 0.5507),
 ('medical dental', 0.5615),
 ('training dental', 0.5674),
 ('dental school', 0.5686),
 ('dental software', 0.5651),
 ('technologies dental'

## Top 50 keywords (3-grams) from the titles

In [15]:
kw_model.extract_keywords(title_string, 
                          top_n = 50,
                          keyphrase_ngram_range=(1, 3), 
                          use_mmr = True,
                          diversity = 0.2,
                          stop_words=None)



[('informatics innovation dental', 0.8034),
 ('revolution dental education', 0.7421),
 ('evolution dental informatics', 0.7684),
 ('genomics dental informatics', 0.7605),
 ('innovation dental care', 0.7683),
 ('dental informatics emerging', 0.7453),
 ('dental informatics cornerstone', 0.7352),
 ('improving dental research', 0.7172),
 ('dental researchers informatics', 0.7316),
 ('emergence dental informatics', 0.7285),
 ('informatics cornerstone dental', 0.7343),
 ('informatics improving dental', 0.7231),
 ('dental informatics online', 0.6995),
 ('information revolution dental', 0.7126),
 ('informatics training dental', 0.71),
 ('dental informatics clinical', 0.7137),
 ('oral medicine informatics', 0.6803),
 ('progress dental informatics', 0.7068),
 ('innovation dental', 0.7059),
 ('frontier dental informatics', 0.696),
 ('technology dental education', 0.685),
 ('dental data integration', 0.6883),
 ('clinical dentistry pilot', 0.6509),
 ('dental informatics saudi', 0.6869),
 ('dental i