# Experiments on Key Phrase Extraction Techniques
Explore KeyBERT under several design choices:
* how the initial candidates are generated - spaCy noun phrases, bart-base-keyphrase-generation, or simple n-grams (what KeyBERT does)
* which sentence-transformer model is used for the document and candidate-phrase embeddings
* how results are diversified - Max Sum Similarity or Maximal Marginal Relevance

In [75]:
# Setup
import json
from pathlib import Path

import numpy as np
import pandas as pd
from keybert import KeyBERT
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import ParameterGrid

## Data

In [2]:
# Using the standard 20-newsgroups dataset which has categories
# We also plan to run KPE (key phrase extraction) on our Medium blog posts

In [3]:
# newsgroups_train = fetch_20newsgroups(subset='train') #remove=('headers', 'footers', 'quotes')

In [4]:
# list(newsgroups_train.target_names)
# newsgroups_train.filenames
# newsgroups_train.data[345]

### Loading the scraped Medium articles

In [5]:
# !python data_extract.py

In [6]:
# Load the scraped articles. The rest of the notebook reads `data` as a
# mapping of document name -> document text (via keys()/values()/items()).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
with open('data/medium_scraped_data.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)

## Phrase Extraction

### KeyBERT

In [62]:
# Select a single article by position and build the KeyBERT model used for
# the one-off extraction in the next cell.
i = 1
doc_name, doc_text = list(data.items())[i]
print(doc_name)
kw_model = KeyBERT(model='multi-qa-distilbert-cos-v1')

questgen-an-open-source-nlp-library-for-question-generation-algorithms-1e18067fcdc6


Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [63]:
# One-off extraction on the selected article: n-gram range (1, 3), no
# stop-word list, top 10 phrases, MMR enabled with diversity=0.5 and
# Max Sum disabled.
extract_options = dict(
    keyphrase_ngram_range=(1, 3),
    stop_words=None,
    top_n=10,
    use_maxsum=False,
    nr_candidates=20,
    use_mmr=True,
    diversity=0.5,
)
keywords = kw_model.extract_keywords(doc_text, **extract_options)
keywords

[('question generation', 0.7528),
 ('natural language processing', 0.4861),
 ('the t5 transformer', 0.1793),
 ('quick assessments from', 0.3247),
 ('questgen ai github', 0.2876),
 ('answering is', 0.2679),
 ('library are mcqs', 0.2602),
 ('during covid 19', 0.0579),
 ('other awesome interns', 0.2034),
 ('models from scratch', 0.2411)]

In [85]:
# Search space for the comparison: every embedding model, crossed with the
# n-gram range and the two diversification switches.
embedding_models = [
    'all-mpnet-base-v2',
    'multi-qa-mpnet-base-dot-v1',
    'all-distilroberta-v1',
    'all-MiniLM-L12-v2',
    'multi-qa-distilbert-cos-v1',
    'all-MiniLM-L6-v2',
]
param_grid = ParameterGrid({
    'model': embedding_models,
    'ngram_range': [(1, 2), (1, 3)],
    'include_maxsum': [True, False],
    'include_mmr': [True, False],
})

In [None]:
# Run every parameter combination against every article and collect one row
# per (document, configuration) in `result`.
result = []

# Each KeyBERT model is built once per model name and reused for all
# documents and settings, instead of being re-created on every inner-loop
# iteration. NOTE: this keeps all models in memory at the same time.
kw_models = {}

for doc_name, doc_text in data.items():

    print(doc_name)
    for param in param_grid:
        # Skip the combination with both diversification switches on, so every
        # recorded row uses at most one diversification method.
        if param['include_maxsum'] and param['include_mmr']:
            continue

        model_name = param['model']
        if model_name not in kw_models:
            kw_models[model_name] = KeyBERT(model=model_name)
        kw_model = kw_models[model_name]

        keywords = kw_model.extract_keywords(doc_text,
                                     keyphrase_ngram_range=param['ngram_range'],
                                     stop_words=None,
                                     top_n = 10,
                                     use_maxsum=param['include_maxsum'], nr_candidates=20,
                                     use_mmr=param['include_mmr'], diversity=0.5)

        # Keep only the phrases from the (phrase, score) pairs. A separate name
        # is used so `keywords` still holds the pairs, and the generator copes
        # with an empty result (list(zip(*keywords))[0] raises IndexError there).
        phrases = tuple(phrase for phrase, _ in keywords)

        result.append([doc_name, model_name, param['ngram_range'], param['include_maxsum'], param['include_mmr'], phrases])


3-ways-to-generate-distractors-wrong-choices-for-mcqs-using-natural-language-processing-d52477a56812
questgen-an-open-source-nlp-library-for-question-generation-algorithms-1e18067fcdc6


In [None]:
# Tabulate the collected rows; the column order matches the order of the
# values in each list appended to `result`.
result_columns = ['doc', 'model', 'ngram', 'maxsum_diversity', 'mmr_diversity', 'keywords']
result_df = pd.DataFrame(result, columns=result_columns)

In [80]:
# Persist the experiment table. The output folder is created first so that a
# fresh checkout without a `results/` directory does not fail inside to_csv.
results_path = Path('results/keybert_experiments.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(results_path, index=False)

In [84]:
# Show just the phrases of the current `keywords` value.
# Works whether `keywords` holds (phrase, score) pairs or plain phrase
# strings, and returns an empty tuple (rather than raising IndexError) when
# there are no keywords.
tuple(kw if isinstance(kw, str) else kw[0] for kw in keywords)

('distractors using wordnet',
 'generating distractors using',
 'generating distractors for',
 'generating distractors',
 'generate distractors using',
 'efficient distractors',
 'efficient distractors for',
 'answer distractors',
 'generate distractors',
 'distractors for given')