In [1]:
import json
import re
import boto3
import tqdm
import pytz
from datetime import datetime
import pandas as pd
from keybert import KeyBERT
import spacy
from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer

In [2]:
import warnings
# Silence every warning of category UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

In [8]:
# NOTE(review): require_gpu() presumably fails when no GPU is available —
# confirm that a hard GPU requirement is intended for this notebook.
spacy.require_gpu()
# The loaded pipeline is discarded right away; presumably this only checks that
# 'en_core_web_sm' is installed / loadable before keyword extraction starts — TODO confirm.
_ = spacy.load('en_core_web_sm')

In [11]:
# Minimum model score a keyword needs to be kept; also the score given to
# hyphenated title words that are added as keywords.
_MIN_KEYWORD_SCORE = 0.25
# Keywords made of more words than this are discarded.
_MAX_KEYWORD_WORDS = 4
_ET_AL_PATTERN = re.compile(r'\bet al\b')
# Letters joined by one or more hyphens, e.g. "follow-up", "state-of-the-art".
# Uses [A-Za-z]; the range [A-z] would also match "[", "\", "]", "^", "_" and "`".
_HYPHENATED_PATTERN = re.compile(r'[A-Za-z]+(?:-[A-Za-z]+)+')
# Punctuation and quote characters removed from title words.
_TITLE_PUNCTUATION = str.maketrans('', '', ':()?[].,"\'’‘“”')


def _clean_title_word(title_word):
    """Strip possessive suffixes and punctuation/quote characters from a title word."""
    without_possessive = title_word.replace("'s", "").replace("’s", "")
    return without_possessive.translate(_TITLE_PUNCTUATION)


def _match_title_word(word, title_entries):
    """Return the cleaned title word corresponding to ``word`` ('' if none).

    ``title_entries`` holds ``(lowercased raw word, cleaned word)`` pairs.
    An exact (case-insensitive) match on the cleaned word wins; only when there
    is none do we fall back to the first title word that contains ``word``.
    """
    for _, cleaned in title_entries:
        if cleaned.lower() == word:
            return cleaned
    for raw_lower, cleaned in title_entries:
        if word in raw_lower:
            return cleaned
    return ""


def edit_keywords(keywords, paper_title):
    """Filter model keywords for one title and restore the title's own casing.

    Args:
        keywords: Iterable of ``(phrase, score)`` pairs for this title.
        paper_title: The title the keywords were extracted from (any object;
            it is converted with ``str``).

    Returns:
        A list of ``[keyword, score_as_str]`` pairs without duplicates.

    Processing steps:
        1. An all-uppercase title is converted to title case.
        2. Keywords are dropped when they contain "et al", are directly
           attached to a hyphen in the title, score below 0.25, or have more
           than four words.
        3. Each remaining keyword word is replaced by the matching word of the
           title (punctuation and possessive suffixes removed).
        4. Every hyphenated word of the title is added with score 0.25.
        5. Single-word keywords that occur inside another keyword are removed.
    """
    paper_title = str(paper_title)

    # If all uppercase title, switch to title case
    if paper_title.isupper():
        paper_title = paper_title.title()

    lower_title = paper_title.lower()
    title_entries = [(title_word.lower(), _clean_title_word(title_word))
                     for title_word in paper_title.split()]

    final_keywords = []
    for keyword in keywords:
        phrase, score = keyword[0], keyword[1]

        # Remove "et al" keywords and fragments of hyphenated title words
        if _ET_AL_PATTERN.search(phrase):
            continue
        if (f'{phrase}-' in lower_title) or (f'-{phrase}' in lower_title):
            continue
        if score < _MIN_KEYWORD_SCORE:
            continue

        phrase_words = phrase.split()
        if len(phrase_words) > _MAX_KEYWORD_WORDS:
            continue

        # Map every keyword word back to the word as written in the title
        matched_words = []
        for word in phrase_words:
            title_word = _match_title_word(word, title_entries)
            if title_word:
                matched_words.append(title_word)
        if matched_words:
            final_keywords.append([" ".join(matched_words), str(score)])

    # Hyphenated words as a keyword
    for match in _HYPHENATED_PATTERN.findall(paper_title):
        final_keywords.append([match, str(_MIN_KEYWORD_SCORE)])

    # Getting rid of single words that are already part of another keyword
    single_keywords = [text for text, _ in final_keywords if " " not in text]
    redundant = {
        single
        for single in single_keywords
        for text, _ in final_keywords
        if single in text and single != text
    }

    # Always de-duplicate (first position kept, last score wins) so the result
    # has the same shape whether or not a single word had to be removed.
    unique_keywords = {}
    for text, score in final_keywords:
        if text not in redundant:
            unique_keywords[text] = score
    return [[text, score] for text, score in unique_keywords.items()]

In [12]:
def score_data(docs):
    """Extract and post-process keywords for a batch of titles.

    Args:
        docs: Sequence of title strings.

    Returns:
        A list with one entry per title, each entry being the
        ``edit_keywords`` result for that title. An empty ``docs`` yields ``[]``.
    """
    if len(docs) == 0:
        return []

    model_preds = kw_model.extract_keywords(docs=docs, vectorizer=KeyphraseCountVectorizer())

    # For exactly one document KeyBERT returns that document's keyword list
    # directly (a flat list of (phrase, score) tuples) instead of a list of
    # lists. Re-wrap it so the zip below pairs the whole list with the title.
    if len(docs) == 1 and (not model_preds or isinstance(model_preds[0], tuple)):
        model_preds = [model_preds]

    return [edit_keywords(preds, title) for preds, title in zip(model_preds, docs)]

In [13]:
def split_list(lst, n):
    """Lazily yield consecutive slices of ``lst``, each at most ``n`` items long."""
    chunk_starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in chunk_starts)

In [18]:
def get_all_keywords(filename, round_name, output_path=None, chunk_size=10000):
    """Extract keywords for every title of one parquet file and save them.

    Reads the ``paper_title`` column of ``filename``, scores the titles in
    chunks via ``score_data`` and writes a parquet file with the columns
    ``paper_title`` and ``keywords``.

    Args:
        filename: Path of the input parquet file. If it contains a
            ``/part-<id>-...`` component, ``<id>`` becomes the file id;
            otherwise the base name up to the first ``.`` is used.
        round_name: Label of the current run; part of the default output name.
        output_path: Optional explicit destination. Defaults to
            ``keywords_<round_name>_<file_id>.parquet`` so that processing
            several input files does not overwrite one shared output file.
            Pass ``"keywords.parquet"`` to get the previous fixed name.
        chunk_size: Number of titles handed to ``score_data`` per call.

    Returns:
        The path that was written, or ``None`` when the number of keyword
        lists does not match the number of titles (nothing is written then).
    """
    filename = str(filename)
    titles_to_keyword = pd.read_parquet(filename, columns=['paper_title'])['paper_title'].tolist()

    all_keywords_list = []
    for chunk in split_list(titles_to_keyword, chunk_size):
        all_keywords_list.extend(score_data(chunk))

    if len(all_keywords_list) != len(titles_to_keyword):
        print("####### Lengths do not match #######")
        return None

    if output_path is None:
        name_parts = filename.split("/part-")
        if len(name_parts) > 1:
            file_id = name_parts[1].split("-")[0]
        else:
            # No "/part-" component: fall back to the bare file name.
            file_id = filename.rsplit("/", 1)[-1].split(".")[0]
        output_path = f"keywords_{round_name}_{file_id}.parquet"

    pd.DataFrame(list(zip(titles_to_keyword, all_keywords_list)),
                 columns=['paper_title', 'keywords']).to_parquet(output_path)
    return output_path

In [15]:
# Init KeyBERT with the 'all-MiniLM-L6-v2' model. `score_data` reads this
# module-level `kw_model`, so this cell has to run before any scoring is done.
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

In [None]:
%%time
# Process every input file in turn, printing the current US/Eastern time (HH:MM)
# next to the file name so progress is visible in the cell output.
# NOTE(review): `all_work_files` and `round_name` are not defined in the visible
# cells — confirm they are set before this cell runs on a fresh kernel.
for file_name in all_work_files:
    dt_now = datetime.now(tz=pytz.timezone('US/Eastern')).strftime("%H:%M")
    print(dt_now, " ", file_name)
    _ = get_all_keywords(file_name, round_name)