In [27]:
import json
import re
import time
import requests
import pandas as pd
import numpy as np
import unicodedata
pd.set_option('max_colwidth', None)
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def name_to_keep_ind(groups):
    """
    Decide from its character groups whether a text is usable.

    Input:
    groups: list of character groups

    Output:
    0: if at least one group is on the skip list (text should not be used)
    1: if no group is on the skip list (text should be used)
    """
    # Character groups that do not perform well
    blocked_groups = ['HIRAGANA', 'CJK', 'KATAKANA', 'ARABIC', 'HANGUL', 'THAI', 'DEVANAGARI', 'BENGALI',
                      'THAANA', 'GUJARATI', 'CYRILLIC']

    for group in groups:
        if group in blocked_groups:
            return 0
    return 1

def remove_non_latin_characters(text):
    """
    Function to remove characters that belong to one of the skipped (non-latin) scripts.

    The script of a character is taken to be the first word of its Unicode name.
    Characters that have no Unicode name (for example control characters such as
    newline or tab) are dropped as well.

    Input:
    text: string of characters

    Output:
    final_char: string of characters with non-latin characters removed
    """
    groups_to_skip = ['HIRAGANA', 'CJK', 'KATAKANA','ARABIC', 'HANGUL', 'THAI','DEVANAGARI','BENGALI',
                      'THAANA','GUJARATI','CYRILLIC']
    final_char = []
    for char in text:
        try:
            script = unicodedata.name(char).split(" ")[0]
        except (ValueError, TypeError):
            # ValueError: the code point has no Unicode name.
            # TypeError: the element is not a single character.
            # Either way the element is dropped, without hiding unrelated errors
            # (a bare `except:` would also swallow KeyboardInterrupt).
            continue
        if script not in groups_to_skip:
            final_char.append(char)
    return "".join(final_char)
    
def group_non_latin_characters(text):
    """
    Function to group non-latin characters and return the number of latin characters.

    Periods and spaces are ignored. The group of a character is the first word of its
    Unicode name; characters without a Unicode name are reported as the group "UNK".

    Input:
    text: string of characters

    Output:
    groups: list of distinct non-LATIN character groups, in order of first appearance
    latin_chars: number of latin characters
    """
    groups = []
    latin_chars = 0
    for char in text.replace(".", "").replace(" ", ""):
        # Passing a default avoids the ValueError raised for unnamed code points,
        # so no blanket exception handler is needed.
        char_name = unicodedata.name(char, None)
        script = "UNK" if char_name is None else char_name.split(" ")[0]
        if script == 'LATIN':
            latin_chars += 1
        elif script not in groups:
            groups.append(script)
    return groups, latin_chars

def check_for_non_latin_characters(text):
    """
    Decide whether a text is usable given the scripts it contains.

    The text is usable when none of its character groups is on the skip list, or,
    failing that, when it still contains more than 20 latin characters.

    Input:
    text: string of characters (any other value is converted with str())

    Output:
    0: if text should be not used
    1: if text should be used
    """
    char_groups, n_latin = group_non_latin_characters(str(text))
    if name_to_keep_ind(char_groups) == 1 or n_latin > 20:
        return 1
    return 0

In [11]:
def clean_title(old_title):
    """
    Function to check if title should be kept and then remove non-latin characters. Also
    removes some HTML tags from the title.

    The title is processed only when check_for_non_latin_characters returns 1 for it and
    it is a string; otherwise an empty string is returned. A title that ends up entirely
    upper case is converted to title case.

    Input:
    old_title: string of title

    Output:
    new_title: string of title with non-latin characters and HTML tags removed
               ('' when the title is rejected)
    """
    keep_title = check_for_non_latin_characters(old_title)
    if (keep_title == 1) & isinstance(old_title, str):
        new_title = remove_non_latin_characters(old_title)
        if '<' in new_title:
            # Strip a fixed list of literal tags. The replacements run in the order written,
            # which matters where one pattern contains another (for example "<![CDATA[<B>"
            # is removed before the plain "<B>").
            new_title = new_title.replace("<i>", "").replace("</i>","")\
                                 .replace("<sub>", "").replace("</sub>","") \
                                 .replace("<sup>", "").replace("</sup>","") \
                                 .replace("<em>", "").replace("</em>","") \
                                 .replace("<b>", "").replace("</b>","") \
                                 .replace("<I>", "").replace("</I>", "") \
                                 .replace("<SUB>", "").replace("</SUB>", "") \
                                 .replace("<scp>", "").replace("</scp>", "") \
                                 .replace("<font>", "").replace("</font>", "") \
                                 .replace("<inf>","").replace("</inf>", "") \
                                 .replace("<i /> ", "") \
                                 .replace("<p>", "").replace("</p>","") \
                                 .replace("<![CDATA[<B>", "").replace("</B>]]>", "") \
                                 .replace("<italic>", "").replace("</italic>","")\
                                 .replace("<title>", "").replace("</title>", "") \
                                 .replace("<br>", "").replace("</br>","").replace("<br/>","") \
                                 .replace("<B>", "").replace("</B>", "") \
                                 .replace("<em>", "").replace("</em>", "") \
                                 .replace("<BR>", "").replace("</BR>", "") \
                                 .replace("<title>", "").replace("</title>", "") \
                                 .replace("<strong>", "").replace("</strong>", "") \
                                 .replace("<formula>", "").replace("</formula>", "") \
                                 .replace("<roman>", "").replace("</roman>", "") \
                                 .replace("<SUP>", "").replace("</SUP>", "") \
                                 .replace("<SSUP>", "").replace("</SSUP>", "") \
                                 .replace("<sc>", "").replace("</sc>", "") \
                                 .replace("<subtitle>", "").replace("</subtitle>", "") \
                                 .replace("<emph/>", "").replace("<emph>", "").replace("</emph>", "") \
                                 .replace("""<p class="Body">""", "") \
                                 .replace("<TITLE>", "").replace("</TITLE>", "") \
                                 .replace("<sub />", "").replace("<sub/>", "") \
                                 .replace("<mi>", "").replace("</mi>", "") \
                                 .replace("<bold>", "").replace("</bold>", "") \
                                 .replace("<mtext>", "").replace("</mtext>", "") \
                                 .replace("<msub>", "").replace("</msub>", "") \
                                 .replace("<mrow>", "").replace("</mrow>", "") \
                                 .replace("</mfenced>", "").replace("</math>", "")

            if '<mml' in new_title:
                # Split the title around "<mml:math" / "mml:math>" markers, dropping empty pieces.
                all_parts = [x for y in [i.split("mml:math>") for i in new_title.split("<mml:math")] for x in y if x]
                final_parts = []
                for part in all_parts:
                    # A piece with text sitting between '>' and '<' is treated as markup:
                    # only those text runs are kept, concatenated and padded with spaces.
                    if re.search(r"\>[$%#!^*\w.,/()+-]*\<", part):
                        pull_out = re.findall(r"\>[$%#!^*\w.,/()+-]*\<", part)
                        final_pieces = []
                        for piece in pull_out:
                            final_pieces.append(piece.replace(">", "").replace("<", ""))

                        final_parts.append(" "+ "".join(final_pieces) + " ")
                    else:
                        # Any other piece is kept unchanged.
                        final_parts.append(part)

                new_title = "".join(final_parts).strip()
            else:
                pass

            # The next three patterns remove an entire element (opening tag through closing
            # tag), but only match when no '/' occurs before the closing tag.
            if '<xref' in new_title:
                new_title = re.sub(r"\<xref[^/]*\/xref\>", "", new_title)

            if '<inline-formula' in new_title:
                new_title = re.sub(r"\<inline-formula[^/]*\/inline-formula\>", "", new_title)

            if '<title' in new_title:
                new_title = re.sub(r"\<title[^/]*\/title\>", "", new_title)

            # The remaining patterns remove only an opening tag that carries attributes.
            if '<p class=' in new_title:
                new_title = re.sub(r"\<p class=[^>]*\>", "", new_title)

            if '<span class=' in new_title:
                new_title = re.sub(r"\<span class=[^>]*\>", "", new_title)

            if 'mfenced open' in new_title:
                new_title = re.sub(r"\<mfenced open=[^>]*\>", "", new_title)

            if 'math xmlns' in new_title:
                new_title = re.sub(r"\<math xmlns=[^>]*\>", "", new_title)

        # Second pass over whatever still contains '<': fragments with swapped brackets
        # (">i<", ">/b<", ...) and bare inline-formula tags.
        if '<' in new_title:
            new_title = new_title.replace(">i<", "").replace(">/i<", "") \
                                 .replace(">b<", "").replace(">/b<", "") \
                                 .replace("<inline-formula>", "").replace("</inline-formula>","")
        # All-caps titles are converted to title case.
        if new_title.isupper():
            new_title = new_title.title()

        return new_title
    else:
        # Rejected by the script check, or not a string.
        return ''
    
def _rebuild_from_inverted_index(inverted_index, n_slots):
    """
    Rebuild text from an inverted index of the form {word: [positions]}.

    Each word is written to its positions in a buffer of n_slots entries; the buffer is
    joined with spaces and cut to 2500 characters. Positions that do not fit in the
    buffer (or are not integers) are skipped.
    """
    words = [" "] * n_slots
    for word, positions in inverted_index.items():
        for position in positions:
            try:
                words[position] = word
            except (IndexError, TypeError):
                # Best effort: ignore this occurrence and keep the rest of the text.
                continue
    return " ".join(words)[:2500]


def clean_abstract(raw_abstract, inverted=False):
    """
    Function to clean abstract and return it in a format for the model.

    Two inverted layouts are accepted when inverted=True:
      * {"IndexLength": n, "InvertedIndex": {word: [positions]}}
      * a bare {word: [positions]} mapping (a buffer of 1200 words is used)
    Either may be passed as a dict or as its JSON string.

    Input:
    raw_abstract: abstract as plain string, or as inverted index (dict / JSON string)
    inverted: boolean to determine if abstract is inverted index or not

    Output:
    final_abstract: abstract text cut to 2500 characters, or None when the abstract is
                    missing, malformed, too short (IndexLength / number of distinct words
                    of 20 or less, or plain text of 30 characters or less), or rejected by
                    check_for_non_latin_characters
    """
    if inverted:
        invert_abstract = raw_abstract
        if isinstance(invert_abstract, str):
            try:
                invert_abstract = json.loads(invert_abstract)
            except ValueError:
                # json.JSONDecodeError is a ValueError: an unparsable abstract is unusable.
                return None
        if not isinstance(invert_abstract, dict):
            # Covers None/NaN input as well as JSON that is not an object.
            return None

        ab_len = invert_abstract.get('IndexLength')
        if ab_len:
            inverted_index = invert_abstract.get('InvertedIndex')
            if ab_len <= 20 or not isinstance(inverted_index, dict):
                return None
            final_abstract = _rebuild_from_inverted_index(inverted_index, ab_len)
        else:
            if len(invert_abstract) <= 20:
                return None
            final_abstract = _rebuild_from_inverted_index(invert_abstract, 1200).strip()
    else:
        # Non-string values (e.g. NaN from a dataframe) have no usable text.
        if not isinstance(raw_abstract, str) or len(raw_abstract) <= 30:
            return None
        final_abstract = raw_abstract[:2500]

    if check_for_non_latin_characters(final_abstract) == 1:
        return final_abstract
    return None

In [12]:
def get_top_keywords(title, abstract, cand_embs_df):
    """
    Function to use title, abstract, and candidate keyword embeddings to return scores.

    The text is embedded with the module-level `emb_model`, which must be defined before
    this function is called with a non-empty title or abstract.

    Input:
    title: title of paper (None is treated as an empty title)
    abstract: abstract of paper (None or empty when unavailable)
    cand_embs_df: dataframe containing keywords and embeddings (filtered by paper topics)

    Output:
    copy of cand_embs_df with an added 'cand_scores' column: the dot product of each
    keyword embedding with the title/abstract embedding, or -1 for every row when there
    is no text to embed. The input dataframe is not modified.
    """
    cand_embs_df = cand_embs_df.copy()
    title = title or ""
    if title.isupper():
        title = title.title()

    if abstract:
        title_and_abstract = f"{title}\n {abstract}"
    else:
        title_and_abstract = title

    if title_and_abstract:
        # Get title/abstract embedding
        title_abs_emb = emb_model.encode(title_and_abstract)

        # Get scores for each candidate keyword
        cand_embs_df['cand_scores'] = cand_embs_df['embedding'].apply(
            lambda keyword_emb: np.dot(title_abs_emb, keyword_emb))
    else:
        # Nothing to embed: mark every candidate with a sentinel score below any real match.
        cand_embs_df['cand_scores'] = -1

    return cand_embs_df

In [13]:
def get_candidate_keywords(candidate_topics):
    """
    Select the candidate keywords that belong to a paper's topics.

    Reads the module-level `all_keywords_data` dataframe.

    Input:
    candidate_topics: topics of paper

    Output:
    keywords_data_copy: copy of the 'keywords' and 'embedding' columns for rows whose
                        'topic_id' is among the topics, one row per distinct keyword
    """
    topic_mask = all_keywords_data['topic_id'].isin(candidate_topics)
    topic_rows = all_keywords_data[topic_mask]
    unique_keywords = topic_rows.drop_duplicates(subset=['keywords'])
    keywords_data_copy = unique_keywords[['keywords', 'embedding']].copy()
    return keywords_data_copy

In [14]:
def get_all_keywords(candidate_topics, paper_title, abstract, invert_abstract=False, topk=5):
    """
    Function to get keywords that match title/abstract

    Input:
    candidate_topics: topic ids for a paper
    paper_title: title of a paper
    abstract: abstract of a paper
    invert_abstract: whether or not the abstract is being input as an inverted index (True/False)
    topk: maximum number of keywords to pull for a paper

    Output:
    final_keywords: list of {"keyword": ..., "score": ...} dicts, best score first.
                    Contains every keyword among the top `topk` whose score is above 0.50;
                    if there is none, the single best keyword when its score is above 0.40;
                    otherwise an empty list.
    """
    # Process title and abstract
    paper_title = clean_title(paper_title)
    abstract = clean_abstract(abstract, inverted=invert_abstract)

    # Get candidate keywords
    keywords_data = get_candidate_keywords(candidate_topics)
    if keywords_data.shape[0] == 0:
        return []

    # Get candidate scores; a negative score means there was no text to compare against
    cand_scores = get_top_keywords(paper_title, abstract, keywords_data)
    scored = cand_scores[cand_scores['cand_scores'] >= 0]
    if scored.shape[0] == 0:
        return []

    # .copy() so the lower-casing below writes to an independent frame, not a slice
    top_k = scored.sort_values('cand_scores', ascending=False).head(topk).copy()
    top_k['keywords'] = top_k['keywords'].apply(lambda x: x.lower())
    top_k = top_k.drop_duplicates(subset=['keywords'])
    keywords = top_k['keywords'].tolist()
    scores = top_k['cand_scores'].tolist()

    if not scores:
        # topk selected no rows (e.g. topk=0), so there is no best keyword to fall back on
        return []

    final_keywords = [{"keyword": keyword, "score": score}
                      for keyword, score in zip(keywords, scores) if score > 0.50]
    if final_keywords:
        return final_keywords

    # Nothing cleared the main threshold: accept the best keyword at a lower threshold
    if scores[0] > 0.40:
        return [{"keyword": keywords[0], "score": scores[0]}]
    return []

#### Code for testing (not needed in final predict.py file)

In [7]:
def get_paper_id_resp(paper_id):
    """
    Fetch the OpenAlex record for a work.

    Input:
    paper_id: work id without its "W" prefix (the prefix is added when building the URL)

    Output:
    parsed JSON body when the API answers with status 200; an empty list for any other
    status, or when the request fails or times out
    """
    open_req = f"https://api.openalex.org/works/W{str(paper_id)}"
    try:
        # Without a timeout, requests can block indefinitely on a stalled connection.
        resp = requests.get(open_req, timeout=30)
    except requests.RequestException:
        # Treat connection problems like any other failed lookup.
        return []
    if resp.status_code == 200:
        return resp.json()
    return []

In [8]:
def get_topics_for_paper(paper_id_resp):
    """
    Extract the topic ids of a paper, highest score first.

    Each entry of paper_id_resp['topics'] is expected to have an 'id' containing "/T"
    followed by the numeric topic id, and a 'score'.

    Input:
    paper_id_resp: response for a paper, as returned by get_paper_id_resp

    Output:
    list of integer topic ids sorted by descending score. Parsing stops at the first
    missing or malformed entry; topics parsed before it are still returned. An empty
    list is returned when no topic could be read.
    """
    scored_topics = []
    try:
        for topic in paper_id_resp['topics']:
            topic_id = int(topic['id'].split('/T')[1])
            scored_topics.append((topic['score'], topic_id))
    except (KeyError, TypeError, IndexError, ValueError, AttributeError):
        # Missing key, wrong container type, or an id that does not look like ".../T<number>".
        pass

    ranked = sorted(scored_topics, key=lambda pair: pair[0], reverse=True)
    return [topic_id for _, topic_id in ranked]

In [33]:
def test_function_to_simulate_data(paper_id):
    """
    Run the keyword pipeline for one paper, using data fetched from OpenAlex.

    Prints the time (in seconds) spent in get_all_keywords.

    Input:
    paper_id: work id without its "W" prefix

    Output:
    list of keyword/score dicts from get_all_keywords; an empty list when the paper
    could not be fetched
    """
    paper_id_resp = get_paper_id_resp(paper_id)
    if not paper_id_resp:
        # get_paper_id_resp returns [] when the lookup fails; indexing it would raise and
        # abort a whole dataframe .apply() run.
        return []

    paper_title = paper_id_resp.get('title')
    abstract = paper_id_resp.get('abstract_inverted_index')
    candidate_topics = get_topics_for_paper(paper_id_resp)

    first_time = time.time()
    all_keywords = get_all_keywords(candidate_topics, paper_title, abstract, invert_abstract=True, topk=5)
    print(time.time() - first_time)
    return all_keywords

In [34]:
%%time
# Smoke test: run the full pipeline for three OpenAlex work ids and time the whole cell.
# NOTE(review): this needs `emb_model` and `all_keywords_data`, which are assigned in
# cells further down the notebook - run those cells first.
for i in [2995104085,2492191525,2780733096]:
    print(test_function_to_simulate_data(i))

0.10874819755554199
[{'keyword': 'high-temperature applications', 'score': 0.5638906796476346}, {'keyword': 'solidification modeling', 'score': 0.5359686474935086}, {'keyword': 'thermochemical', 'score': 0.5137059869882088}]
0.23958754539489746
[{'keyword': 'right-wing extremism', 'score': 0.43486157318306873}]
0.0816946029663086
[{'keyword': 'judicial cooperation', 'score': 0.49244597941582136}]
CPU times: user 3.48 s, sys: 0 ns, total: 3.48 s
Wall time: 661 ms


### Examples straight from OpenAlex

In [17]:
# Embedding model used by get_top_keywords to encode the title/abstract text.
emb_model = SentenceTransformer('baai/BGE-M3')

In [19]:
# Kept for reference (disabled): presumably how the keyword embeddings were generated
# from the topics file - TODO confirm before re-enabling.
# topics = pd.read_parquet("topics_for_keyword_pred.parquet")
# topics_explode = topics[['topic_id','keywords']].explode('keywords').copy()
# all_keyword_embs = emb_model.encode(topics_explode['keywords'].tolist())
# topics_explode['embedding'] = [np.array(x) for x in all_keyword_embs.tolist()]
# topics_explode.to_parquet('current_keyword_embs.parquet')

# Keyword table read by get_candidate_keywords, which uses its 'topic_id', 'keywords'
# and 'embedding' columns.
all_keywords_data = pd.read_parquet('s3://openalex-keywords-matcher/v1/keywords_files/')

In [164]:
# Sample of works; the cells below read its 'paper_id' and 'abstract' columns.
examples = pd.read_parquet("abstracts_sample_file_single")

In [165]:
# Rebuild plain text from the inverted-index abstracts (None where clean_abstract rejects one).
examples['abstract_processed'] = examples['abstract'].apply(lambda x: clean_abstract(x, inverted=True))

In [166]:
# Fixed seed so the same 100 papers are drawn on every run of the notebook.
samples = examples.sample(100, random_state=42)

In [179]:
# One OpenAlex API request per sampled paper (via get_paper_id_resp), so this cell needs
# network access.
samples['keywords'] = samples['paper_id'].apply(test_function_to_simulate_data)

In [180]:
# Number of keywords returned for each paper.
samples['keywords_len'] = samples['keywords'].apply(len)

In [181]:
# Spot-check ten random rows (unseeded, so the rows shown differ between runs).
samples.sample(10)

Unnamed: 0,paper_id,original_title,abstract,abstract_processed,keywords,keywords_len
660,2995104085,Thermodynamic Simulation of Polycrystalline Silicon Chemical Vapor Deposition in Si–Cl–H System,"{""IndexLength"":206,""InvertedIndex"":{""Based"":[0],""on"":[1,91],""thermodynamic"":[2,72],""data"":[3],""for"":[4,35,45,109,121],""related"":[5],""pure"":[6],""substances,"":[7],""the"":[8,18,25,32,79,103,110,126,130,158,181,186,190,193,203],""relations"":[9],""of"":[10,27,74,106,175],""(nCl/nH)Eq"":[11],""and"":[12,29,64,153,170],""(nCl/nH)o"":[13],""have"":[14,161],""been"":[15,83,162],""plotted"":[16],""in"":[17,78,125,185,202],""Si–Cl–H"":[19,127],""system."":[20,128],""The"":[21,85,95,116,142],""results"":[22],""show"":[23],""that"":[24],""difference"":[26],""(nSi/nCl)o"":[28],""(nSi/nCl)Eq"":[30],""is"":[31,43,58,67,98,113,119,197],""driving"":[33],""force"":[34],""polycrystalline"":[36,46,75,92,122],""silicon"":[37,47,76,93,123,131,194],""chemical"":[38],""vapor"":[39],""deposition"":[40,48,132],""(CVD)."":[41],""SiHCl3"":[42,173],""preferred"":[44],""to"":[49,172,180],""SiCl4."":[50],""SiH2Cl2"":[51],""would"":[52],""be"":[53,148],""even"":[54],""better,"":[55],""but"":[56],""it"":[57,66],""not"":[59],""stable"":[60],""as"":[61,164],""a"":[62,88],""gas"":[63],""hence"":[65],""less"":[68],""frequently"":[69],""used."":[70],""Then,"":[71],""simulation"":[73],""CVD"":[77,124],""Si–H–Cl"":[80],""system"":[81],""has"":[82,87],""investigated."":[84],""pressure"":[86],""negative"":[89],""effect"":[90],""yield."":[94],""optimum"":[96,143,159,191],""temperature"":[97],""1400"":[99,165],""K,"":[100,166],""at"":[101],""which"":[102,177],""kinetic"":[104],""rate"":[105,133],""rate-determining"":[107],""step"":[108],""main"":[111],""reaction"":[112],""large"":[114],""enough."":[115],""excess"":[117],""hydrogen"":[118],""necessary"":[120],""However,"":[129],""increases"":[134],""then"":[135],""decreases"":[136],""with"":[137],""increasing"":[138],""H2"":[139,144,171],""molar"":[140,145],""fraction."":[141],""fraction"":[146],""should"":[147],""determined"":[1
49],""by"":[150],""considering"":[151],""thermodynamics"":[152],""transport"":[154],""phenomena"":[155],""simultaneously."":[156],""Finally,"":[157],""conditions"":[160],""obtained"":[163],""about"":[167],""0.1"":[168],""MPa,"":[169],""ratio"":[174,196],""15,"":[176],""are"":[178],""close"":[179],""limited"":[182],""reported"":[183,201],""values"":[184],""open"":[187,204],""literature."":[188,205],""Under"":[189],""conditions,"":[192],""yield"":[195],""34.82%"":[198],""against"":[199],""20%"":[200]}}","Based on thermodynamic data for related pure substances, the relations of (nCl/nH)Eq and (nCl/nH)o have been plotted in the Si–Cl–H system. The results show that the difference of (nSi/nCl)o and (nSi/nCl)Eq is the driving force for polycrystalline silicon chemical vapor deposition (CVD). SiHCl3 is preferred for polycrystalline silicon deposition to SiCl4. SiH2Cl2 would be even better, but it is not stable as a gas and hence it is less frequently used. Then, thermodynamic simulation of polycrystalline silicon CVD in the Si–H–Cl system has been investigated. The pressure has a negative effect on polycrystalline silicon yield. The optimum temperature is 1400 K, at which the kinetic rate of rate-determining step for the main reaction is large enough. The excess hydrogen is necessary for polycrystalline silicon CVD in the Si–Cl–H system. However, the silicon deposition rate increases then decreases with increasing H2 molar fraction. The optimum H2 molar fraction should be determined by considering thermodynamics and transport phenomena simultaneously. Finally, the optimum conditions have been obtained as 1400 K, about 0.1 MPa, and H2 to SiHCl3 ratio of 15, which are close to the limited reported values in the open literature. 
Under the optimum conditions, the silicon yield ratio is 34.82% against 20% reported in the open literature.","[{'keyword': 'high-temperature applications', 'score': 0.5638905879113437}, {'keyword': 'solidification modeling', 'score': 0.53596854140221}, {'keyword': 'thermochemical', 'score': 0.513705947877374}]",3
1173,560436365,細管式等速電気泳動による血液(ヘモグロビン)の種属鑑別,,,[],0
86,2492191525,11. Demonstrationsdelikte im Kontext rechtsextremer Aufmärsche.,,,"[{'keyword': 'right-wing extremism', 'score': 0.4348615239197596}]",1
212,427190563,A-4-63 前処理を用いたステレオエコーキャンセラの収束条件,,,[],0
1087,2780733096,Sussidiarietà penale e sussidiarietà comunitaria,,,"[{'keyword': 'judicial cooperation', 'score': 0.49244595404353453}]",1
1097,2329033315,"Christian Sapin, dir. — Peindre à Auxerre au Moyen Âge, IX-XIVe siècles. 10 ans de recherches à l'abbaye Saint- Germain et à la cathédrale Saint-Étienne d'Auxerre, dir. Ch. Sapin. paris, CTHS, 1999 (Mémoire de la section d'archéologie et d'histoire de l'art, VII)",,,"[{'keyword': 'early medieval churches', 'score': 0.45441234722167256}]",1
272,2043568223,"Genecology of Holodiscus discolor (Rosaceae) in the Pacific Northwest, U.S.A.","{""IndexLength"":177,""InvertedIndex"":{""An"":[0],""important"":[1],""goal"":[2],""for"":[3,172],""land"":[4],""managers"":[5],""is"":[6],""the"":[7,32,57,80,96,100,139,160],""incorporation"":[8],""of"":[9,70,87,95,138,156],""appropriate"":[10],""(e.g.,"":[11],""locally"":[12],""adapted"":[13],""and"":[14,21,40,61,67,90,132,174],""genetically"":[15],""diverse)"":[16],""plant"":[17],""materials"":[18],""in"":[19,34,37,59,99,141],""restoration"":[20,173],""revegetation"":[22,175],""activities."":[23,176],""To"":[24],""identify"":[25],""these"":[26],""materials,"":[27],""researchers"":[28],""need"":[29],""to"":[30,46,55,65,111],""characterize"":[31,56],""variability"":[33,58,98,140],""essential"":[35],""traits"":[36,63,92],""natural"":[38],""populations"":[39],""determine"":[41],""how"":[42],""they"":[43],""are"":[44],""related"":[45],""environmental"":[47,112],""conditions."":[48],""This"":[49],""common"":[50],""garden"":[51],""study"":[52],""was"":[53,109],""implemented"":[54],""growth"":[60,89],""phenological"":[62,91],""relative"":[64],""climatic"":[66],""geographic"":[68],""variables"":[69],""39"":[71],""Holodiscus"":[72],""discolor"":[73],""(Pursh)"":[74],""Maxim."":[75],""accessions"":[76],""from"":[77],""locations"":[78],""throughout"":[79],""Pacific"":[81,161],""Northwest,"":[82],""U.S.A."":[83],""Principal"":[84],""component"":[85,103],""analysis"":[86,119,149],""12"":[88],""explained"":[93,136],""48.2%"":[94],""observed"":[97],""first"":[101],""principal"":[102],""(PC-1)."":[104],""With"":[105],""multiple"":[106],""regressions,"":[107],""PC-1"":[108,142],""compared"":[110],""values"":[113],""at"":[114],""each"":[115],""source"":[116],""location."":[117],""Regression"":[118],""identified"":[120,154],""a"":[121],""four-variable"":[122],""model"":[123,153],""containing"":[124],""elevation,"":[125],""minimum"":[126],""January"":[127],""temperature,"":[128,131],""maxi
mum"":[129],""October"":[130],""February"":[133],""precipitation"":[134],""that"":[135,163],""86%"":[137],""(r2="":[143],""0.86,"":[144],""p"":[145],""\u003c"":[146],""0.0001)."":[147],""Spatial"":[148],""using"":[150],""this"":[151],""regression"":[152],""patterns"":[155],""genetic"":[157],""diversity"":[158],""within"":[159],""Northwest"":[162],""can"":[164],""help"":[165],""guide"":[166],""germplasm"":[167],""selection"":[168],""(i.e.,"":[169],""seed"":[170],""collections)"":[171]}}","An important goal for land managers is the incorporation of appropriate (e.g., locally adapted and genetically diverse) plant materials in restoration and revegetation activities. To identify these materials, researchers need to characterize the variability in essential traits in natural populations and determine how they are related to environmental conditions. This common garden study was implemented to characterize the variability in growth and phenological traits relative to climatic and geographic variables of 39 Holodiscus discolor (Pursh) Maxim. accessions from locations throughout the Pacific Northwest, U.S.A. Principal component analysis of 12 growth and phenological traits explained 48.2% of the observed variability in the first principal component (PC-1). With multiple regressions, PC-1 was compared to environmental values at each source location. Regression analysis identified a four-variable model containing elevation, minimum January temperature, maximum October temperature, and February precipitation that explained 86% of the variability in PC-1 (r2= 0.86, p < 0.0001). Spatial analysis using this regression model identified patterns of genetic diversity within the Pacific Northwest that can help guide germplasm selection (i.e., seed collections) for restoration and revegetation activities.","[{'keyword': 'ecological characteristics', 'score': 0.519444312947291}, {'keyword': 'phylogeny', 'score': 0.5161767311191638}]",2
696,3026401810,"Sastra Kabanti: Pengertian, Jenis, dan Fungsi","{""IndexLength"":125,""InvertedIndex"":{""Seni"":[0],""pertunjukan"":[1],""kabanti"":[2,16,58,61,76,79,113],""seakan"":[3],""telah"":[4],""menjadi"":[5],""ikon"":[6],""kesusastraan"":[7],""masyarakat"":[8,20],""Buton."":[9],""Dibanding"":[10],""dengan"":[11,56,75],""jenis"":[12],""kesusatraan"":[13],""lain,"":[14],""sastra"":[15],""lebih"":[17,69],""dikenal"":[18],""oleh"":[19],""luas."":[21],""Kabanti"":[22,49],""merupakan"":[23],""senandungan"":[24],""tentang"":[25,116,120],""tuntunan"":[26,88],""dan"":[27,30,85,106,123],""falsafah"":[28],""hidup"":[29,89],""juga"":[31,44],""sebagai"":[32],""media"":[33],""pengungkapan"":[34],""perasaan"":[35,118],""berkembang"":[36],""tidak"":[37],""hanya"":[38],""di"":[39,45,52,64,71],""lingkungan"":[40,47,53,66],""keraton"":[41,54,59,67,80],""Buton"":[42],""tetapi"":[43],""luar"":[46,65],""keraton."":[48],""yang"":[50,62,90,110],""digelar"":[51,63,81,98],""disebut"":[55,74],""nama"":[57],""sedangkan"":[60,95],""atau"":[68],""tepatnya"":[70],""daerah-daerah"":[72],""pesisir"":[73,97,114],""pesisir."":[77],""Jenis"":[78],""pada"":[82,99],""acara-acara"":[83,100,107],""keagamaan"":[84],""menyandungkan"":[86],""petuah-petuah"":[87],""dilandasi"":[91],""ajaran"":[92],""agama"":[93],""Islam,"":[94],""labanti"":[96],""sosial"":[101],""seperti"":[102],""pesta"":[103],""pernikahan,"":[104],""sunatan,"":[105],""syukuran."":[108],""Informasi"":[109],""disampaikan"":[111],""melalui"":[112],""umumnya"":[115],""ungkapan"":[117],""(misalnya"":[119],""cinta,"":[121],""kasih"":[122],""kerinduan)."":[124]}}","Seni pertunjukan kabanti seakan telah menjadi ikon kesusastraan masyarakat Buton. Dibanding dengan jenis kesusatraan lain, sastra kabanti lebih dikenal oleh masyarakat luas. Kabanti merupakan senandungan tentang tuntunan dan falsafah hidup dan juga sebagai media pengungkapan perasaan berkembang tidak hanya di lingkungan keraton Buton tetapi juga di luar lingkungan keraton. 
Kabanti yang digelar di lingkungan keraton disebut dengan nama kabanti keraton sedangkan kabanti yang digelar di luar lingkungan keraton atau lebih tepatnya di daerah-daerah pesisir disebut dengan kabanti pesisir. Jenis kabanti keraton digelar pada acara-acara keagamaan dan menyandungkan petuah-petuah tuntunan hidup yang dilandasi ajaran agama Islam, sedangkan labanti pesisir digelar pada acara-acara sosial seperti pesta pernikahan, sunatan, dan acara-acara syukuran. Informasi yang disampaikan melalui kabanti pesisir umumnya tentang ungkapan perasaan (misalnya tentang cinta, kasih dan kerinduan).","[{'keyword': 'indonesian culture', 'score': 0.4073959710749505}]",1
53,2339540393,"男儿不磨炼,器局安足奇——从郑淑昭的诗作看其子成才之因","{""IndexLength"":1,""InvertedIndex"":{""郑淑昭是西南硕儒郑珍之女,出生于书香门第,自幼聪明好学,熟读《列女传》,歆慕汉史学家班昭,能文能诗。郑淑昭23岁嫁给父亲的学生赵廷璜,育有三子一女。她亲课子女学业,品端行正,以身为范,宽严相济,因材施教,注重环境对子女的影响,寓教于劳,敦敦教导,教子有道,三个儿子最终皆学有所成,声名远播。"":[0]}}",,[],0
1585,2803055043,Aortic Aneurysm in Elderly Patients,"{""IndexLength"":64,""InvertedIndex"":{""The"":[0,31],""appropriateness"":[1],""of"":[2,8,38,48,57],""endovascular"":[3],""abdominal"":[4,10],""aortic"":[5,11],""repair"":[6],""(EVAR)"":[7],""uncomplicated"":[9],""aneurysm"":[12],""(AAA)"":[13],""is"":[14],""dependent"":[15],""on"":[16],""the"":[17,36,55,58],""risk/benefit"":[18],""ratio,"":[19],""particularly"":[20],""in"":[21,51,61],""patients"":[22,39],""\u003e80"":[23,40],""years"":[24,41],""old"":[25,42],""with"":[26],""possible"":[27],""short"":[28],""life"":[29],""expectancy."":[30],""aim"":[32],""was"":[33],""to"":[34,53],""evaluate"":[35],""survival"":[37],""after"":[43],""EVAR"":[44,59],""and"":[45],""analyse"":[46,54],""predictors"":[47],""late"":[49],""mortality,"":[50],""order"":[52],""efficacy"":[56],""treatment"":[60],""these"":[62],""patients."":[63]}}","The appropriateness of endovascular abdominal aortic repair (EVAR) of uncomplicated abdominal aortic aneurysm (AAA) is dependent on the risk/benefit ratio, particularly in patients >80 years old with possible short life expectancy. The aim was to evaluate the survival of patients >80 years old after EVAR and analyse predictors of late mortality, in order to analyse the efficacy of the EVAR treatment in these patients.","[{'keyword': 'aortic aneurysm', 'score': 0.8329388581185999}, {'keyword': 'thoracic aortic aneurysms', 'score': 0.753976901599346}, {'keyword': 'aortic dissection', 'score': 0.6784362547183966}, {'keyword': 'aneurysm rupture', 'score': 0.6756348164423072}, {'keyword': 'aortic root replacement', 'score': 0.6729291721100504}]",5


In [182]:
# How many papers received 0, 1, 2, ... keywords.
samples['keywords_len'].value_counts()

keywords_len
1    40
0    21
2    12
3    10
5     9
4     8
Name: count, dtype: int64

In [183]:
# Same distribution, restricted to papers without an abstract.
samples[samples['abstract'].isnull()]['keywords_len'].value_counts()

keywords_len
1    18
0    14
2     6
3     5
5     4
4     3
Name: count, dtype: int64

In [184]:
# Same distribution, restricted to papers that have an abstract.
samples[~samples['abstract'].isnull()]['keywords_len'].value_counts()

keywords_len
1    22
0     7
2     6
3     5
4     5
5     5
Name: count, dtype: int64

In [188]:
# Spot-check ten random papers without an abstract (unseeded sample).
samples[samples['abstract'].isnull()].sample(10)[['paper_id','original_title','abstract_processed','keywords']]

Unnamed: 0,paper_id,original_title,abstract_processed,keywords
2220,2793828623,Doce lecciones : comentarios a los acontecimientos de Hungria / [José Miguel de Azaola].,,"[{'keyword': 'historiography', 'score': 0.4431596018212326}]"
299,2589565674,"Creativity, Exploration and Control in Musical Parameter Spaces.",,"[{'keyword': 'musical performance', 'score': 0.5948892984307431}, {'keyword': 'sound synthesis', 'score': 0.5936375563721186}, {'keyword': 'music generation', 'score': 0.5902165441787353}, {'keyword': 'digital musical instruments', 'score': 0.558197739792075}, {'keyword': 'acoustic ecology', 'score': 0.556107683092431}]"
1442,2104376796,Eine genaue Mikrobestimmungsmethode für Arsen in biologischem Material,,"[{'keyword': 'scanning electrochemical microscopy', 'score': 0.5208396966097735}, {'keyword': 'anodic stripping voltammetry', 'score': 0.5188365755426492}, {'keyword': 'detection', 'score': 0.5174081585205104}, {'keyword': 'heavy metal ions', 'score': 0.5082847403643969}]"
503,1541307873,Ocena nierównomierności nacisków w hamulcach wielotarczowych,,"[{'keyword': 'marine engine diagnostics', 'score': 0.5327456006205008}, {'keyword': 'sensitivity analysis', 'score': 0.5062124892476537}, {'keyword': 'vehicle technical condition', 'score': 0.5035523748246806}]"
2357,1510515863,A study of the overoxidation of the conducting polymer polypyrrole,,"[{'keyword': 'conducting polymers', 'score': 0.6650822905053988}, {'keyword': 'polyaniline', 'score': 0.5071796545420042}]"
86,2492191525,11. Demonstrationsdelikte im Kontext rechtsextremer Aufmärsche.,,"[{'keyword': 'right-wing extremism', 'score': 0.4348615239197596}]"
320,4212796861,Schnelltest detektiert Acinetobacter zuverlässig in Thrombozytenkonzentraten,,"[{'keyword': 'acinetobacter baumannii', 'score': 0.4922739330281245}]"
1670,3186108928,「個我」與「大我」以雙文化自我觀點建構台灣大學生生涯敘說,,[]
1029,4243319117,Specification for Copper Alloys in Ingot Form,,"[{'keyword': 'aluminum alloy', 'score': 0.4852800507574163}]"
2144,4233280353,Cartography and the Mind of Man,,"[{'keyword': 'crowdsourced mapping', 'score': 0.47493198735342546}]"


### Exploring keywords

In [56]:
def transform_keyword_for_normalization(keyword):
    """
    Function to build the normalized key used to group keyword spellings.

    Input:
    keyword: raw keyword string

    Output:
    normalized_keyword: lowercased keyword with every hyphen and space removed
                        and, if the result ends in 's', that one character dropped
    """
    collapsed = keyword.lower()
    # Hyphens and spaces are both treated as separators and removed entirely.
    for separator in ("-", " "):
        collapsed = collapsed.replace(separator, "")

    # Only a single trailing 's' is removed, whether or not the word is a plural
    # (e.g. "phagocytosis" becomes "phagocytosi").
    if collapsed.endswith('s'):
        return collapsed[:-1]
    return collapsed

In [103]:
def get_final_keyword_key(display_name):
    """
    Function to turn a display name into its hyphen-separated keyword id.

    Input:
    display_name: display name of the keyword

    Output:
    keyword id: lowercased display name whose words are joined by single hyphens
                (e.g. "Early-warning Signals" -> "early-warning-signals")
    """
    # Existing hyphens count as word separators. Splitting on whitespace runs
    # (instead of collapsing double spaces once) guarantees that inputs such as
    # "a - b" or "a   b" never produce doubled hyphens, and that leading or
    # trailing whitespace never produces a leading or trailing hyphen.
    words = display_name.lower().replace("-", " ").split()
    return "-".join(words)

In [86]:
def get_final_display_name(name_options):
    """
    Function to pick the display name for a group of keyword spellings.

    Input:
    name_options: list of raw keyword strings that share one normalized key

    Output:
    final_name: the preferred spelling, or "" if name_options is empty

    Candidates are ranked by, in order of priority:
      1. length (so a plural form wins over its singular),
      2. whether the name contains a hyphen,
      3. number of uppercase characters.
    When several candidates rank equally, the earliest one in the list is kept.
    The ranking looks at each candidate on its own, so the result does not
    depend on the order in which the spellings are listed.
    """
    if len(name_options) == 0:
        return ""

    def _rank(name):
        # Tuple comparison gives the lexicographic priority described above.
        return (len(name), '-' in name, sum(1 for char in name if char.isupper()))

    # max() returns the first maximal element, so ties keep the earliest option.
    return max(name_options, key=_rank)

In [79]:
# Count how many distinct raw spellings map to each normalized key (top 50).
# NOTE(review): this reads the 'norm_keywords' column, which is assigned in a
# cell that appears further down in the notebook; run that cell first.
(
    all_keywords_data
    .drop_duplicates(subset='keywords')
    ['norm_keywords']
    .value_counts()
    .reset_index()
    .head(n=50)
)

Unnamed: 0,norm_keywords,count
0,antiinflammatoryeffect,5
1,environmentalimpact,5
2,culturallandscape,4
3,socialscience,4
4,antioxidant,4
5,indigenouspeople,4
6,infection,4
7,informationsystem,3
8,ionchannel,3
9,executivefunction,3


In [57]:
# Add the normalized grouping key for every raw keyword. Any cell that reads
# 'norm_keywords' needs this assignment to have run first.
all_keywords_data['norm_keywords'] = all_keywords_data['keywords'].apply(transform_keyword_for_normalization)

In [74]:
# Show every distinct raw spelling that collapses to one normalized key.
(
    all_keywords_data
    .loc[all_keywords_data['norm_keywords'] == 'environmentalimpact']
    .drop_duplicates(subset='keywords')
    [['keywords', 'norm_keywords']]
)

Unnamed: 0,keywords,norm_keywords
31,Environmental Impact,environmentalimpact
303,Environmental Impacts,environmentalimpact
844,environmental impact,environmentalimpact
1781,Environmental impact,environmentalimpact
2158,environmental impacts,environmentalimpact


In [89]:
# One row per normalized key, holding the list of distinct raw spellings for it.
testing_final_display_name = (
    all_keywords_data
    .drop_duplicates(subset='keywords')
    .groupby('norm_keywords')['keywords']
    .agg(list)
    .reset_index()
)

In [90]:
# Choose one display spelling per normalized key from its list of raw spellings.
testing_final_display_name['display_name'] = testing_final_display_name['keywords'].apply(get_final_display_name)

In [92]:
# Expand the spelling lists back to one row per raw keyword, so each spelling is
# paired with the display_name chosen for its group.
final_display_name = testing_final_display_name.explode('keywords')

In [96]:
# Spot-check 20 random keywords whose chosen display name differs from the raw
# spelling. No random_state is set, so the sample changes on every run.
(
    final_display_name
    .loc[lambda frame: frame['display_name'] != frame['keywords']]
    .sample(n=20)
)

Unnamed: 0,norm_keywords,keywords,display_name
13887,macroeconomic,macroeconomics,Macroeconomics
13419,lfunction,L-functions,L-Functions
5855,developmentalperspective,Developmental Perspective,Developmental Perspectives
9461,genetic,Genetic,Genetics
5428,cytokineresponse,Cytokine Response,Cytokine Responses
15984,nanowire,Nanowire,Nanowires
10681,herbalmedicine,Herbal Medicine,Herbal Medicines
5701,democratization,democratization,Democratization
5702,demographicchange,Demographic Change,Demographic Changes
18214,phagocytosi,phagocytosis,Phagocytosis


In [105]:
# Peek at two random rows of the keyword table (no random_state, so the rows
# differ on each run).
all_keywords_data.sample(2)

Unnamed: 0,topic_id,keywords,embedding,norm_keywords
3662,12380,Language Use,"[-0.006248993333429098, 0.008845660835504532, -0.01615484617650509, 0.008175925351679325, -0.029452728107571602, -0.006868328433483839, 0.03864437714219093, -0.0030936303082853556, -0.01212499849498272, -0.037102360278367996, 0.0031116523314267397, -0.0005261495243757963, -0.013991327956318855, -0.01956641487777233, 0.011280806735157967, -0.027716435492038727, -0.018707947805523872, -0.018916543573141098, -0.02811572700738907, -0.017398526892066002, 0.026372473686933517, -0.03686060011386871, 0.056460220366716385, 0.020736893638968468, 0.029440060257911682, 0.017100997269153595, -0.022327957674860954, -0.039698388427495956, 0.03694598376750946, 0.031937126070261, 0.01896175928413868, 0.0009548076195642352, -0.003883135737851262, -0.04341255500912666, -0.027448663488030434, -0.023881709203124046, 0.0010079983621835709, 0.03506425395607948, -0.07612857222557068, 0.025214383378624916, -0.016610121354460716, -0.013217698782682419, -0.026326145976781845, 0.007642971817404032, 0.029276158660650253, -0.02761874906718731, -0.013838553801178932, -0.012769322842359543, -0.03535623103380203, 0.017231522127985954, 0.004302250687032938, 0.01409862656146288, 0.07538896799087524, -0.02690298855304718, 0.020862950012087822, 0.03409605100750923, -0.02214181236922741, 0.012346716597676277, -0.11162769049406052, -0.0059953960590064526, 0.0037148611154407263, -0.023537710309028625, -0.02836988866329193, -0.002456121612340212, 0.001743837259709835, 0.05477321520447731, 0.0077205924317240715, -0.006779720075428486, -0.0137161361053586, -0.033170975744724274, -0.028857827186584473, 0.024321112781763077, -0.00033033653744496405, 0.0018231753492727876, -0.0624212771654129, 0.018395844846963882, 0.0075958059169352055, -0.010724443010985851, 0.005885151214897633, 0.04309176281094551, 0.028405319899320602, 0.018750227987766266, 0.07195410877466202, 0.030102001503109932, 0.006905003450810909, 0.047721017152071, -0.018437175080180168, -0.0216593649238348, 
-0.011057544499635696, -0.04210004210472107, -0.010156959295272827, -0.02902982197701931, 0.04995148256421089, -0.02243606373667717, -0.005404568277299404, 0.02306358888745308, 0.002156441565603018, 0.03409701958298683, 0.003906681202352047, -0.010732094757258892, ...]",languageuse
3364,13377,Early-warning Signals,"[-0.020272834226489067, 0.023038744926452637, -0.03902260959148407, 0.011662645265460014, -0.03672855347394943, -0.02471904084086418, 0.0163130946457386, 0.048919692635536194, -0.03360806405544281, 0.024144884198904037, 0.019368639215826988, -0.0012810624903067946, -0.01722915656864643, 0.002881556749343872, 0.02621052786707878, -0.09136362373828888, -0.038993965834379196, 0.006396174430847168, -0.03885417804121971, 0.008368096314370632, -0.022387273609638214, -0.003619459690526128, 0.02272907644510269, 0.02469249628484249, 0.02845045179128647, 0.035668663680553436, -0.0008901162655092776, -0.04272786155343056, -0.019253378733992577, 0.04547995701432228, -0.016771430149674416, -0.018706275150179863, 0.004556451924145222, -0.032376550137996674, -0.027554268017411232, -0.008984275162220001, -0.00037852564128115773, 0.010536681860685349, -0.13373348116874695, 0.02858130820095539, -0.01658102683722973, -0.008294988423585892, 0.0150247598066926, -0.04855076223611832, 0.029269693419337273, -0.015940163284540176, -0.029077038168907166, -0.01414341852068901, 0.01960643194615841, -0.05229971930384636, 0.0007223961292766035, -0.013082755729556084, 0.018507380038499832, -0.006726455874741077, 0.013205704279243946, 0.001084921765141189, -0.03286919370293617, -0.002423306228592992, -0.030759602785110474, -0.07029787451028824, -0.04355090856552124, 0.02852787636220455, -0.01836063712835312, -0.020988885313272476, 0.017624223604798317, 0.01251109316945076, 0.021802235394716263, -0.011098591610789299, -0.011547771282494068, -0.025542430579662323, 0.01639663055539131, -0.02388620935380459, -0.038383446633815765, 0.011915106326341629, -0.0711028054356575, 0.02569427341222763, 0.03601229935884476, -0.03139488026499748, 0.030198991298675537, 0.0085576381534338, 0.07634500414133072, -0.025008441880345345, -0.002595364348962903, 0.020316123962402344, 0.03948706388473511, 0.07872848212718964, 0.007445907220244408, 0.015187636949121952, 
-0.022832348942756653, 0.006783888675272465, -0.007115784101188183, -0.0350264310836792, 0.0326605960726738, 0.004833535756915808, -0.017538920044898987, -0.016247639432549477, -0.03750038892030716, -0.004634861834347248, 0.018344514071941376, 0.021650807932019234, ...]",earlywarningsignal


In [100]:
# Attach the chosen display_name to every keyword row. The right-hand side is
# de-duplicated first, and the inner join keeps only keywords that appear in
# both frames.
new_all_keywords_data = pd.merge(
    all_keywords_data,
    final_display_name[['keywords', 'display_name']].drop_duplicates(),
    on='keywords',
    how='inner',
)

In [104]:
# Derive the keyword_id of every row from its display_name.
new_all_keywords_data['keyword_id'] = new_all_keywords_data['display_name'].apply(get_final_keyword_key)

In [110]:
# Save the distinct (keyword_id, display_name) pairs to a parquet file; the
# relative path means it is written to the current working directory.
new_all_keywords_data[['keyword_id','display_name']].drop_duplicates().to_parquet('keywords_for_casey.parquet')

In [107]:
# Spot-check 20 random rows of the id / display-name mapping (the sample is not
# seeded, so it changes on every run).
(
    new_all_keywords_data
    .loc[:, ['topic_id', 'keyword_id', 'display_name', 'keywords']]
    .sample(n=20)
)

Unnamed: 0,topic_id,keyword_id,display_name,keywords
26810,12712,fascial-plasticity,Fascial Plasticity,Fascial Plasticity
24466,14344,dopamine-agonists,Dopamine Agonists,Dopamine Agonists
4401,10062,metastasis,Metastasis,Metastasis
39428,11115,morphology-based-filters,Morphology-based Filters,Morphology-based Filters
27554,10784,sensory-feedback,Sensory Feedback,Sensory Feedback
32059,11707,eye-tracking,Eye Tracking,Eye Tracking
35498,13825,immigration-policies,Immigration Policies,Immigration Policies
43383,10221,shear-deformation-theory,Shear Deformation Theory,Shear Deformation Theory
23879,12056,approximate-bayesian-computation,Approximate Bayesian Computation,Approximate Bayesian Computation
10192,10546,quality-assurance,Quality Assurance,Quality Assurance


In [106]:
# Write the topic/keyword table, including the embedding column, to parquet in
# the current working directory.
new_all_keywords_data[['topic_id','keyword_id','display_name','keywords','embedding']] \
    .to_parquet('latest_keywords_file.parquet')

In [113]:
# Load "latest_keywords_file.parquet" (the file name used by the export cell
# above) to check the written contents.
new_file = pd.read_parquet("latest_keywords_file.parquet")

In [115]:
# Show one random row of the reloaded file.
new_file.sample(1)

Unnamed: 0,topic_id,keyword_id,display_name,keywords,embedding
19571,13955,entropy-weight,Entropy Weight,Entropy Weight,"[0.0026810334529727697, 0.004703668877482414, -0.007654732093214989, -0.008646409027278423, -0.03681402653455734, -0.07908197492361069, 0.0198382381349802, -0.002426604274660349, -0.0023849685676395893, -0.017848581075668335, 0.012804345227777958, 0.008585539646446705, -0.009734250605106354, -0.003086685435846448, 0.030598612502217293, -0.01647077314555645, -0.015609859488904476, -0.015078515745699406, -0.06924502551555634, -0.00593704404309392, -0.01282799057662487, -0.023465177044272423, 0.009105665609240532, 0.01413060538470745, 0.005108551122248173, 0.05292129889130592, -0.020420623943209648, 0.010411563329398632, 0.017799820750951767, -0.006627588998526335, 0.028011299669742584, 0.0054331207647919655, 0.009877088479697704, -0.05361875519156456, -0.010135981254279613, -0.021687058731913567, 0.03789178282022476, -0.01798289082944393, -0.05849285423755646, 0.010320303030312061, 0.02150595933198929, -0.0587431937456131, -0.03318381682038307, -0.055787794291973114, 0.04692681506276131, -0.04903823882341385, 0.004611814394593239, -0.04053961858153343, -0.050500571727752686, -0.025555351749062538, -0.02603253908455372, -0.003764048218727112, 0.03116334043443203, 0.013714722357690334, -0.0062173279002308846, 0.029825085774064064, -0.05439801514148712, -0.02367313764989376, -0.062439143657684326, -0.06306125223636627, -0.020475858822464943, 0.02240213379263878, 0.012150653637945652, 0.023572096601128578, 0.0149641502648592, 0.01923186145722866, -0.016474340111017227, 0.026129042729735374, 0.004714925307780504, 0.021808037534356117, -0.00037086880183778703, -0.022293612360954285, -0.029507363215088844, 0.02769860066473484, -0.06216927617788315, 0.017519567161798477, -0.03485232964158058, -0.030676495283842087, 0.0017207617638632655, 0.023816987872123718, 0.08586268126964569, 0.004299593158066273, -0.017042577266693115, 0.030130436643958092, -0.0073374370113015175, 0.07074105739593506, -0.032510049641132355, 
-0.056233249604701996, -0.03915371000766754, -0.06902778148651123, -0.028515709564089775, 0.01412420067936182, -0.0016620381502434611, 0.0010150232119485736, -0.005887164734303951, 0.01973532885313034, -0.03804684057831764, 0.015048989094793797, 0.023880621418356895, 0.006462787743657827, ...]"


In [125]:
# Build a raw-keyword -> keyword_id mapping (first occurrence of each keyword
# wins) and look up a single example entry.
(
    new_file[['keywords', 'keyword_id']]
    .drop_duplicates(subset=['keywords'])
    .set_index('keywords')
    .to_dict()
    ['keyword_id']
    ['Imaging']
)

'imaging'

In [111]:
# Load the keywords parquet stored under the deployment model-artifacts directory.
# NOTE(review): presumably the currently deployed keyword file, loaded for
# comparison with new_file; confirm.
curr_file = pd.read_parquet("./003_Deployment/model_to_api/container/model_artifacts/model/keywords_latest.parquet")