In [4]:
import sys  
# Put the parent directory first on the module search path so that the
# `src.` package imported by later cells can be resolved from this notebook.
sys.path.insert(0, '../')

In [86]:
from src.datamodules.apis.semantic_scholar_api import SemanticScholarAPI

# Identifier of the author whose publications are analysed
author_id = 1747893
# Client used to query the Semantic Scholar API
api = SemanticScholarAPI()
# Per-paper fields requested alongside the author record
paper_fields = ['papers.title', 'papers.abstract', 'papers.year']
# Retrieve the author record and keep only its list of papers
papers = api.author(author_id, fields=paper_fields)['papers']
papers

[{'paperId': '1631a6e3b5235a4a2b0d71dca22a9614acca2ce3',
  'title': 'Data-to-text Generation with Macro Planning',
  'abstract': 'Abstract Recent approaches to data-to-text generation have adopted the very successful encoder-decoder architecture or variants thereof. These models generate text that is fluent (but often imprecise) and perform quite poorly at selecting appropriate content and ordering it coherently. To overcome some of these issues, we propose a neural model with a macro planning stage followed by a generation stage reminiscent of traditional methods which embrace separate modules for planning and surface realization. Macro plans represent high level organization of important content such as entities, events, and their interactions; they are learned from data and given as input to the generator. Extensive experiments on two data-to-text benchmarks (RotoWire and MLB) show that our approach outperforms competitive baselines in terms of automatic and human evaluation.',
  'y

In [87]:
import spacy
from collections import Counter
from string import punctuation
# Load the spaCy pipeline once at module level; get_hotwords() reuses it for
# every text and also reads its default stop-word list.
nlp = spacy.load("en_core_web_sm")


def get_hotwords(text, most_common=10):
    """Extract the most frequent keyword tokens from ``text``.

    The text is lower-cased and run through the module-level ``nlp``
    pipeline. A token is kept when it is not one of the pipeline's default
    stop words, is not found in ``string.punctuation``, and is tagged as a
    proper noun, adjective or noun.

    Args:
        text: Raw text to analyse.
        most_common: Maximum number of distinct keywords to return.

    Returns:
        A tuple ``(pairs, words)`` where ``pairs`` is a list of
        ``(word, count)`` tuples ordered by decreasing count and ``words``
        is the list of the words in ``pairs``, in the same order.
    """
    wanted_pos = ('PROPN', 'ADJ', 'NOUN')
    stop_words = nlp.Defaults.stop_words

    keywords = [
        token.text
        for token in nlp(text.lower())
        if token.text not in stop_words
        and token.text not in punctuation
        and token.pos_ in wanted_pos
    ]

    pairs = Counter(keywords).most_common(most_common)
    return pairs, [word for word, _ in pairs]


# Per-paper results; the two lists are index-aligned (entry i of each refers
# to the same paper).
hotwords_per_paper = []   # list of [(word, count), ...] for each valid paper
paper_data = []           # list of (paperId, title, year) for each valid paper
# Number of valid papers in which each word appears among the paper's hotwords
hotwords = Counter()

for paper in papers:
    title = paper.get('title')
    abstract = paper.get('abstract')
    # Papers without a title or an abstract cannot be analysed; skip them.
    if not (title and abstract):
        continue
    pairs, words = get_hotwords(title + ' ' + abstract)
    hotwords_per_paper.append(pairs)
    paper_data.append((paper['paperId'], title, paper.get('year')))
    hotwords.update(words)

# Keep only the hotwords shared by more than one paper.
hotwords_reduced = Counter({k: c for k, c in hotwords.items() if c > 1})
num_valid_papers = len(hotwords_per_paper)
num_hotwords = len(hotwords)
# Previously never defined in this notebook, which raised NameError on a
# fresh kernel.
num_hotwords_reduced = len(hotwords_reduced)
# NOTE(review): the original printed an undefined `num_papers`; it is assumed
# to be a stale name for `num_valid_papers` -- confirm against the recorded
# output.
print(num_hotwords, num_hotwords_reduced, num_valid_papers)

676 333 229


In [88]:
import pandas as pd
import plotly.express as px


# Turn the reduced hotword counter into a two-column table (word, count),
# ordered from the most to the least frequent word.
df = (
    pd.DataFrame.from_dict(hotwords_reduced, orient='index')
    .reset_index()
    .rename(columns={'index': 'word', 0: "count"})
    .sort_values(by=['count'], ascending=False)
)

# Bar-style overview of how often each shared hotword occurs.
fig = px.histogram(df, x='word', y='count')
fig.show()

In [89]:
# For every hotword, allocate one zero-initialised slot per valid paper.
hotwords_count_dict = {}
for hotword in hotwords:
    hotwords_count_dict[hotword] = [0] * num_valid_papers

# Fill in, paper by paper, how often each of its hotwords occurred.
for idx, paper_hotwords in enumerate(hotwords_per_paper):
    for hotword, count in paper_hotwords:
        if hotword in hotwords_count_dict:
            hotwords_count_dict[hotword][idx] += count

# Rows are valid papers (same order as hotwords_per_paper), columns are hotwords.
hotwords_count_df = pd.DataFrame(hotwords_count_dict)
fig = px.scatter(hotwords_count_df, x=hotwords_count_df.index, y=hotwords_count_df.columns)
fig.show()

In [91]:
# Inspect a single hotword: plot its count for every row of
# hotwords_count_df (one row per valid paper).
hotword = 'representations'
# Column of per-paper counts for the selected hotword
data = hotwords_count_df[hotword]
fig = px.scatter(hotwords_count_df, x=data.index, y=data)
fig.show()

In [77]:
# Show the (paperId, title, year) record stored at index 27 of paper_data;
# paper_data shares its ordering with hotwords_per_paper.
print(paper_data[27])

('416e3ffff2fe2d43343f6b721721a482829f882d', 'Data-to-text Generation with Entity Modeling', 2019)
