# Hugging Face AK and research community papers

Workflow for collecting paper and author information from the Hugging Face daily papers (curated by AK and the research community) and from arXiv.

## Initial setup

In [1]:
import sys
import os

# Resolve the directory one level above the notebook's working directory.
# Presumably this is where the `scripts` package imported below lives.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Append only when missing, so re-running this cell does not add duplicate
# entries to the import search path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from scripts.hf_scraper_script import HFPapersScraper
from scripts.arxiv_utils import *

## Getting a list of authors from the papers

### 1. Get the list of authors for the papers listed on Hugging Face: [AK and community daily papers](https://huggingface.co/papers)

In [2]:
# Initialize the Hugging Face scraper class.
hf_papers_scraper = HFPapersScraper()

# Get the arXiv ids of the papers from the scraped links.
arxiv_ids = hf_papers_scraper.get_paper_ids()

# For each id, fetch the arXiv API response (XML), parse it, and print the
# authors of the first parsed entry.
for arxiv_id in arxiv_ids:
    xml_content = get_content_by_id(arxiv_id)
    papers = extract_paper_info(xml_content)

    # Guard against a response with no parsed entries: indexing [0] on an
    # empty result would raise IndexError and abort the loop for every
    # remaining id.
    # NOTE(review): assumes extract_paper_info returns an empty list when the
    # feed has no entries - confirm against scripts/arxiv_utils.
    if not papers:
        print(arxiv_id, "no paper metadata returned")
        continue

    print(arxiv_id, papers[0]['authors'])

2411.04923 ['Shehan Munasinghe', 'Hanan Gani', 'Wenqi Zhu', 'Jiale Cao', 'Eric Xing', 'Fahad Shahbaz Khan', 'Salman Khan']
2411.05000 ['Jonathan Roberts', 'Kai Han', 'Samuel Albanie']
2411.04075 ['Chuhan Li', 'Ziyao Shangguan', 'Yilun Zhao', 'Deyuan Li', 'Yixin Liu', 'Arman Cohan']
2411.04905 ['Siming Huang', 'Tianhao Cheng', 'Jason Klein Liu', 'Jiaran Hao', 'Liuyihan Song', 'Yang Xu', 'J. Yang', 'J. H. Liu', 'Chenchen Zhang', 'Linzheng Chai', 'Ruifeng Yuan', 'Zhaoxiang Zhang', 'Jie Fu', 'Qian Liu', 'Ge Zhang', 'Zili Wang', 'Yuan Qi', 'Yinghui Xu', 'Wei Chu']
2411.04928 ['Wenqiang Sun', 'Shuo Chen', 'Fangfu Liu', 'Zilong Chen', 'Yueqi Duan', 'Jun Zhang', 'Yikai Wang']
2411.04989 ['Koichi Namekata', 'Sherwin Bahmani', 'Ziyi Wu', 'Yash Kant', 'Igor Gilitschenski', 'David B. Lindell']
2411.04709 ['Wenhao Wang', 'Yi Yang']
2411.05003 ['David Junhao Zhang', 'Roni Paiss', 'Shiran Zada', 'Nikhil Karnad', 'David E. Jacobs', 'Yael Pritch', 'Inbar Mosseri', 'Mike Zheng Shou', 'Neal Wadhwa', 'Nat

### 2. Get the list of paper authors by topic from arXiv

In [16]:
# Each entry is either a single topic string or a list of topic strings
# queried together.
topic_list = ["negative sampling", "llm", ["reinforcement learning", "transformers"]]

for topic in topic_list:
    print(f"Topic: {topic}")

    # Query arXiv for the topic(s), then parse the response into paper records.
    topic_feed = get_content_by_topic(topic, filter_type='any')
    topic_papers = extract_paper_info(topic_feed)

    # Show one (title, authors) pair per paper returned.
    titles_and_authors = [(paper['title'], paper['authors']) for paper in topic_papers]
    print(titles_and_authors)

Topic: negative sampling
[('Entity Aware Negative Sampling with Auxiliary Loss of False Negative\n  Prediction for Knowledge Graph Embedding', ['Sang-Hyun Je']), ('Does Negative Sampling Matter? A Review with Insights into its Theory\n  and Applications', ['Zhen Yang', 'Ming Ding', 'Tinglin Huang', 'Yukuo Cen', 'Junshuai Song', 'Bin Xu', 'Yuxiao Dong', 'Jie Tang']), ('Bayesian Negative Sampling for Recommendation', ['Bin Liu', 'Bang Wang']), ('Synthetic Hard Negative Samples for Contrastive Learning', ['Hengkui Dong', 'Xianzhong Long', 'Yun Li', 'Lei Chen']), ('UFNRec: Utilizing False Negative Samples for Sequential Recommendation', ['Xiaoyang Liu', 'Chong Liu', 'Pinzheng Wang', 'Rongqin Zheng', 'Lixin Zhang', 'Leyu Lin', 'Zhijun Chen', 'Liangliang Fu']), ('Rethinking Samples Selection for Contrastive Learning: Mining of\n  Potential Samples', ['Hengkui Dong', 'Xianzhong Long', 'Yun Li']), ('MixKG: Mixing for harder negative samples in knowledge graph', ['Feihu Che', 'Guohua Yang', 'Pe

### 3. Get the list of papers by author name

In [25]:
# Query arXiv by author name (print_url=True also prints the request URL),
# then parse the response into a list of paper records.
author_feed = get_content_by_author('hinton geoff', print_url=True)
paper_list = extract_paper_info(author_feed)

print(len(paper_list))

# TODO: the API's author filter is not accurate - the results include papers
# by other authors named Hinton - so post-filtering still needs to be added.
paper_list

http://export.arxiv.org/api/query?search_query=au%3Ahinton+geoff
10


[{'title': 'Ground based gamma-ray astronomy with Cherenkov Telescopes',
  'authors': ['Jim Hinton'],
  'summary': 'Very-high-energy (>100 GeV) gamma-ray astronomy is emerging as an important\ndiscipline in both high energy astrophysics and astro-particle physics. This\nfield is currently dominated by Imaging Atmospheric-Cherenkov Telescopes\n(IACTs) and arrays of these telescopes. Such arrays have achieved the best\nangular resolution and energy flux sensitivity in the gamma-ray domain and are\nstill far from the fundamental limits of the technique. Here I will summarise\nsome key aspects of this technique and go on to review the current status of\nthe major instruments and to highlight selected recent results.'},
 {'title': 'Extraction of Cosmological Information from WiggleZ',
  'authors': ['Samuel Hinton'],
  'summary': 'In this thesis, I analyse the 2D anisotropic Baryon Acoustic Oscillation\n(BAO) signal present in the final WiggleZ dataset. I utilise newly released\ncovariance m