In [None]:
# Get emails from Gmail and parse the papers
%load_ext autoreload
%autoreload 2
from connect_to_gmail import *
from parse_gmail_message import *
import pandas as pd

# Override the default parameters
DATA_FOLDER = r'./data'
PAPERS_LABEL = 'Subscribe/Gscholar'

# Create the data folder if it does not exist yet.
# exist_ok=True keeps the cell safe to re-run and removes the separate
# exists() check (assumes `makedirs` is os.makedirs brought in by the star imports).
makedirs(DATA_FOLDER, exist_ok=True)

# Call the Gmail API
service = get_service(DATA_FOLDER)

# Resolve the label ids for the papers label + UNREAD, then list the
# messages carrying those labels.
labels = GetLabelsId(service,'me',[PAPERS_LABEL,'UNREAD'])
messages = ListMessagesWithLabels(service,"me",labels)
print (f'Found {len(messages)} messages')

# Parse the mails: decode each message body, feed it to the HTML parser and
# collect every paper the parser found into one aggregator.
pa = PaperAggregator()
# Messages that had no payload.body.data and were therefore not parsed.
skipped_messages = 0

for msg in messages:
  msg_content = GetMessage(service, "me", msg['id'])
  try:
    msg_str = base64.urlsafe_b64decode(msg_content['payload']['body']['data'].encode('utf-8'))
  except KeyError:
    # No body data at payload.body.data: nothing to parse for this message.
    # Count it so the omission is visible instead of silent.
    skipped_messages += 1
    continue

  # Use the Subject header as the email title (the last one wins if repeated).
  msg_title = ''
  for h in msg_content['payload']['headers']:
    if h['name'] == 'Subject':
      msg_title = h['value']
  parser = PapersHTMLParser(msg_title)
  # NOTE(review): msg_str is bytes, so str(msg_str) feeds the "b'...'" repr
  # (escaped newlines / non-ASCII bytes) to the parser rather than decoded text.
  # Kept as-is because PapersHTMLParser may rely on this form - confirm before
  # switching to msg_str.decode('utf-8').
  parser.feed(str(msg_str))

  for paper in parser.papers:
    pa.add(paper)

if skipped_messages:
  print (f'Skipped {skipped_messages} messages with no payload body data')

total_papers = len(pa.paper_list)

# Remove previously seen papers
archive_path = ospath.join(DATA_FOLDER, 'archive.xlsx')
old_pa = PaperAggregator()
# A fresh data folder has no archive yet; in that case start from the empty
# aggregator instead of asking it to load a file that does not exist.
if ospath.exists(archive_path):
  old_pa.load_excel(archive_path)
for paper in old_pa.paper_list:
  pa.remove(paper)

# Whatever is still in `pa` was not in the archive, i.e. it is new
total_new_papers = len(pa.paper_list)
print (f'Found {total_papers} papers (New: {total_new_papers}, Duplicated: {total_papers-total_new_papers})')

# Merge the new papers into the archive and write it back to disk
old_pa.merge(pa)
df = old_pa.to_dataframe()
df.to_excel(archive_path, index=False)
print (f'Unread: {df.query("status == 0").shape[0]}')

In [None]:
# Mark all fetched messages as read.
# The same request also removes the INBOX label from each message.
body = {"addLabelIds": [], "removeLabelIds": ["UNREAD","INBOX"]}
for message_id in (msg['id'] for msg in messages):
  request = service.users().messages().modify(userId="me", id=message_id, body=body)
  request.execute()

In [None]:
# Query TLDR from semantic scholar
import urllib
import requests
from pathlib import Path
import time
from tqdm import tqdm

# API key for Semantic Scholar, read from a text file next to the notebook;
# surrounding whitespace (e.g. a trailing newline) is stripped.
with Path('./semantic_scholar_api_key.txt').open() as key_file:
  SEMANTIC_SCHOLAR_API_KEY = key_file.read().strip()

# Parallel query
# Since the API is rate limited (1 request per second), we can't use this code
# async def fetch(session, url):
#   async with session.get(url) as response:
#     return await response.json()


# async def query_tldr_all_papers(papers):
#   titles = [
#     re.sub('[^A-Za-z0-9 ]+', ' ', paper.title.strip())
#     for paper in papers
#   ]
#   urls = [
#     f"https://api.semanticscholar.org/graph/v1/paper/search?query={urllib.parse.quote_plus(title)}&fields=tldr&limit=1"
#     for title in titles
#   ]

#   async with aiohttp.ClientSession() as session:
#     tasks = [asyncio.ensure_future(fetch(session, url)) for url in urls]
#     responses = await asyncio.gather(*tasks)
#     for i,response in enumerate(responses):
#       print(response)
#       if response.get('total') > 0:
#         data = response.get('data')
#         if data and len(data) > 0:
#           result = data[0]
#           tldr = result.get('tldr')
#           if tldr:
#             papers[i].tldr = tldr.get('text')

def query_tldr_all_papers(df, max_queries=None):
  """Fill missing ``tldr`` values of ``df`` in place via the Semantic Scholar search API.

  For every row whose ``tldr`` is null, the row's ``title`` is reduced to
  letters, digits and spaces, sent to the paper search endpoint
  (``limit=1``), and the TLDR text of the first hit - if there is one - is
  written back to ``df`` at that row's index label.

  Requests are issued sequentially and spaced at least one second apart
  (start to start) to respect the API rate limit.

  Args:
    df: DataFrame with ``title`` and ``tldr`` columns; modified in place.
    max_queries: Optional cap on the number of requests for this call.
      ``None`` (default) queries every row that is missing a TLDR.

  Returns:
    None.
  """
  # Local stdlib imports: the notebook only gets these names indirectly
  # (star imports / `import urllib`), which is not guaranteed to expose them.
  import re
  from urllib.parse import quote_plus

  query_df = df[df['tldr'].isnull()]
  print(f"Querying TLDR for {query_df.shape[0]} papers")
  if query_df.empty:
    # Nothing to look up - avoid touching the network at all.
    return

  titles = [
    re.sub('[^A-Za-z0-9 ]+', ' ', title.strip())
    for title in query_df.title
  ]
  urls = [
    f"https://api.semanticscholar.org/graph/v1/paper/search?query={quote_plus(title)}&fields=tldr,abstract&limit=1"
    for title in titles
  ]

  index_loc = query_df.index.to_list()
  assert len(index_loc) == len(urls), 'Index length mismatch'

  # Never iterate past the number of pending papers (the previous fixed count
  # of 10 raised IndexError for shorter lists and ignored longer ones).
  n_queries = len(urls) if max_queries is None else min(max_queries, len(urls))
  headers = {'x-api-key': SEMANTIC_SCHOLAR_API_KEY}

  for i in tqdm(range(n_queries)):
    tstart = time.time()
    response = requests.get(urls[i], headers=headers)
    tend = time.time()
    if response.status_code == 200:
      payload = response.json()
      data = payload.get('data')
      # `total` may be absent; treat that the same as zero results.
      if (payload.get('total') or 0) > 0 and data:
        tldr = data[0].get('tldr')
        if tldr and tldr.get('text'):
          df.loc[index_loc[i], 'tldr'] = tldr.get('text')

    # Sleep for the remainder of one second to avoid the rate limit
    time.sleep(max(0, 1 - (tend - tstart)))

# Look up the missing TLDRs, then write the updated table back to the archive
archive_path = ospath.join(DATA_FOLDER, 'archive.xlsx')
query_tldr_all_papers(df)
df.to_excel(archive_path, index=False)

# Report how many papers still have no TLDR after the lookup
print(f"Unknown TLDR: {df[df['tldr'].isnull()].shape[0]}")

In [None]:
# Print the papers
from IPython.display import display, HTML, Markdown

def print_html(html):
  """Wrap ``html`` in an IPython ``HTML`` object and display it."""
  rendered = HTML(html)
  display(rendered)

def print_markdown(md):
  """Wrap ``md`` in an IPython ``Markdown`` object and display it."""
  rendered = Markdown(md)
  display(rendered)

from html import escape as html_escape


def _cell_text(value):
  """Return a table cell as HTML-safe text; missing values (None/NaN) become ''."""
  if value is None or (isinstance(value, float) and value != value):
    return ''
  return html_escape(str(value))


unread_df = df.query("status == 0")
item_per_page = 10
total_new_papers = unread_df.shape[0]
# Ceiling division: an exact multiple of item_per_page must not add an extra
# empty page; always report at least one page.
total_page = max(1, -(-total_new_papers // item_per_page))

# Change page index here
# start from zero
page = 8

# Rows shown on the selected page (positional slice of the unread papers)
first_item = page*item_per_page
page_df = unread_df.iloc[first_item:first_item+item_per_page]

print_html(f'<h3 style="color:yellow">Page {page+1}/{total_page}<br><small>Item / page: {item_per_page}</small></h3>')
for counter,(i,row) in enumerate(page_df.iterrows(), start=1):
  # Titles, TLDRs, authors and links originate from emails / a web API, so
  # they are escaped before being embedded in the HTML output.
  print_html(f'<h3><input type="checkbox" style="float:left"/> {first_item+counter} / {total_new_papers}</h3>')
  print_html(f'<div><b style="color:lightblue">{_cell_text(row["title"])}</b></div>')
  print_html(f'<div>{_cell_text(row["tldr"])}</div>')
  print_html(f'<div>Authors: {_cell_text(row["authors"])}<br><cite>Email title: {_cell_text(row["email_title"])}</cite><br><a href="{_cell_text(row["link"])}">Link</a></div>')

In [158]:
# Flag every paper that is currently in unread_df as read (status 1),
# then persist the whole table to the archive workbook.
archive_path = ospath.join(DATA_FOLDER, 'archive.xlsx')
df.loc[unread_df.index, 'status'] = 1
df.to_excel(archive_path, index=False)