In [16]:
# Get emails from Gmail, parse the papers, and save them to an Excel file
%load_ext autoreload
%autoreload 2
from connect_to_service import *
from parse_gmail_message import *

import pandas as pd
import base64
import os.path as ospath

# Override the default parameters
DATA_FOLDER = "./data/"
PAPERS_LABEL = 'Subscribe/Gscholar'
SHEET_ID = '1Z5Riim21O7Ti5hHlWzriBhn2CbJibqee4psHkonCcBw'

# Create the data folder if it does not exist.
# `os.path` has no `makedirs`; directory creation lives in `os`, so the
# previous `ospath.makedirs(...)` raised AttributeError on a fresh checkout.
# `exist_ok=True` makes the call safe to re-run.
import os
os.makedirs(DATA_FOLDER, exist_ok=True)

# Build the Gmail and Sheets service objects from one set of credentials.
creds = get_creds(DATA_FOLDER)
service = get_gmail_service(creds)
sheet_service = get_sheets_service(creds)

# Look up the ids for the paper label plus 'UNREAD', then list the messages
# for those labels.
label_names = [PAPERS_LABEL, 'UNREAD']
labels = GetLabelsId(service, 'me', label_names)
messages = ListMessagesWithLabels(service, "me", labels)
message_count = len(messages)
print (f'Found {message_count} messages')

# Parse every listed mail and collect the papers it mentions.
pa = PaperAggregator()

for msg in messages:
  msg_content = GetMessage(service, "me", msg['id'])
  try:
    # A message without payload/body/data raises KeyError and is skipped.
    encoded_body = msg_content['payload']['body']['data'].encode('utf-8')
    msg_str = base64.urlsafe_b64decode(encoded_body)
  except KeyError:
    continue

  # Use the value of the last 'Subject' header, or '' when there is none.
  subjects = [
    header['value']
    for header in msg_content['payload']['headers']
    if header['name'] == 'Subject'
  ]
  msg_title = subjects[-1] if subjects else ''

  parser = PapersHTMLParser(msg_title)
  # NOTE(review): str() on bytes yields the "b'...'" repr (escaped bytes), not
  # decoded text. Left unchanged because PapersHTMLParser may rely on that
  # form - confirm before switching to .decode().
  parser.feed(str(msg_str))

  for paper in parser.papers:
    pa.add(paper)

# Number of papers parsed from the mails, before dropping known ones.
total_papers = len(pa.paper_list)

# Remove previously seen papers: everything already in the archive is taken
# out of the freshly parsed aggregator.
archive_path = ospath.join(DATA_FOLDER, 'archive.xlsx')
old_pa = PaperAggregator()
old_pa.load_excel(archive_path)
for paper in old_pa.paper_list:
  pa.remove(paper)

# Whatever is left in `pa` was not in the archive.
total_new_papers = len(pa.paper_list)
print (f'Found {total_papers} papers (New: {total_new_papers}, Duplicated: {total_papers-total_new_papers})')

# Merge `pa` into the archive aggregator and write the result back to the same file.
old_pa.merge(pa)
df = old_pa.to_dataframe()
df.to_excel(archive_path, index=False)
print (f'Unread: {df.query("status == 0").shape[0]}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Found 0 messages
Found 0 papers (New: 0, Duplicated: 0)
Unread: 29


In [31]:
# Query TLDR from semantic scholar
import urllib
import requests
from pathlib import Path
import time
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

def get_datetime():
  """Return the current UTC time as an ISO-8601 formatted string."""
  now_utc = datetime.now(timezone.utc)
  return now_utc.isoformat()

# The API key is read from a local text file; surrounding whitespace is stripped.
_api_key_file = Path('./semantic_scholar_api_key.txt')
SEMANTIC_SCHOLAR_API_KEY = _api_key_file.read_text().strip()

def _extract_tldr_text(payload):
  """Return the TLDR text of the first hit in a paper-search response, or None."""
  if not isinstance(payload, dict):
    return None
  # A missing 'total' is treated as unknown rather than compared with 0
  # (None > 0 raises TypeError); the 'data' list decides in that case.
  total = payload.get('total')
  if total is not None and total <= 0:
    return None
  hits = payload.get('data') or []
  if not hits:
    return None
  tldr = (hits[0] or {}).get('tldr') or {}
  return tldr.get('text') or None


def query_tldr_all_papers(df, min_interval=1.0, timeout=30.0):
  """Fill in missing TLDRs for unread papers via the Semantic Scholar search API.

  Every row of `df` whose `tldr` is null and whose `status` equals 0 is
  looked up by title. When the first search hit carries a TLDR text it is
  written to that row's `tldr` column. `updated_at` is refreshed for every
  row that was queried, whether or not a TLDR was found. `df` is modified
  in place.

  Args:
    df: DataFrame with `title`, `tldr` and `status` columns.
    min_interval: Minimum number of seconds between the start of two
      consecutive requests (rate limiting). Defaults to 1 second.
    timeout: Per-request timeout in seconds passed to `requests.get`.

  Returns:
    None.
  """
  # Imported locally so the function does not rely on `re` / `urllib.parse`
  # having been loaded as a side effect of some other import.
  import re
  import urllib.parse

  query_df = df[df['tldr'].isnull() & (df['status'] == 0)]
  print(f"Querying TLDR for {query_df.shape[0]} papers")

  pending = list(query_df['title'].items())
  for row_label, title in tqdm(pending):
    if not isinstance(title, str):
      # Missing titles (e.g. NaN) cannot be searched; skip without a request.
      continue

    # Keep only letters, digits and spaces so the title is a plain search query.
    clean_title = re.sub('[^A-Za-z0-9 ]+', ' ', title.strip())
    url = (
      "https://api.semanticscholar.org/graph/v1/paper/search"
      f"?query={urllib.parse.quote_plus(clean_title)}&fields=tldr,abstract&limit=1"
    )

    tstart = time.time()
    tldr_text = None
    try:
      response = requests.get(
        url,
        headers={'x-api-key': SEMANTIC_SCHOLAR_API_KEY},
        timeout=timeout,
      )
      if response.status_code == 200:
        tldr_text = _extract_tldr_text(response.json())
    except (requests.RequestException, ValueError) as err:
      # One failed lookup must not abort the remaining papers.
      print(f"TLDR query failed for {title!r}: {err}")
    tend = time.time()

    if tldr_text:
      df.loc[row_label, 'tldr'] = tldr_text
    # Update the time
    df.loc[row_label, 'updated_at'] = get_datetime()

    # Sleep for the rest of the interval to avoid the rate limit
    time.sleep(max(0, min_interval - (tend - tstart)))

# Report how many papers are still unread (status == 0).
unread_count = df.query("status == 0").shape[0]
print (f'Unread: {unread_count}')

# Fetch missing TLDRs (updates `df` in place) and save the archive again.
query_tldr_all_papers(df)
archive_file = ospath.join(DATA_FOLDER, 'archive.xlsx')
df.to_excel(archive_file, index=False)

# Count the rows that still have no TLDR after the query.
missing_tldr = df[df['tldr'].isnull()]
print(f"Unknown TLDR: {missing_tldr.shape[0]}")

Unread: 29
Querying TLDR for 0 papers


0it [00:00, ?it/s]


Unknown TLDR: 0


In [29]:
# Update google sheet: rows under 'write_data' go to UpdateSheet and rows
# under 'append_data' go to AppendSheet; empty lists are skipped.
prepare_data = PrepareDataforUpdateSheet(sheet_service, SHEET_ID, df)

write_data = prepare_data['write_data']
print(f"Number of Updated records: {len(write_data)}")
append_data = prepare_data['append_data']
print(f"Number of New records: {len(append_data)}")

if len(write_data) > 0:
  UpdateSheet(sheet_service, SHEET_ID, write_data)
if len(append_data) > 0:
  AppendSheet(sheet_service, SHEET_ID, append_data)

Number of Updated records: 0
Number of New records: 0


In [3]:
# Mark all as read.
# Besides "UNREAD", the "INBOX" label is removed from every listed message too.
label_changes = {"addLabelIds": [], "removeLabelIds": ["UNREAD","INBOX"]}
for msg in messages:
  modify_request = service.users().messages().modify(
    userId="me", id=msg['id'], body=label_changes
  )
  modify_request.execute()

In [None]:
# Print the papers
from IPython.display import display, HTML, Markdown

def print_html(html):
  """Show the given HTML string as rich output in the notebook."""
  rendered = HTML(html)
  display(rendered)

def print_markdown(md):
  """Show the given Markdown string as rich output in the notebook."""
  rendered = Markdown(md)
  display(rendered)

unread_df = df.query("status == 0")
item_per_page = 10
total_new_papers = unread_df.shape[0]
# Ceiling division. The previous `// item_per_page + 1` reported one page too
# many whenever the count was an exact multiple of the page size (20 papers
# gave 3 pages). At least one page is always reported.
total_page = max(1, -(-total_new_papers // item_per_page))

# Paging is driven by re-running this cell: the first run shows page 0 and
# every further run advances by one page. `del page` restarts from the top.
# The index is deliberately not clamped: the cell that saves the read status
# marks only the pages *before* the current one, so stepping past the last
# page is how the final page gets marked as read.
if 'page' in globals():
  page = page + 1 # type: ignore
else:
  page = 0

page_start = page * item_per_page
page_rows = unread_df[page_start:page_start + item_per_page]

# NOTE(review): title/authors/tldr/data/link originate from e-mail content and
# are interpolated into HTML unescaped. Consider html.escape() once it is
# confirmed that none of these fields is expected to carry markup.
print_html(f'<h3 style="color:yellow">Page {page+1}/{total_page}<br><small>Item / page: {item_per_page}</small></h3>')
for counter, (row_label, row) in enumerate(page_rows.iterrows(), start=1):
  print_html(f'<h3><input type="checkbox" style="float:left"/> {page_start+counter} / {total_new_papers}</h3>')
  print_html(f'<div><b style="color:lightblue">{row.title}</b></div>')
  # Prefer the TLDR when one is stored; otherwise show the row's `data` field.
  if isinstance(row.tldr, str) and len(row.tldr) > 0:
    print_html(f'<div>[TLDR]: {row.tldr}</div>')
  else:
    print_html(f'<div>{row.data}</div>')
  print_html(f'<div>Authors: {row.authors}<br><cite>Email title: {row.email_title}</cite><br><a href="{row.link}">Link</a></div>')

In [None]:
# Mark the papers on the pages before the current one as read (status = 1)
# and write the archive back. The page currently displayed is not included.
if 'page' not in globals():
  print('No papers read')
else:
  read_count = page * item_per_page
  read_df = unread_df[:read_count]
  print(f'Saved {len(read_df)} read papers to excel')
  df.loc[read_df.index, 'status'] = 1
  archive_file = ospath.join(DATA_FOLDER, 'archive.xlsx')
  df.to_excel(archive_file, index=False)