In [6]:
import pandas as pd

In [12]:
# Load the tab-separated paper export into a DataFrame.
PAPERS_TSV = "/Users/vignesh/Downloads/Papers.txt"
df = pd.read_csv(PAPERS_TSV, sep="\t")

In [14]:
# Display the column labels of the loaded table.
df.columns

Index(['Paper ID', 'Created', 'Last Modified', 'Paper Title', 'Abstract',
       'Primary Contact Author Name', 'Primary Contact Author Email',
       'Authors', 'Author Names', 'Author Emails', 'Primary Subject Area',
       'Secondary Subject Areas', 'Conflicts', 'Assigned', 'Completed',
       '% Completed', 'Bids', 'Discussion', 'Status',
       'Requested For Author Feedback', 'Author Feedback Submitted?',
       'Requested For Camera Ready', 'Camera Ready Submitted?',
       'Requested For Presentation', 'Files', 'Number of Files',
       'Supplementary Files', 'Number of Supplementary Files', 'Reviewers',
       'Reviewer Emails', 'MetaReviewers', 'MetaReviewer Emails',
       'SeniorMetaReviewers', 'SeniorMetaReviewerEmails', 'Q1 (Open-sourcing)',
       'Q2 (Interest on PINDER and PLINDER challenges)'],
      dtype='object')

In [25]:
# Keep only the rows whose 'Status' is one of the accepted values.
accepted_statuses = ['Accept', 'Accept - Oral']
is_accepted = df['Status'].isin(accepted_statuses)
df_accept = df[is_accepted]

In [26]:
# Display (rows, columns) of the accepted-papers frame.
df_accept.shape

(81, 36)

In [27]:
# Narrow the accepted papers down to the three fields used by the cells below,
# then display the resulting (rows, columns).
columns_of_interest = ['Paper ID', 'Paper Title', 'Authors']
df_accept = df_accept.loc[:, columns_of_interest]
df_accept.shape

(81, 3)

In [28]:
# Preview the first rows of the accepted papers.
df_accept.head()

Unnamed: 0,Paper ID,Paper Title,Authors
0,1,LatentDE: Latent-based Directed Evolution acce...,Thanh Tran (FPT Software AI Center); Nhat Khan...
1,2,Assessing interaction recovery of predicted pr...,Frederic Dreyer (Genentech)*; David Errington ...
2,3,Improving Inverse Folding models at Protein St...,Oliver Dutton (Peptone Ltd); Sandro Bottaro (P...
3,4,Improving Antibody Design with Force-Guided Sa...,Paulina Kulyte (University of Cambridge)*; Fra...
4,5,Equivariant Blurring Diffusion for Multiscale ...,Jiwoong Park (Texas A&M University)*; Yang She...


In [31]:
def construct_pdf_title(paper_title):
    """Return the PDF filename derived from a paper title.

    Every space character in ``paper_title`` is replaced by an underscore and
    the ``.pdf`` extension is appended; all other characters are kept as-is.
    """
    return paper_title.replace(" ", "_") + ".pdf"

# Spot-check the filename produced for the second accepted paper.
construct_pdf_title(df_accept.iloc[1]["Paper Title"])

'Assessing_interaction_recovery_of_predicted_protein-ligand_poses.pdf'

In [76]:
import re

def gather_author_names(author_str):
    """Extract bare author names from a ``;``-separated author string.

    Each entry is expected to look like ``Name (Affiliation)*``. The
    parenthesised affiliation and any ``*`` characters are removed, and the
    remaining text is stripped of surrounding whitespace.

    Args:
        author_str: Author entries joined with ``;``.

    Returns:
        list[str]: Author names in their original order. Entries that are
        empty after cleaning (e.g. produced by a trailing ``;``) are dropped.
    """
    names = []
    for entry in author_str.split(";"):
        # Remove innermost parenthesised groups repeatedly so an affiliation
        # that itself contains parentheses, e.g. "(Univ (Dept))", disappears
        # completely instead of leaving a dangling ")".
        previous = None
        while previous != entry:
            previous = entry
            entry = re.sub(r'\([^()]*\)', '', entry)
        name = entry.replace("*", "").strip()
        if name:
            names.append(name)
    return names

# Spot-check the author names extracted for the second accepted paper.
gather_author_names(df_accept.iloc[1]["Authors"])

['Frederic Dreyer',
 'David Errington',
 'Cedric Bouysset',
 'Constantin Schneider']

In [58]:
import os
import shutil

In [42]:
# Create the output directory; exist_ok=True keeps the cell re-runnable
# instead of raising FileExistsError once the directory is already there.
os.makedirs("papers_2024", exist_ok=True)

In [53]:
# Copy each accepted paper's camera-ready PDF into papers_2024/ under a
# title-derived filename, collecting the IDs of the papers that had one.
camera_ready_root = "/Users/vignesh/Downloads/CameraReadys"
output_dir = "papers_2024"
# shutil.copyfile needs the destination directory to exist already.
os.makedirs(output_dir, exist_ok=True)

included_ids = []

for paper_id, paper_title in zip(df_accept["Paper ID"], df_accept["Paper Title"]):
    camera_ready_dir = os.path.join(camera_ready_root, str(paper_id), "CameraReady")
    if not os.path.isdir(camera_ready_dir):
        # Either the paper has no folder at all or it lacks the CameraReady
        # subfolder; skip it rather than letting os.listdir raise.
        continue

    # os.listdir returns entries in arbitrary order and includes hidden files
    # such as .DS_Store, so pick PDFs explicitly and sort for a stable choice.
    pdf_files = sorted(
        name for name in os.listdir(camera_ready_dir)
        if name.lower().endswith(".pdf") and not name.startswith(".")
    )
    if not pdf_files:
        continue

    shutil.copyfile(
        os.path.join(camera_ready_dir, pdf_files[0]),
        os.path.join(output_dir, construct_pdf_title(paper_title)),
    )
    included_ids.append(paper_id)

In [77]:
def construct_html_syntax(series):
    """Build the ``<li>`` HTML snippet for one paper.

    Args:
        series: Row-like mapping providing ``"Paper Title"`` and ``"Authors"``
            entries (for example one row of ``df_accept``).

    Returns:
        str: A newline-terminated ``<li>`` element holding the title, the
        comma-separated author names and a ``[paper]`` link that points to the
        paper's PDF under ``/papers_2024/``.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from html import escape
    from urllib.parse import quote

    paper_title = series["Paper Title"]
    pdf_title = construct_pdf_title(paper_title)
    authors = gather_author_names(series["Authors"])

    # Escape text content so characters such as "&" or "<" in a title or a
    # name cannot corrupt the surrounding markup.
    title_html = escape(paper_title, quote=False)
    author_html = escape(", ".join(authors), quote=False)
    # Percent-encode the filename so characters like "?", "#", '"' or
    # non-ASCII letters do not break the link; common path-safe punctuation
    # is left untouched.
    pdf_href = "/papers_2024/" + quote(pdf_title, safe=":,()")

    return (
        "<li>\n"
        f'<p class="title">{title_html}</p>\n'
        f'<p class="authors">{author_html}</p>\n'
        f'<p class="preprints"><a href="{pdf_href}">[paper]</a></p>\n'
        "</li>\n"
    )

In [78]:
# Write one <li> entry per accepted paper. The file is opened in "w" mode so
# re-running this cell regenerates it instead of appending duplicate entries,
# and as UTF-8 so non-ASCII author names are written consistently.
# NOTE(review): entries are written for every row of df_accept, including any
# paper whose PDF was not copied into papers_2024/ — confirm that is intended.
with open("papers_2024.txt", "w", encoding="utf-8") as f:
    for _, paper in df_accept.iterrows():
        f.write(construct_html_syntax(paper))

In [75]:
# Scratch check: the escaped pattern r'\*' matches a literal asterisk.
re.search(r'\*', "*123")

<re.Match object; span=(0, 1), match='*'>