In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
def flatten(l):
    return [item for sublist in l for item in sublist]
from tabula.io import read_pdf

## Find subset of authors from conferences

### 2022

In [4]:
# conda install -c conda-forge camelot-py
# import camelot

def parse_table(table):
    
    table = table.df
    table.columns = ["title","Authors","session"]
    table = table[table.Authors!="Authors"]
        
    authors_in_table = []
    for set_of_authors in table["Authors"]:
        
        set_of_authors = set_of_authors.replace("\n"," ").replace(", ",",").replace(" and ",",").split(",")
        authors_in_table.extend(set_of_authors)
    return authors_in_table

file = "2022.pdf"

pages = ",".join([str(i) for i in range(27,47)])
tables = read_pdf(file,pages=pages)
authors_2022 = set(flatten([parse_table(i) for i in tables]))

JavaNotFoundError: `java` command is not found from this Python process.Please ensure Java is installed and PATH is set for `java`

### 2021

In [3]:
#LINK: https://easychair.org/smart-program/IC2S2-2021/talk_author_index.html
##HERE I'd RATHER HAVE THEM USE BEAUTIFUL SOUP
table_2021 = pd.read_html("2021.html")[1].dropna(subset=[0])[0]
table_2021 = table_2021.apply(lambda x: " ".join(x.split(", ")[::-1]))
authors_2021 = set([i for i in table_2021 if len(i)>1])

### 2020

In [4]:
#LINK: https://ic2s2.mit.edu/program
#LINK2 : https://docs.google.com/spreadsheets/u/0/d/e/2PACX-1vTX9_1Xftn7D-nSI8X9b7tafr_Z0kAbphKdfZ8qUSU9p-syXNsGPdhHl5ZyTnKKL-T6dCEJqtsrn3wy/pubhtml/sheet?headers=false&gid=181378784
##HERE I'd RATHER HAVE THEM USE BEAUTIFUL SOUP

table_2020 = pd.read_html("2020.html", header =1)[0]
authors_2020 = set(flatten(table_2020["Presenters"].dropna().apply(lambda x:x.split(", "))))


### 2019

In [7]:
#LINK ORAL: https://2019.ic2s2.org/oral-presentations/
#LINK POSTERS: https://2019.ic2s2.org/posters/

poster_2019 = "2019_posters.html"
oral_2019 = "2019_oral.html"

def parse_poster_page(link):
    
    all_page = BeautifulSoup(open(link,encoding="utf-8").read())

    #Find the items in the bullet points
    body = all_page.find("body")
    content = body.find_all("div",{"class":"col-md-8 page-content-wrap"})[0]
    bullet_items = content.find_all("li")

    ##REMOVE THE BOLD ITEMS from each bullet points (those are the paper titles)
    for item in bullet_items:    
        strong_parts = item.find_all("strong")
        for part in strong_parts:
            part.extract() #This will remove it from the soup
           
    all_authors = []
    for list_of_authors in bullet_items:
        list_of_authors = list_of_authors.text.replace("\xa0",'').replace("\n",'').replace(" and ", ", ").replace(", ",",") #clean string
        all_authors.extend(list_of_authors.split(","))
    return set(all_authors)

def parse_oral_page(link):
    
    def find_authors(string):
        """
        Given the authors + title string returns list of authors
        
        """
        
        #when the title contains full stops is a problem so replace those cases
        string = string.replace("U.S.","US").replace("vs.","vs").replace("APIs.","APIs")
        
        authors = ". ".join(string.split(". ")[:-1]) #remove the title of the presentation
        
        #FIX THE STRING
        authors = authors.replace(", ",",")
        authors = authors.replace(" and ",",")
        authors = authors.replace(" –","")
        authors = authors.replace("No presentation (cancelled). ","")
        authors = authors.replace("No presentation (cancelled)","")
        authors = authors.replace("No presentation: ","")
        authors = authors.replace("(Moved to 3D Text Analysis) ","")

        return [i for i in authors.split(",") if i!= ""]
    
    all_page = BeautifulSoup(open(link,encoding="utf-8").read())
    #Find the paragraphs that contain the oral presentations
    body = all_page.find("body")
    content = body.find_all("div",{"class":"col-md-8 page-content-wrap"})[0] 
    paragraphs = content.find_all("p")[3:] #The first three paragraphs do not contain the oral presentations

    all_authors = []
    for paragraph in paragraphs: #PARSE EACH PARAGRAPH
        
        ##CHAIR
        chair = paragraph.contents[3].text.replace("Chair: ",'') #THIS IS THE CHAIR OF THE SESSION
        all_authors.append(chair) #ADD THE CHAIR TO THE LIST OF AUTHORS
        
        
        ##PRESENTATIONS
        data = paragraph.contents[4:] #HERE ARE ALL THE PRESENTATIONS
        lines = [i.text.split(" – ") for i in data] #THIS WOULD DIVIDE THE TIME FROM THE TEXT
        data = [i[2] for i in lines if len(i)>=2] #TAKE ONLY THE AUTHORS AND THE TITLE

        authors = flatten([find_authors(i) for i in data])
        all_authors.extend(authors)
    return set(all_authors)
        
authors_2019 = parse_oral_page(oral_2019).union(parse_poster_page(poster_2019))


In [6]:
all_authors = list(sorted(set.union(*[authors_2022, authors_2021, authors_2020, authors_2019 ])))

In [7]:
with open('../data/ic2s2_authors.txt', 'w') as f:
    for line in all_authors:
        f.write(f"{line}\n")

In [9]:
authors_2022

820

In [10]:
authors_2019

{'',
 'Aamena Alshamsi',
 'Abe Hofman',
 'Abeer Aldayel',
 'Abhijnan Chakraborty',
 'Abhishek Samantray',
 'Abigail Horn',
 'Abigail Jacobs',
 'Abu Sayeed Mondol',
 'Adam Hughes',
 'Adam Pah',
 'Adina Nerghes',
 'Adrian Weller',
 'Adriana Iamnitchi',
 'Adrien Benamira',
 'Ahmad Alabdulkareem',
 'Akira Ishii',
 'Albert Laszlo Barabasi',
 'Alberto Antonioni',
 'Alberto Sánchez-Carralero',
 'Alejandro Espinosa-Rada',
 'Alejandro Noriega Campero',
 'Aleksandra Aloric',
 'Aleksandra Nenko',
 'Aleksandra Urman',
 'Aleksei Rotmistrov',
 'Alessandra Urbinati',
 'Alessandro Cossard',
 'Alessandro Provetti',
 'Alessandro Rosina',
 'Alex Furman',
 'Alex Pentland',
 'Alex Rutherford',
 'Alex ‘Sandy’ Pentland',
 'Alexander Mantzaris',
 'Alexander Robertson',
 'Alexander Sachs',
 'Alexandra Olteanu',
 'Alexandra Pang',
 'Alexandra Schofield',
 'Alexandre Bovet',
 'Alexandre Leroux',
 'Alexandre P Francisco',
 'Alexandru-Ionut Babeanu',
 'Alfredo Morales',
 'Ali Aghelmaleki',
 'Ali Faqeeh',
 'Ali Hür

In [13]:
len(all_authors)

2745