In [1]:
from bs4 import BeautifulSoup 
import requests 
import re
import numpy as np

In [None]:
url = "https://ic2s2-2023.org/program"
r = requests.get(url) 
soup = BeautifulSoup(r.content) 

data = soup.find_all("i")

text = str(data)

cleaned_text = re.sub(r'<[^>]+>', '', text)

cleaned_text = re.sub(r'Chair:', '', cleaned_text)

names = [name.strip().lower() for name in cleaned_text.split(',')]

names[0] = names[0][2:]
names[-1] = names[0][:-1]

print(names)

[<i>Chair: Claudia Wagner</i>, <i><u>Jonas L Juul</u>, Jon Kleinberg</i>, <i><u>Chloe Ahn</u>, Xinyi Wang</i>, <i><u>Giuseppe Russo</u>, luca verginer, Manoel Horta Ribeiro, Giona Casiraghi</i>, <i><u>Almog Simchon</u>, Adam Sutton, Matthew Edwards, Stephan Lewandowsky</i>, <i><u>Arianna Pera</u>, Manuel Vimercati, Matteo Palmonari</i>, <i><u>Mohammed Alsobay</u>, Abdullah Almaatouq, David G. Rand, Duncan J. Watts</i>, <i>Sara Venturini, <u>Satyaki Sikdar</u>, Francesco Rinaldi, Francesco Tudisco, Santo Fortunato</i>, <i><u>Isabella Loaiza</u>, Takahiro Yabe, Alex Pentland</i>, <i>Chair: Taha Yasseri</i>, <i><u>Silvia De Sojo Caso</u>, Mia Ann Jørgensen, Sune Lehmann, Laura Alessandretti</i>, <i><u>Emil Bakkensen Johansen</u>, Mathias Wullum Nielsen</i>, <i><u>Rubén Rodríguez Casañ</u>, Antonio Ariño Villarroya</i>, <i><u>Sunny Rai</u>, Ashley Francisco, Salvatore Giorgi, Brenda Curtis, Lyle Ungar, Sharath Chandra Guntuku</i>, <i><u>Allison Koenecke</u>, Eric Giannella, Robb Willer, Sh

In [None]:
for elem in data:
    name = "Linda Steg"
    if name in str(elem):
        print(f"{name} found")
print(f"{name} not found")

# Seems that no keynote speakers included

Linda Steg not found


In [None]:
soup = BeautifulSoup(r.text, "html.parser")

keynote_elements = soup.find_all("td", colspan="100%")

names_keynote = []
for elem in keynote_elements:
    if "keynotes" in str(elem):
        index = str(elem).find("Keynote - ")
        name = str(elem)[index+10:-13]
        names_keynote.append(name.lower())

print(names_keynote)

allnames = names + names_keynote

['jevin west', 'linda steg', 'sharad goel', 'molly crockett', 'lisa anne hendriks', 'stefan gössling', 'joanna bryson', 'tim althoff', 'lauren brent', 'esteban moro']


In [None]:
# NOTE: To find all unique names we used chatgpt to help us filtering the names.

# Steps:
# 1.	Check for exact duplicates.
# 2.	Use fuzzy matching to detect minor spelling differences.
# 3.	Handle middle name variations (keep the longest version)

from thefuzz import fuzz

# Function to check if one name is a short version of another
def is_shorter_version(name1, name2):
    name1_parts = set(name1.split())
    name2_parts = set(name2.split())
    return name1_parts.issubset(name2_parts) or name2_parts.issubset(name1_parts)

# Filter out duplicates with minor differences
unique_names = []
for name in allnames:
    name  = name.title().strip()
    found_duplicate = False
    for unique in unique_names: 
        # Check for minor spelling differences
        if fuzz.ratio(name, unique) > 90:
            found_duplicate = True
            break
        # Check if one is a shorter version of another
        if is_shorter_version(name, unique):
            if len(name) > len(unique):  # Keep the longer version
                unique_names.remove(unique)
                unique_names.append(name)
            found_duplicate = True
            break
    if not found_duplicate:
        unique_names.append(name)

# Print cleaned list
print(unique_names)
print(len(unique_names))

['Claudia Wagner', 'Jonas L Juul', 'Jon Kleinberg', 'Chloe Ahn', 'Xinyi Wang', 'Giuseppe Russo', 'Luca Verginer', 'Manoel Horta Ribeiro', 'Giona Casiraghi', 'Almog Simchon', 'Adam Sutton', 'Matthew Edwards', 'Stephan Lewandowsky', 'Arianna Pera', 'Manuel Vimercati', 'Matteo Palmonari', 'Mohammed Alsobay', 'Abdullah Almaatouq', 'David G. Rand', 'Duncan J. Watts', 'Sara Venturini', 'Satyaki Sikdar', 'Francesco Rinaldi', 'Francesco Tudisco', 'Santo Fortunato', 'Isabella Loaiza', 'Takahiro Yabe', 'Alex Pentland', 'Taha Yasseri', 'Silvia De Sojo Caso', 'Mia Ann Jørgensen', 'Sune Lehmann', 'Emil Bakkensen Johansen', 'Mathias Wullum Nielsen', 'Rubén Rodríguez Casañ', 'Antonio Ariño Villarroya', 'Sunny Rai', 'Ashley Francisco', 'Salvatore Giorgi', 'Brenda Curtis', 'Lyle Ungar', 'Sharath Chandra Guntuku', 'Allison Koenecke', 'Eric Giannella', 'Robb Willer', 'Sharad Goel', 'Ziv Epstein', 'Hause Lin', 'Levin Brinkmann', 'Bramantyo Supriyatno', 'Iyad Rahwan', 'Sandro Ferreira Sousa', 'Vincenzo Nic

### 6. Explaining the process.

Looking into the html code of the website it was quickly seen that most names were written in italics \<i> . This included the two talks categories and posters. But some names were also with underscore \<u> and often many names where listed together. To clean this the library RegEx (re) was used to filter different all found cases (including filtering "Chair:"). 

After checking names from different areas in the webpage we found that we were only missing the the keynote speakers at the top of the webpage. It was discovered that "td", colspan="100%" was unique for the lines with Keynote speakers. And the found names were added. Lastly, all names names where cleaned and we searched for dublicates (including spelling mistakes and handling different versions of middle names).

Name count = 1455