# Import libraries

In [10]:
import numpy as np
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Load data

In [11]:
country_list_url = "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv"
df_country = pd.read_csv(country_list_url)
df_country = df_country[["name", "region"]]
df_country.columns = ["Country", "Region"]
df_country = df_country.fillna("Undefined")
df_country

Unnamed: 0,Country,Region
0,Afghanistan,Asia
1,Åland Islands,Europe
2,Albania,Europe
3,Algeria,Africa
4,American Samoa,Oceania
...,...,...
244,Wallis and Futuna,Oceania
245,Western Sahara,Africa
246,Yemen,Asia
247,Zambia,Africa


# Define functions

In [12]:
def extract_external_link(cfp_link):
    page = requests.get(cfp_link)
    result = BeautifulSoup(page.content, "html.parser")

    link = result.find(lambda tag: tag.name == "td" and "Link" in tag.text)
    link = link.text.strip()[6:]

    return link

In [13]:
def extract_cfp_link(query):
    crawled_tables = pd.read_html(query, extract_links = "body")
    if len(crawled_tables) < 6:
        df_filtered = None
        print("No event found!")
    else:
        # Extract the CFP table
        df = crawled_tables[3]

        # Rename columns
        df.columns = ["Event", "When", "Where", "Deadline"]
        df = df[1:]

        # Extract links
        df["Abbreviation"] = df["Event"].apply(lambda x: x[0])
        df["Link"] = df["Event"].apply(lambda x: "http://www.wikicfp.com" + x[1])
        df_filtered = df[["Abbreviation", "Link"]]

        # Drop duplicates
        df_filtered = df_filtered.drop_duplicates()

        # Extract external links
        df_filtered["CFP Link"] = df_filtered["Link"].apply(extract_external_link)
        df_filtered = df_filtered[["Abbreviation", "CFP Link"]]

        df_filtered = df_filtered.drop_duplicates()

    return df_filtered

# TO-DO: solve cases where location is university or country code (USA)
def extract_country(location_name):
    """

    :param location_name:
    :return:
    """
    country_name = location_name
    if location_name is not np.NAN:
        if "," in location_name:
            country_name = location_name.split(",")[-1].strip()
    return country_name

In [14]:
def main_crawler(query):
    # Get all tables
    tables = pd.read_html(query)

    df_cfp = None
    if len(tables) < 5:
        print("No CFP found!")
    else:
        # Extract the CFP table
        df_cfp = tables[2]
        df_cfp.columns = ["Event", "When", "Where", "Deadline"]
        df_cfp = df_cfp[1:]
        df_cfp = df_cfp.drop_duplicates()

    full_name_arr = []
    time_arr = []
    location_arr = []
    deadline_arr = []
    type_arr = []
    #link  # To be added

    event_arr = df_cfp["Event"].unique()

    for event in event_arr:
        df_cfp_filtered = df_cfp[df_cfp["Event"] == event]

        full_name = df_cfp_filtered.iloc[0, 1]
        full_name_arr.append(full_name)

        time_arr.append(df_cfp_filtered.iloc[1, 1])

        location_arr.append(df_cfp_filtered.iloc[1, 2])

        deadline_arr.append(df_cfp_filtered.iloc[1, 3])

    df_cfp_cleaned = pd.DataFrame({
        "Abbreviation": event_arr,
        "Name": full_name_arr,
        "Time": time_arr,
        "Location": location_arr,
        "Deadline": deadline_arr
    })

    # Add country name
    df_cfp_cleaned["Country"] = df_cfp_cleaned["Location"].apply(extract_country)
    df_cfp_cleaned = pd.merge(df_cfp_cleaned, df_country, on = "Country", how = "left")

    # Add links
    df_links = extract_cfp_link(query)
    df_final = pd.merge(df_cfp_cleaned, df_links, on = "Abbreviation", how = "left")

    # Fill NaNs
    df_final = df_final.fillna("Undefined")

    # Sort columns
    df_final = df_final[["Abbreviation", "Name", "Time", "Deadline", "Location", "Country", "Region", "CFP Link"]]

    return df_final

# Crawl data

### TO-DO:
- Add a filter: by location, by time
- Optimize the query (make it faster)

In [15]:
# Define queries
keywords = "process control, manufacturing"

if "," in keywords:
    keyword_arr = keywords.split(',')
    keyword_arr = [text.strip().replace(" ", "+") for text in keyword_arr]
    keyword = "%2C+".join(keyword_arr)
else:
    keyword = keywords

year = "t" # t = this year, n = next year, f = from this year onward, a = all
query = "http://www.wikicfp.com/cfp/servlet/tool.search?q={}&year={}".format(keyword, year)
query

'http://www.wikicfp.com/cfp/servlet/tool.search?q=process+control%2C+manufacturing&year=t'

In [16]:
df_output = main_crawler(query)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Abbreviation"] = df["Event"].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Link"] = df["Event"].apply(lambda x: "http://www.wikicfp.com" + x[1])


In [17]:
df_output

Unnamed: 0,Abbreviation,Name,Time,Deadline,Location,Country,Region,CFP Link
0,CRET 2023,"2023 International Conference on Control, Robo...","Jul 14, 2023 - Jul 16, 2023","Mar 1, 2023","Paris, France",France,Europe,http://www.cret.net/
1,DKMP 2023,11th International Conference on Data Mining &...,"Mar 18, 2023 - Mar 19, 2023","Jan 7, 2023","Vienna, Austria",Austria,Europe,http://ccsea2023.org/dkmp/index
2,CDC 2023,IEEE Conference on Decision and Control,"Dec 13, 2023 - Dec 15, 2023","Mar 17, 2023","Marina Bay Sands, Singapore",Singapore,Asia,https://cdc2023.ieeecss.org/
3,SIUSAI 2023,2023 International Symposium on Intelligent Un...,"Apr 21, 2023 - Apr 23, 2023","Mar 20, 2023","Shenzhen, China",China,Asia,http://www.siusai.org/
4,ICoCTA 2023,2023 4th International Conference on Control T...,"Oct 20, 2023 - Oct 22, 2023","Sep 10, 2023","Xiamen, Fujian, China",China,Asia,http://www.icocta.org/
5,Machines - SI 2023,Special Issue - Advances in Digital Twin Techn...,"Nov 9, 2022 - Dec 31, 2023","Dec 31, 2023",Undefined,Undefined,Undefined,https://www.mdpi.com/journal/machines/special_...
6,CoEEB 2023,2023 International Joint Conference on Environ...,"May 19, 2023 - May 21, 2023","Mar 31, 2023","Stockholm, Sweden",Sweden,Europe,http://www.coeeb.org/
7,SOFTPA 2023,2nd International Conference on Emerging Pract...,"Apr 29, 2023 - Apr 30, 2023","Jan 7, 2023","Copenhagen, Denmark",Denmark,Europe,https://csita2023.org/softpa/index
8,ICACR--Ei 2023,2023 the 7th International Conference on Autom...,"Aug 4, 2023 - Aug 6, 2023","Mar 15, 2023","Kuala Lumpur, Malaysia",Malaysia,Asia,http://www.icacr.org/
9,ICDM 2023,2023 3rd International Conference on Digital M...,"Mar 24, 2023 - Mar 26, 2023","Jan 25, 2023","Ho Chi Minh City, Vietnam",Vietnam,Undefined,http://icdm.net


In [18]:
# Get additional information
# crawled_tables = pd.read_html(query)
# crawled_tables[1]