# Get Paper Info
This notebook gets the paper info for each speaker in the conference. It must be run once for the invited speaker table and then once more for the proceedings table. Please note that running the notebook for the proceedings table will take several days to complete.

In [2]:
# Load packages
# NOTE(review): the cells below use pd, os, np, requests, BeautifulSoup, tqdm,
# time and `filepath` without importing or defining them in this notebook —
# presumably script/setup.py provides all of them; confirm when changing the setup.
%run script/setup.py

# Initial Work

Load the data

In [6]:
# Read the initial fact table that should be enriched with paper counts.
load_file = 'factInvited_init.csv' # or 'factProceedings_init.csv'
df_initial = pd.read_csv(os.path.join(filepath, load_file), encoding='utf-8')

# Column used to look the authors up on dblp: one entry per author,
# without duplicates or missing values, re-indexed from 0.
dblp_name = (
    df_initial['Full name'] # or df_initial['Links'] if data is factProceedings_init.csv
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

Remove disambiguation pages

In [None]:
# Drop every entry whose dblp page is a disambiguation page (a page that
# contains a <div class="remark-box">).
#
# Only entries that are URLs can be fetched. When dblp_name holds plain author
# names (invited-speaker run) there is nothing to check, so those entries are
# skipped instead of letting requests fail on a string without a URL scheme.
remove_idx = []  # index labels of the entries to drop
for idx, disambiguation_page in tqdm(dblp_name.items(), total=len(dblp_name)):
    if not str(disambiguation_page).startswith(("http://", "https://")):
        continue
    page = requests.get(disambiguation_page)
    soup = BeautifulSoup(page.content, "html.parser")
    if soup.find_all("div", {"class": "remark-box"}):
        remove_idx.append(idx)
    # Random 1-5 second pause between requests to be polite to dblp.
    # randint without a size argument returns a scalar, which avoids the
    # deprecated array-to-int conversion of int(np.random.randint(1, 6, 1)).
    time.sleep(np.random.randint(1, 6))

# Drop by index label (not by position) and re-index, so that running this
# cell a second time still removes the right entries.
dblp_name = dblp_name.drop(index=remove_idx).reset_index(drop=True)

## Get data from dblp.org
Run the entire notebook for one dataframe at a time. First the invited speakers then the proceedings.
https://dblp.org/search?q=

### Invited speakers

In [None]:
# Scrape the number of publications per year for every invited speaker.
#
# For each name in dblp_name the dblp author search is queried, the first
# suggested author page is opened, and the entries listed under every year are
# counted. The result is df1 with one row per (author, year):
#   'Full name'  - the name that was searched for
#   'Year'       - text of the year row on the author page
#   'Year Count' - number of counted entries listed under that year

# CSS classes of the rows that matter: the 'year' separator rows plus the
# publication entry types that are counted.
PUBL_ROW_CLASSES = ['year','entry inproceedings toc','entry article toc','entry incollection toc', 'entry book toc','entry editor toc','entry reference toc']

# Per-author frames are collected in a list and concatenated once;
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
author_frames = []

try:
    # This takes four days to run
    for Author in tqdm(dblp_name):
        Year_list = []   # year labels in page order
        paper_list = []  # counted entries per year, aligned with Year_list

        # Search for the author by name. Passing the name through `params`
        # lets requests URL-encode spaces and special characters.
        page = requests.get("https://dblp.org/search", params={"q": Author})
        soup = BeautifulSoup(page.content, "html.parser")

        # Check if there is a page with the person's name
        author_hits = soup.find(id="completesearch-authors")
        control_check = author_hits.find_all("ul", class_="result-list") if author_hits is not None else []
        first_author_link = control_check[0].find("a", href=True) if len(control_check) > 0 else None

        if first_author_link is not None:
            # Enter the page of the first suggested author
            first_author = first_author_link['href']
            author_page = requests.get(first_author)
            soup_of_speaker = BeautifulSoup(author_page.content, "html.parser")

            # A page without a publication section simply yields no rows
            publ_section = soup_of_speaker.find(id="publ-section")
            sections = publ_section.find_all("div", class_="hide-body") if publ_section is not None else []

            for section in sections:
                publ_lists = section.find_all('ul', class_="publ-list")
                if not publ_lists:
                    continue
                rows_in_sections = publ_lists[0].find_all(True, {"class": PUBL_ROW_CLASSES})

                # A row without a <p> child is treated as a year separator;
                # every other row is an entry belonging to the most recent
                # year separator of this section.
                seen_year = False
                for row in rows_in_sections:
                    if row.p is None:
                        Year_list.append(row.text)
                        paper_list.append(0)
                        seen_year = True
                    elif seen_year:
                        paper_list[-1] += 1

        if Year_list:
            author_frames.append(pd.DataFrame({'Full name': Author, 'Year': Year_list, 'Year Count': paper_list}))

        # Random 1-5 second pause between authors. randint without a size
        # argument returns a scalar (no deprecated array-to-int conversion).
        time.sleep(np.random.randint(1, 6))
finally:
    # Build df1 even if the loop is interrupted, so that the authors
    # collected so far are not lost.
    if author_frames:
        df1 = pd.concat(author_frames, ignore_index=True)
    else:
        df1 = pd.DataFrame(columns = ['Full name','Year','Year Count'])

### Proceedings

In [None]:
# Scrape the number of publications per year for every proceedings author.
#
# Every entry of dblp_name is requested directly as the URL of an author page
# (use the 'Links' column when loading the proceedings table), and the entries
# listed under every year are counted. The result is df1 with one row per
# (author, year):
#   'Full name'  - the dblp_name entry that was requested
#   'Year'       - text of the year row on the author page
#   'Year Count' - number of counted entries listed under that year

# CSS classes of the rows that matter: the 'year' separator rows plus the
# publication entry types that are counted.
PUBL_ROW_CLASSES = ['year','entry inproceedings toc','entry article toc','entry incollection toc', 'entry book toc','entry editor toc','entry reference toc']

# Per-author frames are collected in a list and concatenated once;
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
author_frames = []

try:
    # This takes four days to run
    for Author in tqdm(dblp_name):
        Year_list = []   # year labels in page order
        paper_list = []  # counted entries per year, aligned with Year_list

        # The entry itself is used as the address of the author page
        dblp_URL = Author

        # Enter the page about the chosen author
        author_page = requests.get(dblp_URL)
        soup_of_speaker = BeautifulSoup(author_page.content, "html.parser")

        # A page without a publication section simply yields no rows
        publ_section = soup_of_speaker.find(id="publ-section")
        sections = publ_section.find_all("div", class_="hide-body") if publ_section is not None else []

        for section in sections:
            publ_lists = section.find_all('ul', class_="publ-list")
            if not publ_lists:
                continue
            rows_in_sections = publ_lists[0].find_all(True, {"class": PUBL_ROW_CLASSES})

            # A row without a <p> child is treated as a year separator; every
            # other row is an entry belonging to the most recent year
            # separator of this section.
            seen_year = False
            for row in rows_in_sections:
                if row.p is None:
                    Year_list.append(row.text)
                    paper_list.append(0)
                    seen_year = True
                elif seen_year:
                    paper_list[-1] += 1

        if Year_list:
            author_frames.append(pd.DataFrame({'Full name': Author, 'Year': Year_list, 'Year Count': paper_list}))

        # Random 1-5 second pause between authors. randint without a size
        # argument returns a scalar (no deprecated array-to-int conversion).
        time.sleep(np.random.randint(1, 6))
finally:
    # Build df1 even if the loop is interrupted, so that the authors
    # collected so far are not lost.
    if author_frames:
        df1 = pd.concat(author_frames, ignore_index=True)
    else:
        df1 = pd.DataFrame(columns = ['Full name','Year','Year Count'])

## Preprocess the collected data

In [11]:
# Turn the scraped per-year counts into cumulative counts per author.
# NOTE: this cell rewrites df1 in place, so running it twice accumulates the
# already-cumulative counts again; re-run the scraping cell first if needed.

# De-duplicate, cast the scraped text to integers and order chronologically.
df1 = (
    df1.drop_duplicates()
    .reset_index(drop=True)
    .astype({'Year': int, 'Year Count': int})
    .sort_values(by=['Year'], ascending=True)
)

# Running total of papers per author over the years (relies on the sort above).
cumulative_counts = df1.groupby('Full name')['Year Count'].cumsum()
df1['Year Count'] = cumulative_counts

# Per-author constants, repeated on every row of that author:
# the total number of papers and the year of the first paper.
df1['max_year_count'] = df1.groupby('Full name')['Year Count'].transform('max')
df1['First year paper'] = df1.groupby('Full name')['Year'].transform('min')

## Combine data and save as factTable

In [None]:
# Look up the cumulative paper count of an author at a given year
def get_paper_count(name, year):
    """Return the 'Year Count' of `name` for the latest row with Year <= `year`.

    Reads the module-level frame `df1` and picks the last matching row in the
    frame's current row order, so `df1` is expected to be sorted by 'Year'.
    Returns 0 when no row matches the name and year limit.
    """
    is_match = (df1['Full name'] == name) & (df1['Year'] <= year)
    counts_so_far = df1.loc[is_match, 'Year Count']
    return 0 if counts_so_far.empty else counts_so_far.iloc[-1]

In [None]:
# Attach the per-author paper statistics to the initial table.
# One row per author is enough for the join: max_year_count and
# 'First year paper' are identical on all rows of an author.
# NOTE(review): in the proceedings run df1['Full name'] holds the dblp_name
# entries used for scraping (links) — confirm the join key still matches
# df_initial['Full name'].
author_summary = df1.drop_duplicates(subset=['Full name'])
merged = df_initial.merge(author_summary, on='Full name', how='left')

# Replace the joined 'Year Count' with the cumulative count up to the row's own year
merged['Year Count'] = merged.apply(
    lambda record: get_paper_count(record['Full name'], record['Year_x']),
    axis=1,
)

# Authors without any scraped data get 0, then the helper column is dropped
# and the final column names are applied
merged = (
    merged
    .fillna({'max_year_count': 0, 'First year paper': 0})
    .drop(columns=['Year_y'])
    .rename(columns={'Year_x': 'Year', 'Year Count': 'Paper Count', 'max_year_count': 'Max Paper Count'})
)
merged.head(10)

In [None]:
# Write the finished fact table next to the input data (without the index column)
filename = "factInvited.csv" # or factProceedings.csv
output_path = os.path.join(filepath, filename)
merged.to_csv(output_path, index=False)
print("The file is now saved")