# Get Paper Info
These notebook will get the paper info for each speakers in the conference. The attributes are: Number of papers (by year), first paper year, number of citation (by year), country

In [1]:
%run setup.py

# Invited

## Load the data

In [6]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
df_new  = pd.read_csv(os.path.join(filepath,'factInvited_init.csv'), encoding='utf-8')
# df_base  = pd.read_csv(os.path.join(filepath,'factInvited.csv'), encoding='utf-8')

# Find new names
# df_new = df_new[~df_new['Full name'].isin(df_base['Full name'])].reset_index(drop=True)

dblp_name = df_new['Full name'].str.replace(' ', '%20')
dblp_name = dblp_name.drop_duplicates().reset_index(drop=True)

# drop nan values from the list
dblp_name = dblp_name.dropna().reset_index(drop=True)

## Get data from dblp.org

In [8]:
df1 = pd.DataFrame(columns = ['Full name','Year','Year Count'])

for Author in tqdm(dblp_name):
    author_list = []
    paper_list = []
    
    ##### Check if author is on dblp #####
    dblp_URL = "https://dblp.org/search?q=" + Author
    page = requests.get(dblp_URL)
    soup = BeautifulSoup(page.content, "html.parser")
    control_check = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")
    if len(control_check) > 0:
        first_author = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")[0].find_all("a",href=True)[0]['href']
        # print(Author," is on the webpage and the process continues...")
        
        ##### Get info from author #####
        author_page = requests.get(first_author)
        soup_of_speaker = BeautifulSoup(author_page.content, "html.parser")

        Year_list = []

        sections = soup_of_speaker.find(id="publ-section").find_all("div", class_="hide-body")
        for s in range(0,len(sections)): # Nu går vi ind på hver enkelt sektion
            rows_in_sections = sections[s].find_all('ul', class_="publ-list")[0].findAll(True, {"class":['year','entry inproceedings toc','entry article toc','entry incollection toc', 'entry book toc','entry editor toc','entry reference toc']})
            last_row_idx = 0
            
            for row in range(0,len(rows_in_sections)): # Nu går vi ind på hver enkelt række
                #### Append year multipliers
                if rows_in_sections[row].p == None:
                    Year_list.append(rows_in_sections[row].text)
                    paper_list.append(row-last_row_idx-1)
                    last_row_idx = row 
                if row == len(rows_in_sections)-1: # den sidste linje i hver sektion
                    paper_list.append(len(rows_in_sections)-last_row_idx-1)
            
            # remove all -1 values from the list
            paper_list = [x for x in paper_list if x >= 0]
                
        # create a dataframe
        df2 = pd.DataFrame(columns = ['Full name','Year','Year Count'])
        # append full name to the dataframe
        
        df2['Year'] = Year_list
        df2['Year Count'] = paper_list
        df2['Full name'] = Author
        df1 = df1.append(df2, ignore_index=True)
    time.sleep(1)
        

100%|██████████| 1134/1134 [1:03:42<00:00,  3.37s/it]


In [9]:
df1.to_csv(os.path.join(filepath,"paper_count_Invited_new.csv"), index=False)
df3 = df1.copy()

## Combine data and save as factTable

In [10]:
# Load the initial factTable
df  = pd.read_csv(os.path.join(filepath,'factInvited_init.csv'), encoding='utf-8')
# Load the table with paper count per year
# df2 = pd.read_csv(os.path.join(filepath,'paper_count_per_year_proceedings.csv'), encoding='utf-8')
# concat df1 and df2
# df1 = pd.concat([df2,df3], axis=0, ignore_index=True)

In [11]:
df1['Full name'] = df1['Full name'].str.replace('%20', ' ')
df1 = df1.drop_duplicates().reset_index(drop=True)
df1['Year'] = df1['Year'].astype(int)
df1['Year Count'] = df1['Year Count'].astype(int)

df1 = df1.sort_values(by=['Year'], ascending=True)

df1['Year Count'] = df1.groupby('Full name')['Year Count'].cumsum()

df1['max_year_count'] = df1.groupby('Full name')['Year Count'].transform('max')
df1['First year paper'] = df1.groupby('Full name')['Year'].transform('min')

In [12]:
# create a function to get the paper count for a given name and year
def get_paper_count(name, year):
    df = df1[df1['Full name'] == name]
    paper_count = df[df['Year'] <= year]['Year Count']
    if len(paper_count) == 0:
        return 0
    return paper_count.iloc[-1]

# merge the dataframes and add a new column with the paper count
merged = pd.merge(df, df1.drop_duplicates(subset=['Full name']), on='Full name', how='left')
merged['Year Count'] = merged.apply(lambda x: get_paper_count(x['Full name'], x['Year_x']), axis=1)

# Clean up and control check
print("Number of rows in original table:",df.shape[0], " and umber of rows in merged table:",merged.shape[0])
print("Pct. of speakers whose information I was able to collect",round(((df['Full name'].nunique()-df[~df['Full name'].isin(df1['Full name'])].shape[0])/df1['Full name'].nunique())*100,2),"%")
merged['max_year_count'] = merged['max_year_count'].fillna(0)
merged['First year paper'] = merged['First year paper'].fillna(0)
merged = merged.drop(columns=['Year_y'])
merged = merged.rename(columns={'Year_x': 'Year', 'Year Count': 'Paper Count', 'max_year_count': 'Max Paper Count'})
merged.to_csv(os.path.join(filepath, "factInvited.csv"), index=False)
print("The file is now saved")
merged.head(10)

Number of rows in original table: 1343  and umber of rows in merged table: 1343
Pct. of speakers whose information I was able to collect 99.62 %
The file is now saved


Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper
0,Glovanni De WIichcli,2003,ASPDAC,Glovanni,De,WIichcli,M,0,0.0,0.0
1,Tadahiro Ohm,2003,ASPDAC,Tadahiro,,Ohm,M,27,92.0,1993.0
2,Ycrvant Zorian,2003,ASPDAC,Ycrvant,,Zorian,M,0,0.0,0.0
3,Gary L. Baldwi,2004,ASPDAC,Gary,L.,Baldwi,M,4,4.0,1962.0
4,Rudy Lauwereins,2004,ASPDAC,Rudy,,Lauwereins,M,112,196.0,1987.0
5,Rajeev Madhavan,2005,ASPDAC,Rajeev,,Madhavan,M,5,5.0,1994.0
6,Jan M. Rabaey,2005,ASPDAC,Jan,M.,Rabaey,M,146,321.0,1985.0
7,Zhenghua Jiang,2005,ASPDAC,Zhenghua,,Jiang,M,1,1.0,2005.0
8,Alberto Sangiovanni-Vincentelli,2006,ASPDAC,Alberto,,Sangiovanni-Vincentelli,M,458,691.0,1973.0
9,Satoru Ito,2006,ASPDAC,Satoru,,Ito,M,3,3.0,2000.0


In [85]:
merged['Conference (short)'].unique()

array(['ASPDAC', 'CHI', 'ECCV', 'HiPC', 'ic2s2', 'ICML', 'IJCAI', 'KDD',
       'LICS', 'RTA', 'SC', 'SIGGRAPH', 'SODA', 'STOC', 'SWAT', 'WADS',
       'WoLLIC', 'WWW', 'AAAI'], dtype=object)

# Proceedings

## Load the data

In [2]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
df_new  = pd.read_csv(os.path.join(filepath,'factProceedings_init1.csv'), encoding='utf-8')
df_base  = pd.read_csv(os.path.join(filepath,'factProceedings.csv'), encoding='utf-8')
df_base = df_base[~df_base['Full name'].isna()]

def fix_encoding(s):
    return s.encode('latin1').decode('utf-8')

# apply the function to the column using the apply method
df_base['Full name'] = df_base['Full name'].apply(fix_encoding)

# Find new names
df_new = df_new[~df_new['Full name'].isin(df_base['Full name'])].reset_index(drop=True)
df_new = df_new[df_new['Year'] >= 2003]
df_new = df_new[~df_new['Conference (short)'].isin(['CVPR','ICIP','ECCV','RTA'])]
df_new = df_new[df_new['Conference (short)']=="NeurIPS"]

dblp_name = df_new['Full name'].str.replace(' ', '%20')
dblp_name = dblp_name.drop_duplicates().reset_index(drop=True)

# I will append these in the end
df_base = df_base[df_base['Conference (short)']!="NeurIPS"]

# drop nan values from the list
dblp_name = dblp_name.dropna().reset_index(drop=True)
dblp_name

0                    Nick%20Whiteley
1                       Annie%20Gray
2           Patrick%20Rubin-Delanchy
3                       Jason%20Kuen
4                Vlad%20I.%20Morariu
                    ...             
4903          Zahra%20Rahimi%20Afzal
4904              Anthony%20Ndirango
4905                     Tyler%20Lee
4906                 Hamid%20Aghajan
4907    Mohammad%20Reza%20Keshtkaran
Name: Full name, Length: 4908, dtype: object

## Get data from dblp.org

Nu har jeg df_base som er min gamle df_proceedings med rigtige uft-8 navne.
Når koden under har kørt skal jeg lægge de nye data sammen med df (df_proceedings) som KUN indeholder NeurIPS data. Så merger jeg. 
Når den data frame er helt færdig skal den lægges sammen med df_base.

In [3]:
df1 = pd.DataFrame(columns = ['Full name','Year','Year Count'])

for Author in tqdm(dblp_name):
    author_list = []
    paper_list = []
    
    ##### Check if author is on dblp #####
    dblp_URL = "https://dblp.org/search?q=" + Author
    page = requests.get(dblp_URL)
    soup = BeautifulSoup(page.content, "html.parser")
    control_check = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")
    if len(control_check) > 0:
        first_author = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")[0].find_all("a",href=True)[0]['href']
        # print(Author," is on the webpage and the process continues...")
        
        ##### Get info from author #####
        author_page = requests.get(first_author)
        soup_of_speaker = BeautifulSoup(author_page.content, "html.parser")

        Year_list = []

        sections = soup_of_speaker.find(id="publ-section").find_all("div", class_="hide-body")
        for s in range(0,len(sections)): # Nu går vi ind på hver enkelt sektion
            rows_in_sections = sections[s].find_all('ul', class_="publ-list")[0].findAll(True, {"class":['year','entry inproceedings toc','entry article toc','entry incollection toc', 'entry book toc','entry editor toc','entry reference toc']})
            last_row_idx = 0
            
            for row in range(0,len(rows_in_sections)): # Nu går vi ind på hver enkelt række
                #### Append year multipliers
                if rows_in_sections[row].p == None:
                    Year_list.append(rows_in_sections[row].text)
                    paper_list.append(row-last_row_idx-1)
                    last_row_idx = row 
                if row == len(rows_in_sections)-1: # den sidste linje i hver sektion
                    paper_list.append(len(rows_in_sections)-last_row_idx-1)
            
            # remove all -1 values from the list
            paper_list = [x for x in paper_list if x >= 0]
                
        # create a dataframe
        df2 = pd.DataFrame(columns = ['Full name','Year','Year Count'])
        # append full name to the dataframe
        
        df2['Year'] = Year_list
        df2['Year Count'] = paper_list
        df2['Full name'] = Author
        df1 = df1.append(df2, ignore_index=True)
    time.sleep(1)
        

100%|██████████| 4908/4908 [2:56:16<00:00,  2.15s/it]  


In [4]:
df1.to_csv(os.path.join(filepath,"paper_count_Proceedings_new1.csv"), index=False)
df3 = df1.copy()

## Combine data and save as factTable

In [10]:
df  = pd.read_csv(os.path.join(filepath,'factProceedings_init1.csv'), encoding='utf-8')
df = df[df['Conference (short)'] == "NeurIPS"]

In [45]:
# Load the initial factTable
df  = pd.read_csv(os.path.join(filepath,'factProceedings_init.csv'), encoding='utf-8')
# Load the table with paper count per year
# df2 = pd.read_csv(os.path.join(filepath,'paper_count_proceedings_old.csv'), encoding='utf-8')
# concat df1 and df2
# df1 = pd.concat([df2,df3], axis=0, ignore_index=True)

In [11]:
df1['Full name'] = df1['Full name'].str.replace('%20', ' ')
df1 = df1.drop_duplicates().reset_index(drop=True)
df1['Year'] = df1['Year'].astype(int)
df1['Year Count'] = df1['Year Count'].astype(int)

df1 = df1.sort_values(by=['Year'], ascending=True)

df1['Year Count'] = df1.groupby('Full name')['Year Count'].cumsum()

df1['max_year_count'] = df1.groupby('Full name')['Year Count'].transform('max')
df1['First year paper'] = df1.groupby('Full name')['Year'].transform('min')

In [23]:
# create a function to get the paper count for a given name and year
def get_paper_count(name, year):
    df = df1[df1['Full name'] == name]
    paper_count = df[df['Year'] <= year]['Year Count']
    if len(paper_count) == 0:
        return 0
    return paper_count.iloc[-1]

# merge the dataframes and add a new column with the paper count
merged = pd.merge(df, df1.drop_duplicates(subset=['Full name']), on='Full name', how='left')
merged['Year Count'] = merged.apply(lambda x: get_paper_count(x['Full name'], x['Year_x']), axis=1)

# Clean up and control check
print("Number of rows in original table:",df.shape[0], " and umber of rows in merged table:",merged.shape[0])
print("Pct. of speakers whose information I was able to collect",round(((df['Full name'].nunique()-df[~df['Full name'].isin(df1['Full name'])].shape[0])/df1['Full name'].nunique())*100,2),"%")
merged['max_year_count'] = merged['max_year_count'].fillna(0)
merged['First year paper'] = merged['First year paper'].fillna(0)
merged = merged.drop(columns=['Year_y'])
merged = merged.rename(columns={'Year_x': 'Year', 'Year Count': 'Paper Count', 'max_year_count': 'Max Paper Count'})


Number of rows in original table: 26087  and umber of rows in merged table: 26087
Pct. of speakers whose information I was able to collect -156.7 %


In [24]:
df6 = df_base.append(merged, ignore_index=True)

In [26]:
df6.head()

Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper,Age,Conference (long),Main Topic,Productivity
0,Hiroto Yasuura,2003.0,ASPDAC,Hiroto,,Yasuura,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
1,Farzan Fallah,2003.0,ASPDAC,Farzan,,Fallah,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
2,Satoshi Komatsu,2003.0,ASPDAC,Satoshi,,Komatsu,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
3,Masahiro Fujita,2003.0,ASPDAC,Masahiro,,Fujita,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
4,Sri Parameswaran,2003.0,ASPDAC,Sri,,Parameswaran,F,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0


In [29]:
df6.to_csv(os.path.join(filepath, "factProceedings.csv"), index=False)
print("The file is now saved")
df6.head(10)

The file is now saved


Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper,Age,Conference (long),Main Topic,Productivity
0,Hiroto Yasuura,2003.0,ASPDAC,Hiroto,,Yasuura,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
1,Farzan Fallah,2003.0,ASPDAC,Farzan,,Fallah,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
2,Satoshi Komatsu,2003.0,ASPDAC,Satoshi,,Komatsu,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
3,Masahiro Fujita,2003.0,ASPDAC,Masahiro,,Fujita,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
4,Sri Parameswaran,2003.0,ASPDAC,Sri,,Parameswaran,F,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
5,Haris Lekatsas,2003.0,ASPDAC,Haris,,Lekatsas,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
6,Kento Yamaoka,2003.0,ASPDAC,Kento,,Yamaoka,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
7,Soroush Abbaspour,2003.0,ASPDAC,Soroush,,Abbaspour,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
8,Atsushi Sakai,2003.0,ASPDAC,Atsushi,,Sakai,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
9,Takashi Yamada,2003.0,ASPDAC,Takashi,,Yamada,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0


In [30]:
df6[df6['Conference (short)']=="NeurIPS"]['Year'].unique()

array([2021., 2003., 2004., 2005., 2006., 2007., 2008., 2009., 2010.,
       2011., 2012., 2013., 2014., 2015., 2016., 2017., 2018., 2019.])