# Get Paper Info
This notebook collects publication information for each speaker at the conferences. The attributes are: number of papers (by year), first paper year, number of citations (by year), and country.

In [3]:
%run setup.py

### Load the data

In [36]:
def append_data(folder):
    """Load every CSV file in ``folder`` into a single DataFrame.

    The base name of each file (directory and extension removed) is split on
    ``'_'`` and its first piece is stored in a new ``'Conference (short)'``
    column. The ``'Field'`` column and the temporary ``'File name'`` column are
    dropped, and ``'Key Note Speaker'`` is dropped as well when it exists.

    Parameters
    ----------
    folder : str
        Directory that contains the ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked together with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If ``folder`` contains no ``*.csv`` files.
    KeyError
        If none of the files provides a ``'Field'`` column.
    """
    # Sorted so the row order (and any positional slicing of the result) is
    # reproducible; glob returns matches in arbitrary, OS-dependent order.
    all_files = sorted(glob.glob(folder + "/*.csv"))
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in {folder!r}")

    frames = []
    for file in all_files:
        df_temp = pd.read_csv(file, index_col=None)
        df_temp['File name'] = os.path.splitext(os.path.basename(file))[0]
        frames.append(df_temp)

    # DataFrame.append was removed in pandas 2.0; one concat at the end also
    # avoids re-copying the accumulated frame on every iteration.
    df = pd.concat(frames, ignore_index=True)
    df['Conference (short)'] = df['File name'].str.split('_').str[0]
    df = df.drop(["Field", "File name"], axis=1)
    # Only some input files carry this column, so its absence is not an error.
    df = df.drop(columns=["Key Note Speaker"], errors="ignore")
    return df

# Read all keynote-speaker CSVs into one frame and report its size.
df = append_data('Key Note Speakers')
print("Number of rows:", df.shape[0])
print("Number of unique speakers:", df['Full name'].nunique())

# Speaker name with every space swapped for '%20'; the dblp section appends
# this value directly to the search URL.
df['dblp name'] = df['Full name'].str.replace(' ', '%20', regex=False)
df.head()

# df = df.iloc[1243:].copy()

Number of rows: 2044
Number of unique speakers: 1797


Unnamed: 0,Full name,Year,Sex,Conference (short),dblp name
0,Atsushi Asada,1995,0,ASPDAC,Atsushi%20Asada
1,Jim Meadlock,1995,0,ASPDAC,Jim%20Meadlock
2,John Darringer,1995,0,ASPDAC,John%20Darringer
3,Tatsuo Izawa,1997,0,ASPDAC,Tatsuo%20Izawa
4,Daniel D. Gajski,1997,0,ASPDAC,Daniel%20D.%20Gajski


## dblp.org

In [27]:
# CSS class values of the elements kept from a dblp publication list: the
# year headers plus the publication types that are counted.
DBLP_ROW_CLASSES = ['year', 'entry inproceedings toc', 'entry article toc',
                    'entry incollection toc', 'entry book toc',
                    'entry editor toc', 'entry reference toc']
DBLP_TIMEOUT = 30  # seconds; without a timeout one stalled request blocks the whole run


def _papers_per_year(rows):
    """Tally the publication entries that follow each year header.

    ``rows`` is the ordered mix of year-header and publication elements of one
    section; an element without a ``<p>`` child is treated as a year header.
    Returns two parallel lists: the year labels and the number of entries
    listed under each of them.
    """
    years, counts = [], []
    for row in rows:
        if row.p is None:  # year header: opens a new bucket
            years.append(row.text)
            counts.append(0)
        elif counts:  # publication entry: belongs to the most recent year
            counts[-1] += 1
        # An entry seen before any year header has no year it could be
        # assigned to and is skipped, so both lists always stay aligned.
    return years, counts


def _fetch_author_counts(session, Author):
    """Look up ``Author`` on dblp and count the papers of the first hit.

    Returns ``(years, counts)`` for the first author in the search result, or
    ``None`` when the search page lists no matching author.
    """
    ##### Check if author is on dblp #####
    search_page = session.get("https://dblp.org/search?q=" + Author, timeout=DBLP_TIMEOUT)
    search_page.raise_for_status()
    soup = BeautifulSoup(search_page.content, "html.parser")

    # find() returns None when the element is missing, so test before chaining.
    authors_box = soup.find(id="completesearch-authors")
    if authors_box is None:
        return None
    result_lists = authors_box.find_all("ul", class_="result-list")
    if not result_lists:
        return None
    author_links = result_lists[0].find_all("a", href=True)
    if not author_links:
        return None

    ##### Get info from author #####
    author_page = session.get(author_links[0]['href'], timeout=DBLP_TIMEOUT)
    author_page.raise_for_status()
    soup_of_speaker = BeautifulSoup(author_page.content, "html.parser")

    years, counts = [], []
    publ_section = soup_of_speaker.find(id="publ-section")
    if publ_section is None:
        return years, counts

    # Go through every section of the publication list
    for section in publ_section.find_all("div", class_="hide-body"):
        publ_lists = section.find_all('ul', class_="publ-list")
        if not publ_lists:
            continue
        rows_in_section = publ_lists[0].find_all(True, {"class": DBLP_ROW_CLASSES})
        section_years, section_counts = _papers_per_year(rows_in_section)
        years.extend(section_years)
        counts.extend(section_counts)
    return years, counts


records = []         # one dict per (author, year); turned into df1 once at the end
failed_authors = []  # lookups that ended in a network/HTTP error
fetched = {}         # dblp name -> result, so repeated speakers are downloaded only once

with requests.Session() as session:
    for Author in tqdm(df['dblp name']):
        if not isinstance(Author, str):  # missing name: nothing to search for
            continue
        if Author not in fetched:
            try:
                fetched[Author] = _fetch_author_counts(session, Author)
            except requests.RequestException:
                # Keep the results gathered so far instead of aborting the run;
                # the name is recorded so it can be retried afterwards.
                failed_authors.append(Author)
                continue
        if fetched[Author] is None:  # author is not on dblp
            continue
        Year_list, paper_list = fetched[Author]
        for year, year_count in zip(Year_list, paper_list):
            records.append({'Full name': Author, 'Year': year, 'Year Count': year_count})

# DataFrame.append was removed in pandas 2.0; build the frame in one step.
df1 = pd.DataFrame(records, columns=['Full name', 'Year', 'Year Count'])

if failed_authors:
    print("Requests failed for", len(failed_authors), "author lookups; see `failed_authors`")
        

100%|██████████| 801/801 [33:05<00:00,  2.48s/it]  


In [35]:
# Write df1 to disk without the index column.
df1.to_csv(path_or_buf="paper_count_per_year.csv", index=False)

## Check how many speakers were found on dblp

In [37]:
# NOTE(review): `df4` is not defined in any cell visible above, so this line
# raises NameError on a fresh kernel unless it is created elsewhere — TODO
# confirm where it comes from (presumably a combined result of several scrape
# runs; verify). This assignment also replaces the `df1` built in the dblp
# section.
df1 = df4.copy()

In [42]:
# Number of distinct names in df1 as a percentage of the number of distinct
# names in df.
names_in_df1 = df1['Full name'].nunique()
names_in_df = df['Full name'].nunique()
(names_in_df1 / names_in_df) * 100

92.37618252643294

In [45]:
# Rows of df whose dblp-encoded name does not occur in df1['Full name'].
has_rows_in_df1 = df['dblp name'].isin(df1['Full name'])
df2 = df.loc[~has_rows_in_df1]
df2

Unnamed: 0,Full name,Year,Sex,Conference (short),dblp name
1,Jim Meadlock,1995,0,ASPDAC,Jim%20Meadlock
15,Dipendcr Saluia,1999,x,ASPDAC,Dipendcr%20Saluia
18,Bill Herrick,2000,0,ASPDAC,Bill%20Herrick
20,Ming-Jeh Chien,2001,0,ASPDAC,Ming-Jeh%20Chien
22,Glovanni De WIichcli,2003,0,ASPDAC,Glovanni%20De%20WIichcli
...,...,...,...,...,...
2031,Bob Scheifler,1996,x,WWW,Bob%20Scheifler
2035,Steven McGeady,1996,x,WWW,Steven%20McGeady
2038,Edward A. Bennett,1995,x,WWW,Edward%20A.%20Bennett
2041,Christopher Dobbs,1995,x,WWW,Christopher%20Dobbs
