# Get Paper Info
These notebook will get the paper info for each speakers in the conference. The attributes are: Number of papers (by year), first paper year, number of citation (by year), country

In [1]:
%run setup.py

# Invited

## Load the data

In [6]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
df_new  = pd.read_csv(os.path.join(filepath,'factInvited_init.csv'), encoding='utf-8')
# df_base  = pd.read_csv(os.path.join(filepath,'factInvited.csv'), encoding='utf-8')

# Find new names
# df_new = df_new[~df_new['Full name'].isin(df_base['Full name'])].reset_index(drop=True)

dblp_name = df_new['Full name'].str.replace(' ', '%20')
dblp_name = dblp_name.drop_duplicates().reset_index(drop=True)

# drop nan values from the list
dblp_name = dblp_name.dropna().reset_index(drop=True)

## Get data from dblp.org

In [8]:
df1 = pd.DataFrame(columns = ['Full name','Year','Year Count'])

for Author in tqdm(dblp_name):
    author_list = []
    paper_list = []
    
    ##### Check if author is on dblp #####
    dblp_URL = "https://dblp.org/search?q=" + Author
    page = requests.get(dblp_URL)
    soup = BeautifulSoup(page.content, "html.parser")
    control_check = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")
    if len(control_check) > 0:
        first_author = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")[0].find_all("a",href=True)[0]['href']
        # print(Author," is on the webpage and the process continues...")
        
        ##### Get info from author #####
        author_page = requests.get(first_author)
        soup_of_speaker = BeautifulSoup(author_page.content, "html.parser")

        Year_list = []

        sections = soup_of_speaker.find(id="publ-section").find_all("div", class_="hide-body")
        for s in range(0,len(sections)): # Nu går vi ind på hver enkelt sektion
            rows_in_sections = sections[s].find_all('ul', class_="publ-list")[0].findAll(True, {"class":['year','entry inproceedings toc','entry article toc','entry incollection toc', 'entry book toc','entry editor toc','entry reference toc']})
            last_row_idx = 0
            
            for row in range(0,len(rows_in_sections)): # Nu går vi ind på hver enkelt række
                #### Append year multipliers
                if rows_in_sections[row].p == None:
                    Year_list.append(rows_in_sections[row].text)
                    paper_list.append(row-last_row_idx-1)
                    last_row_idx = row 
                if row == len(rows_in_sections)-1: # den sidste linje i hver sektion
                    paper_list.append(len(rows_in_sections)-last_row_idx-1)
            
            # remove all -1 values from the list
            paper_list = [x for x in paper_list if x >= 0]
                
        # create a dataframe
        df2 = pd.DataFrame(columns = ['Full name','Year','Year Count'])
        # append full name to the dataframe
        
        df2['Year'] = Year_list
        df2['Year Count'] = paper_list
        df2['Full name'] = Author
        df1 = df1.append(df2, ignore_index=True)
    time.sleep(1)
        

100%|██████████| 1134/1134 [1:03:42<00:00,  3.37s/it]


In [9]:
df1.to_csv(os.path.join(filepath,"paper_count_Invited_new.csv"), index=False)
df3 = df1.copy()

## Combine data and save as factTable

In [10]:
# Load the initial factTable
df  = pd.read_csv(os.path.join(filepath,'factInvited_init.csv'), encoding='utf-8')
# Load the table with paper count per year
# df2 = pd.read_csv(os.path.join(filepath,'paper_count_per_year_proceedings.csv'), encoding='utf-8')
# concat df1 and df2
# df1 = pd.concat([df2,df3], axis=0, ignore_index=True)

In [11]:
df1['Full name'] = df1['Full name'].str.replace('%20', ' ')
df1 = df1.drop_duplicates().reset_index(drop=True)
df1['Year'] = df1['Year'].astype(int)
df1['Year Count'] = df1['Year Count'].astype(int)

df1 = df1.sort_values(by=['Year'], ascending=True)

df1['Year Count'] = df1.groupby('Full name')['Year Count'].cumsum()

df1['max_year_count'] = df1.groupby('Full name')['Year Count'].transform('max')
df1['First year paper'] = df1.groupby('Full name')['Year'].transform('min')

In [12]:
# create a function to get the paper count for a given name and year
def get_paper_count(name, year):
    df = df1[df1['Full name'] == name]
    paper_count = df[df['Year'] <= year]['Year Count']
    if len(paper_count) == 0:
        return 0
    return paper_count.iloc[-1]

# merge the dataframes and add a new column with the paper count
merged = pd.merge(df, df1.drop_duplicates(subset=['Full name']), on='Full name', how='left')
merged['Year Count'] = merged.apply(lambda x: get_paper_count(x['Full name'], x['Year_x']), axis=1)

# Clean up and control check
print("Number of rows in original table:",df.shape[0], " and umber of rows in merged table:",merged.shape[0])
print("Pct. of speakers whose information I was able to collect",round(((df['Full name'].nunique()-df[~df['Full name'].isin(df1['Full name'])].shape[0])/df1['Full name'].nunique())*100,2),"%")
merged['max_year_count'] = merged['max_year_count'].fillna(0)
merged['First year paper'] = merged['First year paper'].fillna(0)
merged = merged.drop(columns=['Year_y'])
merged = merged.rename(columns={'Year_x': 'Year', 'Year Count': 'Paper Count', 'max_year_count': 'Max Paper Count'})
merged.to_csv(os.path.join(filepath, "factInvited.csv"), index=False)
print("The file is now saved")
merged.head(10)

Number of rows in original table: 1343  and umber of rows in merged table: 1343
Pct. of speakers whose information I was able to collect 99.62 %
The file is now saved


Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper
0,Glovanni De WIichcli,2003,ASPDAC,Glovanni,De,WIichcli,M,0,0.0,0.0
1,Tadahiro Ohm,2003,ASPDAC,Tadahiro,,Ohm,M,27,92.0,1993.0
2,Ycrvant Zorian,2003,ASPDAC,Ycrvant,,Zorian,M,0,0.0,0.0
3,Gary L. Baldwi,2004,ASPDAC,Gary,L.,Baldwi,M,4,4.0,1962.0
4,Rudy Lauwereins,2004,ASPDAC,Rudy,,Lauwereins,M,112,196.0,1987.0
5,Rajeev Madhavan,2005,ASPDAC,Rajeev,,Madhavan,M,5,5.0,1994.0
6,Jan M. Rabaey,2005,ASPDAC,Jan,M.,Rabaey,M,146,321.0,1985.0
7,Zhenghua Jiang,2005,ASPDAC,Zhenghua,,Jiang,M,1,1.0,2005.0
8,Alberto Sangiovanni-Vincentelli,2006,ASPDAC,Alberto,,Sangiovanni-Vincentelli,M,458,691.0,1973.0
9,Satoru Ito,2006,ASPDAC,Satoru,,Ito,M,3,3.0,2000.0


In [85]:
merged['Conference (short)'].unique()

array(['ASPDAC', 'CHI', 'ECCV', 'HiPC', 'ic2s2', 'ICML', 'IJCAI', 'KDD',
       'LICS', 'RTA', 'SC', 'SIGGRAPH', 'SODA', 'STOC', 'SWAT', 'WADS',
       'WoLLIC', 'WWW', 'AAAI'], dtype=object)

# Proceedings

## Load the data

In [51]:
df  = pd.read_csv(os.path.join(filepath,'factProceedings_init.csv'), encoding='utf-8')

In [52]:
# remove [' '] from the string in Links column
df['Links'] = df['Links'].str.replace("[","")
df['Links'] = df['Links'].str.replace("']","")
df['Links'] = df['Links'].str.replace("'","")
df['Links'] = df['Links'].str.replace(".html","")

dblp_name = df['Full name'].unique()
df.head()

Unnamed: 0,Full name,Year,Links,Conference (short),First name,Middle name,Last name,gender
0,Hiroto Yasuura,2003,https://dblp.org/pid/43/4149,ASPDAC,Hiroto,,Yasuura,M
1,Farzan Fallah,2003,https://dblp.org/pid/02/1886,ASPDAC,Farzan,,Fallah,M
2,Satoshi Komatsu,2003,https://dblp.org/pid/08/4585,ASPDAC,Satoshi,,Komatsu,M
3,Masahiro Fujita,2003,https://dblp.org/pid/56/1768,ASPDAC,Masahiro,,Fujita,M
4,Sri Parameswaran,2003,https://dblp.org/pid/38/622,ASPDAC,Sri,,Parameswaran,F


In [58]:
df1 = pd.DataFrame(columns = ['Full name','New_Link'])
new_link = []
author = []

for Author in tqdm(dblp_name[305:30000]):
    ##### Check if author is on dblp #####
    dblp_URL = "https://dblp.org/search?q=" + Author
    page = requests.get(dblp_URL)
    soup = BeautifulSoup(page.content, "html.parser")
    control_check = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")
    if len(control_check) > 0:
        first_author = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")[0].find_all("a",href=True)[0]['href']
        author.append(Author)
        new_link.append(first_author)
    time.sleep(2)

df1['Full name'] = author
df1['New_Link'] = new_link

  0%|          | 24/29695 [01:08<21:14:48,  2.58s/it]

In [57]:
# 0:305
new_link1 = new_link.copy()
author1 = author.copy()

In [37]:
df3 = df.merge(df1, on='Full name', how='left')
df3['columns_equal'] = df3['New_Link'] == df3['Links']

In [39]:
df3

Unnamed: 0,Full name,Year,Links,Conference (short),First name,Middle name,Last name,gender,columns_equal,New_Link
0,Hiroto Yasuura,2003,https://dblp.org/pid/43/4149,ASPDAC,Hiroto,,Yasuura,M,True,https://dblp.org/pid/43/4149
1,Farzan Fallah,2003,https://dblp.org/pid/02/1886,ASPDAC,Farzan,,Fallah,M,True,https://dblp.org/pid/02/1886
2,Satoshi Komatsu,2003,https://dblp.org/pid/08/4585,ASPDAC,Satoshi,,Komatsu,M,True,https://dblp.org/pid/08/4585
3,Masahiro Fujita,2003,https://dblp.org/pid/56/1768,ASPDAC,Masahiro,,Fujita,M,True,https://dblp.org/pid/56/1768
4,Sri Parameswaran,2003,https://dblp.org/pid/38/622,ASPDAC,Sri,,Parameswaran,F,True,https://dblp.org/pid/38/622
5,Haris Lekatsas,2003,https://dblp.org/pid/75/5187,ASPDAC,Haris,,Lekatsas,M,True,https://dblp.org/pid/75/5187
6,Kento Yamaoka,2003,https://dblp.org/pid/65/3357,ASPDAC,Kento,,Yamaoka,M,True,https://dblp.org/pid/65/3357
7,Soroush Abbaspour,2003,https://dblp.org/pid/35/3721,ASPDAC,Soroush,,Abbaspour,M,True,https://dblp.org/pid/35/3721
8,Atsushi Sakai,2003,https://dblp.org/pid/76/1572,ASPDAC,Atsushi,,Sakai,M,True,https://dblp.org/pid/76/1572
9,Takashi Yamada,2003,https://dblp.org/pid/58/2991,ASPDAC,Takashi,,Yamada,M,True,https://dblp.org/pid/58/2991


## Get data from dblp.org

Nu har jeg df_base som er min gamle df_proceedings med rigtige uft-8 navne.
Når koden under har kørt skal jeg lægge de nye data sammen med df (df_proceedings) som KUN indeholder NeurIPS data. Så merger jeg. 
Når den data frame er helt færdig skal den lægges sammen med df_base.

In [49]:
df1 = pd.DataFrame(columns = ['Full name','Year','Year Count'])

for Author in tqdm(dblp_name[0:10000]):
    author_list = []
    paper_list = []
    
    ##### Check if author is on dblp #####
    dblp_URL = "https://dblp.org/search?q=" + Author
    page = requests.get(dblp_URL)
    soup = BeautifulSoup(page.content, "html.parser")
    control_check = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")
    if len(control_check) > 0:
        first_author = soup.find(id="completesearch-authors").find_all("ul",class_ = "result-list")[0].find_all("a",href=True)[0]['href']
        # print(Author," is on the webpage and the process continues...")
        
        ##### Get info from author #####
        author_page = requests.get(first_author)
        soup_of_speaker = BeautifulSoup(author_page.content, "html.parser")

        Year_list = []

        sections = soup_of_speaker.find(id="publ-section").find_all("div", class_="hide-body")
        for s in range(0,len(sections)): # Nu går vi ind på hver enkelt sektion
            rows_in_sections = sections[s].find_all('ul', class_="publ-list")[0].findAll(True, {"class":['year','entry inproceedings toc','entry article toc','entry incollection toc', 'entry book toc','entry editor toc','entry reference toc']})
            last_row_idx = 0
            
            for row in range(0,len(rows_in_sections)): # Nu går vi ind på hver enkelt række
                #### Append year multipliers
                if rows_in_sections[row].p == None:
                    Year_list.append(rows_in_sections[row].text)
                    paper_list.append(row-last_row_idx-1)
                    last_row_idx = row 
                if row == len(rows_in_sections)-1: # den sidste linje i hver sektion
                    paper_list.append(len(rows_in_sections)-last_row_idx-1)
            
            # remove all -1 values from the list
            paper_list = [x for x in paper_list if x >= 0]
                
        # create a dataframe
        df2 = pd.DataFrame(columns = ['Full name','Year','Year Count'])
        # append full name to the dataframe
        
        df2['Year'] = Year_list
        df2['Year Count'] = paper_list
        df2['Full name'] = Author
        df1 = df1.append(df2, ignore_index=True)
    time.sleep(4)
        

100%|██████████| 10000/10000 [17:40:02<00:00,  6.36s/it]  


In [51]:
df1.to_csv(os.path.join(filepath,"paper_count_Proceedings_new13.csv"), index=False)
# df3 = df1.copy()

In [38]:
df1 = df4.copy()

In [37]:
df4 = df1.append(df3, ignore_index=True)

In [52]:
df3 = df1.copy()

In [54]:
df1

Unnamed: 0,Full name,Year,Year Count
0,Hiroto Yasuura,2017,1
1,Hiroto Yasuura,2012,4
2,Hiroto Yasuura,2011,4
3,Hiroto Yasuura,2010,6
4,Hiroto Yasuura,2009,7
...,...,...,...
102986,Roberto Bresin,2002,1
102987,Roberto Bresin,2001,1
102988,Roberto Bresin,2000,3
102989,Roberto Bresin,1994,2


In [43]:
df1  = pd.read_csv(os.path.join(filepath,'paper_count_Proceedings_new12.csv'), encoding='utf-8')
df.head()

Unnamed: 0,Full name,Year,Year Count
0,Rosa Lutz,2022,1
1,Rosa Lutz,2019,1
2,Rosa Lutz,2018,1
3,Thomas Dylan,2022,2
4,Thomas Dylan,2021,2


## Combine data and save as factTable

In [155]:
df  = pd.read_csv(os.path.join(filepath,'factProceedings.csv'), encoding='utf-8')
dblp_name = df[df['First year paper'] == 0]['Full name']
dblp_name = dblp_name.drop_duplicates()
dblp_name = dblp_name.dropna().reset_index(drop=True)
df = df[df['First year paper'] == 0]
dblp_name = dblp_name[0:10000]
df = df[df['Full name'].isin(dblp_name)]
df = df.drop(['Paper Count','First year paper','Max Paper Count','Age','Productivity','Conference (long)','Main Topic'], axis=1)
df.tail()

Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender
216727,Lei Xie,2022,NeurIPS,Lei,,Xie,F
216735,Zhiyuan Wang,2022,NeurIPS,Zhiyuan,,Wang,F
216753,Hui Wang,2022,NeurIPS,Hui,,Wang,F
216759,Zheng Liu,2022,NeurIPS,Zheng,,Liu,M
216761,Chen Chen,2022,NeurIPS,Chen,,Chen,M


In [147]:
df1  = pd.read_csv(os.path.join(filepath,'paper_count_Proceedings_new12.csv'), encoding='utf-8')
df1.tail()

Unnamed: 0,Full name,Year,Year Count
215844,Yichen Zhou,2019,1
215845,Yichen Zhou,2018,0
215846,Yichen Zhou,2017,2
215847,Yichen Zhou,2015,1
215848,Yichen Zhou,2011,1


In [122]:
# df1['Full name'] = df1['Full name'].str.replace('%20', ' ')
df1 = df1.drop_duplicates().reset_index(drop=True)
df1['Year'] = df1['Year'].astype(int)
df1['Year Count'] = df1['Year Count'].astype(int)

df1 = df1.sort_values(by=['Year'], ascending=True)

df1['Year Count'] = df1.groupby('Full name')['Year Count'].cumsum()

df1['max_year_count'] = df1.groupby('Full name')['Year Count'].transform('max')
df1['First year paper'] = df1.groupby('Full name')['Year'].transform('min')
df1.head()

Unnamed: 0,Full name,Year,Year Count,max_year_count,First year paper
0,Charles Harrison,1959,1,9,1959
1,Shigeru Watanabe,1961,1,73,1961
2,Martin Cohn,1962,1,400,1962
3,Kazuo Isoda,1963,1,11,1963
4,Kai Li,1963,1,2841,1963


In [120]:
# create a function to get the paper count for a given name and year
def get_paper_count(name, year):
    df = df1[df1['Full name'] == name]
    paper_count = df[df['Year'] <= year]['Year Count']
    if len(paper_count) == 0:
        return 0
    return paper_count.iloc[-1]

# merge the dataframes and add a new column with the paper count
merged = pd.merge(df, df1.drop_duplicates(subset=['Full name']), on='Full name', how='left')
merged['Year Count'] = merged.apply(lambda x: get_paper_count(x['Full name'], x['Year_x']), axis=1)

# Clean up and control check
print("Number of rows in original table:",df.shape[0], " and umber of rows in merged table:",merged.shape[0])
print("Pct. of speakers whose information I was able to collect",round(((df['Full name'].nunique()-df[~df['Full name'].isin(df1['Full name'])].shape[0])/df1['Full name'].nunique())*100,2),"%")
merged['max_year_count'] = merged['max_year_count'].fillna(0)
merged['First year paper'] = merged['First year paper'].fillna(0)
merged = merged.drop(columns=['Year_y'])
merged = merged.rename(columns={'Year_x': 'Year', 'Year Count': 'Paper Count', 'max_year_count': 'Max Paper Count'})


Number of rows in original table: 36529  and umber of rows in merged table: 36529
Pct. of speakers whose information I was able to collect 98.53 %


In [124]:
merged.head()
merged[merged['First year paper'] == 0]

Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper
53,Andras Martinelli,2003,ASPDAC,Andras,,Martinelli,M,0,0.0,0.0
234,Garan Jerke,2003,ASPDAC,Garan,,Jerke,M,0,0.0,0.0
444,Andras Martinelli,2004,ASPDAC,Andras,,Martinelli,M,0,0.0,0.0
445,Rena Krenz,2004,ASPDAC,Rena,,Krenz,F,0,0.0,0.0
547,Andra Borin Suarez,2005,ASPDAC,Andra,Borin,Suarez,F,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
35102,Carsten Rother,2013,NeurIPS,Carsten,,Rother,M,0,0.0,0.0
35166,Carsten Rother,2015,NeurIPS,Carsten,,Rother,M,0,0.0,0.0
35197,Carsten Rother,2016,NeurIPS,Carsten,,Rother,M,0,0.0,0.0
35793,Carsten Rother,2020,NeurIPS,Carsten,,Rother,M,0,0.0,0.0


In [127]:
merged.to_csv(os.path.join(filepath, "factProceedings_0_10000.csv"), index=False)

In [42]:
df_base  = pd.read_csv(os.path.join(filepath,'factProceedings.csv'), encoding='utf-8')

In [45]:
merged[merged['Conference (short)']=="NeurIPS"]['Year'].unique()

array([2020, 2022], dtype=int64)

In [46]:
df6 = df_base.append(merged, ignore_index=True)

In [48]:
df6.tail()

Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper,Age,Conference (long),Main Topic,Productivity
216778,Margret Keuper,2022.0,NeurIPS,Margret,,Keuper,F,0,0.0,0.0,,,,
216779,Gabriele Farina,2022.0,NeurIPS,Gabriele,,Farina,F,0,0.0,0.0,,,,
216780,Ioannis Anagnostides,2022.0,NeurIPS,Ioannis,,Anagnostides,M,0,0.0,0.0,,,,
216781,Christian Kroer,2022.0,NeurIPS,Christian,,Kroer,M,0,0.0,0.0,,,,
216782,Yuan He,2022.0,NeurIPS,Yuan,,He,M,0,0.0,0.0,,,,


The file is now saved


Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper,Age,Conference (long),Main Topic,Productivity
0,Hiroto Yasuura,2003.0,ASPDAC,Hiroto,,Yasuura,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
1,Farzan Fallah,2003.0,ASPDAC,Farzan,,Fallah,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
2,Satoshi Komatsu,2003.0,ASPDAC,Satoshi,,Komatsu,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
3,Masahiro Fujita,2003.0,ASPDAC,Masahiro,,Fujita,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
4,Sri Parameswaran,2003.0,ASPDAC,Sri,,Parameswaran,F,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
5,Haris Lekatsas,2003.0,ASPDAC,Haris,,Lekatsas,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
6,Kento Yamaoka,2003.0,ASPDAC,Kento,,Yamaoka,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
7,Soroush Abbaspour,2003.0,ASPDAC,Soroush,,Abbaspour,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
8,Atsushi Sakai,2003.0,ASPDAC,Atsushi,,Sakai,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0
9,Takashi Yamada,2003.0,ASPDAC,Takashi,,Yamada,M,0,0.0,0.0,0.0,Asia and South Pacific Design Automation Confe...,Computer Architecture,0.0


In [30]:
df6[df6['Conference (short)']=="NeurIPS"]['Year'].unique()

array([2021., 2003., 2004., 2005., 2006., 2007., 2008., 2009., 2010.,
       2011., 2012., 2013., 2014., 2015., 2016., 2017., 2018., 2019.])

In [169]:
df1 = pd.read_csv(os.path.join(filepath,'factProceedings_0_10000.csv'), encoding='utf-8')
df2 = pd.read_csv(os.path.join(filepath,'factProceedings_10000_30000.csv'), encoding='utf-8')
df3 = pd.read_csv(os.path.join(filepath,'factProceedings_30000_50000.csv'), encoding='utf-8')
df4 = pd.read_csv(os.path.join(filepath,'factProceedings_50000_.csv'), encoding='utf-8')

In [174]:
df4.head()

Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper
0,Sam Saarinen,2016,AAAI,Sam,,Saarinen,M,3,7.0,2014.0
1,Walid Shalaby,2016,AAAI,Walid,,Shalaby,M,7,16.0,2014.0
2,Kripa Rajshekhar,2016,AAAI,Kripa,,Rajshekhar,F,1,1.0,2016.0
3,Ping Bai,2016,AAAI,Ping,,Bai,F,7,11.0,1999.0
4,Nur Syahidah Bte,2016,AAAI,Nur,Syahidah,Bte,F,1,1.0,2016.0


In [173]:
df3 = df3.drop(columns=['Paper Count','Max Paper Count','First year paper_x','Age','Productivity','Conference (long)','Main Topic','Year_y'], axis=1)
df3 = df3.rename(columns={'Year_x': 'Year', 'Year Count': 'Paper Count', 'max_year_count': 'Max Paper Count', 'First year paper_y': 'First year paper'})
df3.head()

Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper
0,Emre Yilmaz,2017,KDD,Emre,,Yilmaz,M,38,75.0,1997.0
1,Hakan Ferhatosmanoglu,2017,KDD,Hakan,,Ferhatosmanoglu,M,76,98.0,1999.0
2,Zi Yin,2017,KDD,Zi,,Yin,F,4,11.0,2012.0
3,Yue Min,2017,KDD,Yue,,Min,M,2,6.0,2014.0
4,Thomas Lauvaux,2017,KDD,Thomas,,Lauvaux,M,2,3.0,2017.0


In [175]:
df = df1.append([df2,df3,df4], ignore_index=True)

In [176]:
df.head()

Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper
0,Hiroto Yasuura,2003,ASPDAC,Hiroto,,Yasuura,M,71,115.0,1981.0
1,Farzan Fallah,2003,ASPDAC,Farzan,,Fallah,M,25,57.0,1998.0
2,Satoshi Komatsu,2003,ASPDAC,Satoshi,,Komatsu,M,10,51.0,1998.0
3,Masahiro Fujita,2003,ASPDAC,Masahiro,,Fujita,M,147,473.0,1983.0
4,Sri Parameswaran,2003,ASPDAC,Sri,,Parameswaran,F,21,208.0,1994.0


In [177]:
df_0 = df.copy()

In [185]:
df = pd.read_csv(os.path.join(filepath,'factProceedings.csv'), encoding='utf-8')
df = df.drop(columns=['Age','Productivity','Conference (long)','Main Topic'], axis=1)
df_1 = df[df['First year paper'] != 0]

In [186]:
df_new = df_0.append(df_1, ignore_index=True)

In [187]:
df_new.head()

Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender,Paper Count,Max Paper Count,First year paper
0,Hiroto Yasuura,2003,ASPDAC,Hiroto,,Yasuura,M,71,115.0,1981.0
1,Farzan Fallah,2003,ASPDAC,Farzan,,Fallah,M,25,57.0,1998.0
2,Satoshi Komatsu,2003,ASPDAC,Satoshi,,Komatsu,M,10,51.0,1998.0
3,Masahiro Fujita,2003,ASPDAC,Masahiro,,Fujita,M,147,473.0,1983.0
4,Sri Parameswaran,2003,ASPDAC,Sri,,Parameswaran,F,21,208.0,1994.0


In [190]:
df_new = df_new[df_new['First year paper'] != 0]

In [193]:
df_new = df_new.sort_values(by=['Conference (short)','Year','Full name'], ascending=True)
df_new = df_new.reset_index()

In [195]:
df_new.drop(['index'], axis=1, inplace=True)

In [199]:
df_new.to_csv(os.path.join(filepath, "factProceedings.csv"), index=False)