In [21]:
# load setup.py file
%run setup.py
# change number of lines in dataframe
pd.set_option('display.max_rows', None)

# Key Note Speakers

#### Load and combine all .csv files in the folder

In [62]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
def append_data(folder):
    all_files = glob.glob(folder + "/*.csv")
    df = pd.DataFrame()
    for file in all_files:
        df_temp = pd.read_csv(file, index_col=None)
        df_temp['File name'] = os.path.splitext(os.path.basename(file))[0]
        df = df.append(df_temp, ignore_index=True)
    df['Conference (short)'] = df['File name'].str.split('_').str[0]
    df = df.drop(["Field","File name"], axis=1)
    try:
        df = df.drop(["Key Note Speaker"], axis=1)
    except:
        pass
    return df

df = append_data('Key Note Speakers')
print("Number of rows:", len(df))
df.tail()

Number of rows: 2044


Unnamed: 0,Full name,Year,Sex,Conference (short)
2039,Victor Zue,1995,x,WWW
2040,David Goddeau,1995,x,WWW
2041,Christopher Dobbs,1995,x,WWW
2042,Robert W. Lucky,1995,x,WWW
2043,Thomas Reardon,1995,x,WWW


#### Split the the name in first and last name

In [63]:
df['First name'] = df['Full name'].str.split(' ').str[0].str.lower()
df['Last name'] = df['Full name'].str.split(' ').str[1:].str.join(' ')

# df_proceedings['Middel name'] = df_proceedings['Last name'].str.split(' ').str[0].str.lower()
# df_proceedings['Family name'] = df_proceedings['Last name'].str.split(' ').str[1:].str.join(' ').str.lower()

# if last name contains a space then split it into two columns called Middel name and Last name
if df['Last name'].str.contains(' ').any():
    df['Middle name'] = df['Last name'].str.split(' ', expand=True)[1]
    if df['Middle name'].any() == None:
        df['Family name'] = None
    else:
        df['Last name'] = df['Full name'].str.split(' ').str[2:].str.join(' ')
df.head()

Unnamed: 0,Full name,Year,Sex,Conference (short),First name,Last name,Middle name
0,Atsushi Asada,1995,0,ASPDAC,atsushi,,
1,Jim Meadlock,1995,0,ASPDAC,jim,,
2,John Darringer,1995,0,ASPDAC,john,,
3,Tatsuo Izawa,1997,0,ASPDAC,tatsuo,,
4,Daniel D. Gajski,1997,0,ASPDAC,daniel,Gajski,Gajski


In [46]:
# Add two new columns, where Full name is split into first and last name and the new columns are lowercased
df['First name'] = df['Full name'].str.split(' ').str[0].str.lower()
df['Last name'] = df['Full name'].str.split(' ').str[1:].str.join(' ')





# unicode the 'First name' column
df['First name'] = df["First name"].str.normalize('NFKD')\
       .str.encode('ascii', errors='ignore')\
       .str.decode('utf-8')


# Remove all rows from First name which only 1 letter and a dot for intance "A."
print("Number of rows where the first name only contains abbreviation: ", df[df['First name'].str.contains('^[a-z]\.$')].shape[0])
df = df[~df['First name'].str.contains('^[a-z]\.$')]

df.head()

Number of rows where the first name only contains abbreviation:  112


Unnamed: 0,Full name,Year,Sex,Conference (short),First name,Last name,Middel name,Family name
0,Atsushi Asada,1995,0,ASPDAC,atsushi,Asada,,Asada
1,Jim Meadlock,1995,0,ASPDAC,jim,Meadlock,,Meadlock
2,John Darringer,1995,0,ASPDAC,john,Darringer,,Darringer
3,Tatsuo Izawa,1997,0,ASPDAC,tatsuo,Izawa,,Izawa
4,Daniel D. Gajski,1997,0,ASPDAC,daniel,D. Gajski,Gajski,D.


#### Add gender data

In [24]:
# Add gender based on First name
name_df = pd.read_csv('Dimension Tables/wgnd_2_0_name-gender-code.csv')
# Keep only US or DK names #TODO: choose the country code according to the location
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')
# for all duplicated value keep the row with the max "wgt" value
name_df = name_df.sort_values('wgt', ascending=False).drop_duplicates(subset=['name'], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 97795


Unnamed: 0,name,code,gender,wgt
758,aaban,US,M,1.0
1857914,lexxy,US,F,1.0
1857667,lexia,US,F,1.0
1857563,lexi,US,F,1.0
1857549,lexey,US,F,1.0


In [25]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code', 'wgt'], axis=1)

print("Number of rows with null value in gender:", df["gender"].isnull().sum())

# Drop all rows with nan value in gender column #TODO
df = df.dropna(subset=['gender'])
print("Number of rows after removal of unknown genders:", len(df))

df.head()

Number of rows with null value in gender: 167
Number of rows after removal of unknown genders: 1765


Unnamed: 0,Full name,Year,Sex,Conference (short),First name,Last name,gender
0,Atsushi Asada,1995,0,ASPDAC,atsushi,asada,M
1,Jim Meadlock,1995,0,ASPDAC,jim,meadlock,M
2,John Darringer,1995,0,ASPDAC,john,darringer,M
3,Tatsuo Izawa,1997,0,ASPDAC,tatsuo,izawa,M
4,Daniel D. Gajski,1997,0,ASPDAC,daniel,d. gajski,M


##### Clean up gender columns

In [26]:
df['Sex'].replace(np.nan, 'unknown', inplace=True)
df['Sex'].replace('x','unknown',inplace=True)
df['Sex'].replace('0','M',inplace=True)
df['Sex'].replace(0,'M',inplace=True)
df['Sex'].replace('1','F',inplace=True)
df['Sex'].replace(1,'F',inplace=True)

In [27]:
print('Number of rows where "Sex" is F and "gender" is M:',len(np.where((df['Sex'] == 'F') & (df['gender']=='M'))[0]))
print('Number of rows where "Sex" is M and "gender" is F:',len(np.where((df['Sex'] == 'M') & (df['gender']=='F'))[0]))

# Overwrite the value in column 'gender' with the 'Sex' column
df.loc[df['Sex'] == 'F', 'gender'] = 'F'  
df.loc[df['Sex'] == 'M', 'gender'] = 'M'   

# Drop unnecessary columns
df = df.drop(['Sex'],axis=1)

Number of rows where "Sex" is F and "gender" is M: 3
Number of rows where "Sex" is M and "gender" is F: 10


#### Fix unique names

In [28]:
df_proceedings['Middel name'] = df_proceedings['Last name'].str.split(' ').str[0].str.lower()
df_proceedings['Family name'] = df_proceedings['Last name'].str.split(' ').str[1:].str.join(' ').str.lower()

# keep only non-nan values in Middel name
df_proceedings = df_proceedings[df_proceedings['Family name'] != ""]
# drop columns
df_proceedings = df_proceedings.drop(['Year', 'Field', 'File name','Last name','gender'], axis=1)

df1 = df_proceedings[df_proceedings.duplicated(['First name', 'Family name'], keep=False)]
# drop all rows with duplicate Full names and keep the first occurence
df2 = df_proceedings.drop_duplicates(['Full name'], keep='first') 

df3 = df2[df2.duplicated(['First name', 'Family name'], keep=False)].sort_values(by=['First name']) # reset index

# keep all rows from df_proceedings_copy that has index from df3
df4 = df_proceedings_copy[df_proceedings_copy.index.isin(df3.index)]

#### Save factTable

In [16]:
df.to_csv("factInvited.csv", index=False)

# Proceedings

#### Load and combine all .csv files in the folder

In [3]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
df = append_data('Proceedings')
print("Number of rows:", len(df))
df.tail()

Number of rows: 217093


Unnamed: 0,Full name,Year,Field,File name
217088,Julià Minguillón,2022,Data Management,WWW
217089,Tiziano Piccardi,2022,Data Management,WWW
217090,Martin Gerlach,2022,Data Management,WWW
217091,Robert West,2022,Data Management,WWW
217092,Subhashish Panigrahi,2022,Data Management,WWW


In [4]:
# Add two new columns, where Full name is split into first and last name and the new columns are lowercased
df['First name'] = df['Full name'].str.split(' ').str[0].str.lower()
df['Last name'] = df['Full name'].str.split(' ').str[1:].str.join(' ').str.lower()


# unicode the 'First name' column
df['First name'] = df["First name"].str.normalize('NFKD')\
       .str.encode('ascii', errors='ignore')\
       .str.decode('utf-8')


# Remove all rows from First name which only 1 letter and a dot for intance "A."
print("Number of rows where the name only contains abbreviation: ", df[df['First name'].str.contains('^[a-z]\.$')].shape[0])
df = df[~df['First name'].str.contains('^[a-z]\.$')]

df.head()

Number of rows where the name only contains abbreviation:  2778


Unnamed: 0,Full name,Year,Field,File name,First name,Last name
0,Tsunemasa Hayashi,1997,Computer Architecture,ASPDAC,tsunemasa,hayashi
1,Atsushi Takahara,1997,Computer Architecture,ASPDAC,atsushi,takahara
2,Ken-nosuke Fukami,1997,Computer Architecture,ASPDAC,ken-nosuke,fukami
3,Jang-Hyun Park,1997,Computer Architecture,ASPDAC,jang-hyun,park
4,Yea-Chul Rho,1997,Computer Architecture,ASPDAC,yea-chul,rho


#### Add gender data

In [5]:
# Add gender based on First name
name_df = pd.read_csv('Gender_Data/wgnd_2_0_name-gender-code.csv')
# Keep only US or DK names
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')
# for all duplicated value keep the row with the max "wgt" value
name_df = name_df.sort_values('wgt', ascending=False).drop_duplicates(subset=['name'], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 97795


Unnamed: 0,name,code,gender,wgt
758,aaban,US,M,1.0
1857914,lexxy,US,F,1.0
1857667,lexia,US,F,1.0
1857563,lexi,US,F,1.0
1857549,lexey,US,F,1.0


In [6]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code', 'wgt'], axis=1)

print("Number of rows with null value in gender:", df["gender"].isnull().sum())

# Drop all rows with nan value in gender column #TODO
df = df.dropna(subset=['gender'])
print("Number of rows after removal of unknown genders:", len(df))

df.head()

Number of rows with null value in gender: 52252
Number of rows after removal of unknown genders: 162063


Unnamed: 0,Full name,Year,Field,File name,First name,Last name,gender
1,Atsushi Takahara,1997,Computer Architecture,ASPDAC,atsushi,takahara,M
5,Yutaka Tamiya,1997,Computer Architecture,ASPDAC,yutaka,tamiya,M
6,Atsushi Takahashi,1997,Computer Architecture,ASPDAC,atsushi,takahashi,M
7,Yoji Kajitani,1997,Computer Architecture,ASPDAC,yoji,kajitani,M
10,Dirk Behrens,1997,Computer Architecture,ASPDAC,dirk,behrens,M


#### Change name from John A. Hansen to John Anders Hansen

In [None]:
#TODO The code does not work 100 % yet, but it is a start
'''
df_proceedings['Middel name'] = df_proceedings['Last name'].str.split(' ').str[0].str.lower()
df_proceedings['Family name'] = df_proceedings['Last name'].str.split(' ').str[1:].str.join(' ').str.lower()

# keep only non-nan values in Middel name
df_proceedings = df_proceedings[df_proceedings['Family name'] != ""]
# drop columns
df_proceedings = df_proceedings.drop(['Year', 'Field', 'File name','Last name','gender'], axis=1)

df1 = df_proceedings[df_proceedings.duplicated(['First name', 'Family name'], keep=False)]
# drop all rows with duplicate Full names and keep the first occurence
df2 = df_proceedings.drop_duplicates(['Full name'], keep='first') 

df3 = df2[df2.duplicated(['First name', 'Family name'], keep=False)].sort_values(by=['First name']) # reset index

# keep all rows from df_proceedings_copy that has index from df3
df4 = df_proceedings_copy[df_proceedings_copy.index.isin(df3.index)]
'''

#### Save factTable

In [7]:
df.to_csv("factProceedings.csv", index=False)