In [16]:
# load setup.py file
%run setup.py
# change number of lines in dataframe
pd.set_option('display.max_rows', None)

In [17]:
def append_data(folder):
    all_files = glob.glob(folder + "/*.csv")
    df = pd.DataFrame()
    for file in all_files:
        df_temp = pd.read_csv(file, index_col=None)
        df_temp['File name'] = os.path.splitext(os.path.basename(file))[0]
        df = df.append(df_temp, ignore_index=True)
    df['Conference (short)'] = df['File name'].str.split('_').str[0]
    df = df.drop(["Field","File name"], axis=1)
    try:
        df = df.drop(["Key Note Speaker"], axis=1)
    except:
        pass
    return df

def Name_preprocessing(df):
    # First Name
    df['First name'] = df['Full name'].str.split(' ').str[0].str.lower()
    df['First name'] = df["First name"].str.normalize('NFKD')\
       .str.encode('ascii', errors='ignore')\
       .str.decode('utf-8')
    print("Number of rows where the first name only contains abbreviation: ", df[df['First name'].str.contains('^[a-z]\.$')].shape[0])
    print("These rows will be removed")
    df = df[~df['First name'].str.contains('^[a-z]\.$')]
    print("There are now:",df.shape[0],"rows")
       
    # Middel and Last Name
    df['Middle name'] = df['Full name'].str.split(' ').str[1]
    df['Last name (temp)'] = df['Full name'].str.split(' ').str[2]
    df['Last name'] = np.where(df['Last name (temp)'].isnull(), df['Middle name'], df['Last name (temp)'])
    df['Middle name'] = np.where(df['Last name (temp)'].isnull(), np.nan, df['Middle name'])
    df = df.drop(["Last name (temp)"], axis=1)
    return df

def Middle_name_clean(df):
    # Find all rows with different middle names
    row_to_remove = df[df.duplicated(['First name','Last name'], keep=False)].sort_values(['First name','Last name']).drop_duplicates(subset=['First name','Middle name','Last name'], keep=False).drop_duplicates(subset=['First name','Last name'], keep=False)
    rows_to_change = df[df.duplicated(['First name','Last name'], keep=False)].sort_values(['First name','Last name']).drop_duplicates(subset=['First name','Middle name','Last name'], keep=False)
    rows_to_change = rows_to_change[~rows_to_change.isin(row_to_remove)].dropna(how='all').sort_values(['First name','Last name','Middle name'])

    for i in range(len(rows_to_change)-1):
        if (rows_to_change.iloc[i]['First name'] == rows_to_change.iloc[i+1]['First name']) and (rows_to_change.iloc[i]['Last name'] == rows_to_change.iloc[i+1]['Last name']):
            if pd.isnull(rows_to_change.iloc[i+1]['Middle name']):
                rows_to_change.iloc[i+1,rows_to_change.columns.get_loc('Middle name')] = rows_to_change.iloc[i]['Middle name']
    
    # Overwrite the Middle name
    df.loc[rows_to_change.index,'Middle name'] = rows_to_change['Middle name']
    df['Full name'] = df['First name'].str.capitalize() + ' ' + df['Middle name'].fillna('') + ' ' + df['Last name']
    return df

#############################################
# Gender Data
#############################################
name_df = pd.read_csv('Dimension Tables/wgnd_2_0_name-gender-code.csv')
# Keep only US or DK names
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')
# for all duplicated value keep the row with the max "wgt" value
name_df = name_df.sort_values('wgt', ascending=False).drop_duplicates(subset=['name'], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 97795


Unnamed: 0,name,code,gender,wgt
758,aaban,US,M,1.0
1857914,lexxy,US,F,1.0
1857667,lexia,US,F,1.0
1857563,lexi,US,F,1.0
1857549,lexey,US,F,1.0


# Key Note Speakers

#### Load and combine all .csv files in the folder

In [18]:
df = append_data('Invited Speakers')
print("Number of rows:", len(df))
df.head()

Number of rows: 2044


Unnamed: 0,Full name,Year,Sex,Conference (short)
0,Atsushi Asada,1995,0,ASPDAC
1,Jim Meadlock,1995,0,ASPDAC
2,John Darringer,1995,0,ASPDAC
3,Tatsuo Izawa,1997,0,ASPDAC
4,Daniel D. Gajski,1997,0,ASPDAC


#### Clean the name columns

In [19]:
df = Name_preprocessing(df)
df.head()

Number of rows where the first name only contains abbreviation:  112
These rows will be removed
There are now: 1932 rows


Unnamed: 0,Full name,Year,Sex,Conference (short),First name,Middle name,Last name
0,Atsushi Asada,1995,0,ASPDAC,atsushi,,Asada
1,Jim Meadlock,1995,0,ASPDAC,jim,,Meadlock
2,John Darringer,1995,0,ASPDAC,john,,Darringer
3,Tatsuo Izawa,1997,0,ASPDAC,tatsuo,,Izawa
4,Daniel D. Gajski,1997,0,ASPDAC,daniel,D.,Gajski


#### Add gender data

In [20]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code', 'wgt'], axis=1)

print("Number of rows with null value in gender:", df["gender"].isnull().sum())

# Drop all rows with nan value in gender column #TODO
df = df.dropna(subset=['gender'])
print("Number of rows after removal of unknown genders:", len(df))

df.head()

Number of rows with null value in gender: 167
Number of rows after removal of unknown genders: 1765


Unnamed: 0,Full name,Year,Sex,Conference (short),First name,Middle name,Last name,gender
0,Atsushi Asada,1995,0,ASPDAC,atsushi,,Asada,M
1,Jim Meadlock,1995,0,ASPDAC,jim,,Meadlock,M
2,John Darringer,1995,0,ASPDAC,john,,Darringer,M
3,Tatsuo Izawa,1997,0,ASPDAC,tatsuo,,Izawa,M
4,Daniel D. Gajski,1997,0,ASPDAC,daniel,D.,Gajski,M


##### Clean up gender columns

In [21]:
df['Sex'].replace(np.nan, 'unknown', inplace=True)
df['Sex'].replace('x','unknown',inplace=True)
df['Sex'].replace('0','M',inplace=True)
df['Sex'].replace(0,'M',inplace=True)
df['Sex'].replace('1','F',inplace=True)
df['Sex'].replace(1,'F',inplace=True)

In [22]:
print('Number of rows where "Sex" is F and "gender" is M:',len(np.where((df['Sex'] == 'F') & (df['gender']=='M'))[0]))
print('Number of rows where "Sex" is M and "gender" is F:',len(np.where((df['Sex'] == 'M') & (df['gender']=='F'))[0]))

# Overwrite the value in column 'gender' with the 'Sex' column
df.loc[df['Sex'] == 'F', 'gender'] = 'F'  
df.loc[df['Sex'] == 'M', 'gender'] = 'M'   

# Drop unnecessary columns
df = df.drop(['Sex'],axis=1)

Number of rows where "Sex" is F and "gender" is M: 3
Number of rows where "Sex" is M and "gender" is F: 10


#### Fix unique names

In [23]:
print("Before: There are ",df['Full name'].nunique()," unique names")
df = Middle_name_clean(df)
print("After: There are ",df['Full name'].nunique()," unique names")

Before: There are  1520  unique names
After: There are  1515  unique names


#### Save factTable

In [67]:
df.to_csv("factInvited.csv", index=False)

# Proceedings

#### Load and combine all .csv files in the folder

In [24]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
df = append_data('Proceedings')
print("Number of rows:", len(df))
df.tail()

Number of rows: 407531


Unnamed: 0,Full name,Year,Conference (short)
407526,Julià Minguillón,2022,WWW
407527,Tiziano Piccardi,2022,WWW
407528,Martin Gerlach,2022,WWW
407529,Robert West,2022,WWW
407530,Subhashish Panigrahi,2022,WWW


#### Clean the names

In [25]:
df = Name_preprocessing(df)
df.head()

Number of rows where the first name only contains abbreviation:  5709
These rows will be removed
There are now: 401822 rows


Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name
0,Isao Shirakawa,1995,ASPDAC,isao,,Shirakawa
1,Wen-Zen Shen,1995,ASPDAC,wen-zen,,Shen
2,Jiing-Yuan Lin,1995,ASPDAC,jiing-yuan,,Lin
3,Fong-Wen Wang,1995,ASPDAC,fong-wen,,Wang
4,How-Rern Lin,1995,ASPDAC,how-rern,,Lin


#### Add gender data

In [26]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code', 'wgt'], axis=1)

print("Number of rows with null value in gender:", df["gender"].isnull().sum())

# Drop all rows with nan value in gender column #TODO
df = df.dropna(subset=['gender'])
print("Number of rows after removal of unknown genders:", len(df))

df.head()

Number of rows with null value in gender: 112432
Number of rows after removal of unknown genders: 289390


Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender
0,Isao Shirakawa,1995,ASPDAC,isao,,Shirakawa,M
5,TingTing Hwang,1995,ASPDAC,tingting,,Hwang,F
6,Sanjay Dhar,1995,ASPDAC,sanjay,,Dhar,M
7,Dave J. Gurney,1995,ASPDAC,dave,J.,Gurney,M
8,Michel Thill,1995,ASPDAC,michel,,Thill,M


#### Fix Unique names

In [27]:
print("Before: There are ",df['Full name'].nunique()," unique names")
df = Middle_name_clean(df)
print("After: There are ",df['Full name'].nunique()," unique names")

Before: There are  99552  unique names
After: There are  98924  unique names


#### Save factTable

In [15]:
df.to_csv("factProceedings.csv", index=False)