In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import matplotlib.cm as cm
import requests
import json
import pandas as pd
import time
import datetime
import os
from unidecode import unidecode
import sys
import glob
import re
import math
import random
import pickle
import copy
import itertools
import collections
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup

In [90]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
def append_data(folder):
    all_files = glob.glob(folder + "/*.csv")
    df = pd.DataFrame()
    for file in all_files:
        df_temp = pd.read_csv(file, index_col=None)
        df_temp['File name'] = os.path.splitext(os.path.basename(file))[0]
        df = df.append(df_temp, ignore_index=True)
    df['File name'] = df['File name'].str.split('_').str[0]
    return df

df = append_data('Proceedings')
print("Number of rows:", len(df))
df.tail()

Number of rows: 217093


Unnamed: 0,Full name,Year,Field,File name
217088,Julià Minguillón,2022,Data Management,WWW
217089,Tiziano Piccardi,2022,Data Management,WWW
217090,Martin Gerlach,2022,Data Management,WWW
217091,Robert West,2022,Data Management,WWW
217092,Subhashish Panigrahi,2022,Data Management,WWW


In [91]:
# Add two new columns, where Full name is split into first and last name and the new columns are lowercased
df['First name'] = df['Full name'].str.split(' ').str[0].str.lower()
df['Last name'] = df['Full name'].str.split(' ').str[1:].str.join(' ').str.lower()


# unicode the 'First name' column
df['First name'] = df["First name"].str.normalize('NFKD')\
       .str.encode('ascii', errors='ignore')\
       .str.decode('utf-8')


# Remove all rows from First name which only 1 letter and a dot for intance "A."
print("Number of rows where the name only contains abbreviation: ", df[df['First name'].str.contains('^[a-z]\.$')].shape[0])
df = df[~df['First name'].str.contains('^[a-z]\.$')]

df.head()

Number of rows where the name only contains abbreviation:  2778


Unnamed: 0,Full name,Year,Field,File name,First name,Last name
0,Tsunemasa Hayashi,1997,Computer Architecture,ASPDAC,tsunemasa,hayashi
1,Atsushi Takahara,1997,Computer Architecture,ASPDAC,atsushi,takahara
2,Ken-nosuke Fukami,1997,Computer Architecture,ASPDAC,ken-nosuke,fukami
3,Jang-Hyun Park,1997,Computer Architecture,ASPDAC,jang-hyun,park
4,Yea-Chul Rho,1997,Computer Architecture,ASPDAC,yea-chul,rho


### Add gender data

In [87]:
# Add gender based on First name
name_df = pd.read_csv('Gender_Data/wgnd_2_0_name-gender-code_langexp.csv')
# Keep only US or DK names
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 221688


Unnamed: 0,name,code,gender
48,"""baby""",US,F
103,'aisyah,US,F
179,'anela,US,F
255,'fiyinfoluwa,US,F
310,'olioni,US,M


In [88]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code'], axis=1)
df.head()

Unnamed: 0,Full name,Year,Field,File name,First name,Last name,gender
0,Tsunemasa Hayashi,1997,Computer Architecture,ASPDAC,tsunemasa,hayashi,
1,Atsushi Takahara,1997,Computer Architecture,ASPDAC,atsushi,takahara,M
2,Ken-nosuke Fukami,1997,Computer Architecture,ASPDAC,ken-nosuke,fukami,
3,Jang-Hyun Park,1997,Computer Architecture,ASPDAC,jang-hyun,park,
4,Yea-Chul Rho,1997,Computer Architecture,ASPDAC,yea-chul,rho,


In [89]:
print("Number of rows with null value in gender:", df["gender"].isnull().sum())
df[df["gender"].isnull()]

Number of rows with null value in gender: 57261


Unnamed: 0,Full name,Year,Field,File name,First name,Last name,gender
0,Tsunemasa Hayashi,1997,Computer Architecture,ASPDAC,tsunemasa,hayashi,
2,Ken-nosuke Fukami,1997,Computer Architecture,ASPDAC,ken-nosuke,fukami,
3,Jang-Hyun Park,1997,Computer Architecture,ASPDAC,jang-hyun,park,
4,Yea-Chul Rho,1997,Computer Architecture,ASPDAC,yea-chul,rho,
8,Kwang-Su Seong,1997,Computer Architecture,ASPDAC,kwang-su,seong,
...,...,...,...,...,...,...,...
215086,Chin-Yew Lin,2022,Data Management,WWW,chin-yew,lin,
215088,Reda Benkhadra,2022,Data Management,WWW,reda,benkhadra,
215089,Puyu Yang,2022,Data Management,WWW,puyu,yang,
215091,Karthic Madanagopal,2022,Data Management,WWW,karthic,madanagopal,


### Vi prøver lige noget andet med gender

In [92]:
# Add gender based on First name
name_df = pd.read_csv('Gender_Data/wgnd_2_0_name-gender-code.csv')
# Keep only US or DK names
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')
# for all duplicated value keep the row with the max "wgt" value
name_df = name_df.sort_values('wgt', ascending=False).drop_duplicates(subset=['name'], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 97795


Unnamed: 0,name,code,gender,wgt
758,aaban,US,M,1.0
1857914,lexxy,US,F,1.0
1857667,lexia,US,F,1.0
1857563,lexi,US,F,1.0
1857549,lexey,US,F,1.0


In [93]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code', 'wgt'], axis=1)


print("Number of rows with null value in gender:", df["gender"].isnull().sum())

# Drop all rows with nan value in gender column
df = df.dropna(subset=['gender'])
print("Number of rows:", len(df))

df.head()

Number of rows with null value in gender: 52252
Number of rows: 162063


Unnamed: 0,Full name,Year,Field,File name,First name,Last name,gender
1,Atsushi Takahara,1997,Computer Architecture,ASPDAC,atsushi,takahara,M
5,Yutaka Tamiya,1997,Computer Architecture,ASPDAC,yutaka,tamiya,M
6,Atsushi Takahashi,1997,Computer Architecture,ASPDAC,atsushi,takahashi,M
7,Yoji Kajitani,1997,Computer Architecture,ASPDAC,yoji,kajitani,M
10,Dirk Behrens,1997,Computer Architecture,ASPDAC,dirk,behrens,M


### Save factTable

In [94]:
df.to_csv("factProceedings.csv", index=False)