In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import matplotlib.cm as cm
import requests
import json
import pandas as pd
import time
import datetime
import os
from unidecode import unidecode
import sys
import glob
import re
import math
import random
import pickle
import copy
import itertools
import collections
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup

In [80]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
def append_data(folder):
    all_files = glob.glob(folder + "/*.csv")
    df = pd.DataFrame()
    for file in all_files:
        df_temp = pd.read_csv(file, index_col=None)
        df_temp['File name'] = os.path.splitext(os.path.basename(file))[0]
        df = df.append(df_temp, ignore_index=True)
    df['File name'] = df['File name'].str.split('_').str[0]
    return df

df = append_data('Key Note Speakers')
print("Number of rows:", len(df))
df.tail()

Number of rows: 7616


Unnamed: 0,Full name,Year,Sex,Field,File name,Key Note Speaker,Duration
7611,Victor Zue,1995,x,Data Management,WWW,,
7612,David Goddeau,1995,x,Data Management,WWW,,
7613,Christopher Dobbs,1995,x,Data Management,WWW,,
7614,Robert W. Lucky,1995,x,Data Management,WWW,,
7615,Thomas Reardon,1995,x,Data Management,WWW,,


In [81]:
# Add two new columns, where Full name is split into first and last name and the new columns are lowercased
df['First name'] = df['Full name'].str.split(' ').str[0].str.lower()
df['Last name'] = df['Full name'].str.split(' ').str[1:].str.join(' ').str.lower()


# unicode the 'First name' column
df['First name'] = df["First name"].str.normalize('NFKD')\
       .str.encode('ascii', errors='ignore')\
       .str.decode('utf-8')


# Remove all rows from First name which only 1 letter and a dot for intance "A."
print("Number of rows where the name only contains abbreviation: ", df[df['First name'].str.contains('^[a-z]\.$')].shape[0])
df = df[~df['First name'].str.contains('^[a-z]\.$')]

df.head()

Number of rows where the name only contains abbreviation:  124


Unnamed: 0,Full name,Year,Sex,Field,File name,Key Note Speaker,Duration,First name,Last name
0,Atsushi Asada,1995,0,Computer Architecture,ASPDAC,,,atsushi,asada
1,Jim Meadlock,1995,0,Computer Architecture,ASPDAC,,,jim,meadlock
2,John Darringer,1995,0,Computer Architecture,ASPDAC,,,john,darringer
3,Tatsuo Izawa,1997,0,Computer Architecture,ASPDAC,,,tatsuo,izawa
4,Daniel D. Gajski,1997,0,Computer Architecture,ASPDAC,,,daniel,d. gajski


### Add gender data

In [75]:
# Add gender based on First name
name_df = pd.read_csv('Gender_Data/wgnd_2_0_name-gender-code_langexp.csv')
# Keep only US or DK names
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 221688


Unnamed: 0,name,code,gender
48,"""baby""",US,F
103,'aisyah,US,F
179,'anela,US,F
255,'fiyinfoluwa,US,F
310,'olioni,US,M


In [76]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code'], axis=1)
df.head()

Unnamed: 0,Full name,Year,Sex,Field,File name,Key Note Speaker,Duration,First name,Last name,gender
0,Atsushi Asada,1995,0,Computer Architecture,ASPDAC,,,atsushi,asada,M
1,Jim Meadlock,1995,0,Computer Architecture,ASPDAC,,,jim,meadlock,M
2,John Darringer,1995,0,Computer Architecture,ASPDAC,,,john,darringer,M
3,Tatsuo Izawa,1997,0,Computer Architecture,ASPDAC,,,tatsuo,izawa,M
4,Daniel D. Gajski,1997,0,Computer Architecture,ASPDAC,,,daniel,d. gajski,M


In [77]:
print("Number of rows with null value in gender:", df["gender"].isnull().sum())
df[df["gender"].isnull()]

Number of rows with null value in gender: 698


Unnamed: 0,Full name,Year,Sex,Field,File name,Key Note Speaker,Duration,First name,Last name,gender
6,Biswadip Mitra,2002,x,Computer Architecture,ASPDAC,,,biswadip,mitra,
13,Chi―Foon Chan,1999,x,Computer Architecture,ASPDAC,,,chifoon,chan,
15,Dipendcr Saluia,1999,x,Computer Architecture,ASPDAC,,,dipendcr,saluia,
19,Ming-Jeh Chien,2001,0,Computer Architecture,ASPDAC,,,ming-jeh,chien,
21,Glovanni De WIichcli,2003,0,Computer Architecture,ASPDAC,,,glovanni,de wiichcli,
...,...,...,...,...,...,...,...,...,...,...
7466,Lorrie Cranor,2005,x,Data Management,WWW,,,lorrie,cranor,
7471,Udi Manber Rick Rashid,2004,x,Data Management,WWW,,,udi,manber rick rashid,
7491,Egbert-Jan Sol,2000,x,Data Management,WWW,,,egbert-jan,sol,
7504,Xing Li,1998,x,Data Management,WWW,,,xing,li,


### Vi prøver lige noget andet med gender

In [82]:
# Add gender based on First name
name_df = pd.read_csv('Gender_Data/wgnd_2_0_name-gender-code.csv')
# Keep only US or DK names
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')
# for all duplicated value keep the row with the max "wgt" value
name_df = name_df.sort_values('wgt', ascending=False).drop_duplicates(subset=['name'], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 97795


Unnamed: 0,name,code,gender,wgt
758,aaban,US,M,1.0
1857914,lexxy,US,F,1.0
1857667,lexia,US,F,1.0
1857563,lexi,US,F,1.0
1857549,lexey,US,F,1.0


In [83]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code', 'wgt'], axis=1)


print("Number of rows with null value in gender:", df["gender"].isnull().sum())

# Drop all rows with nan value in gender column
df = df.dropna(subset=['gender'])
print("Number of rows:", len(df))

df.head()

Number of rows with null value in gender: 690
Number of rows: 6802


Unnamed: 0,Full name,Year,Sex,Field,File name,Key Note Speaker,Duration,First name,Last name,gender
0,Atsushi Asada,1995,0,Computer Architecture,ASPDAC,,,atsushi,asada,M
1,Jim Meadlock,1995,0,Computer Architecture,ASPDAC,,,jim,meadlock,M
2,John Darringer,1995,0,Computer Architecture,ASPDAC,,,john,darringer,M
3,Tatsuo Izawa,1997,0,Computer Architecture,ASPDAC,,,tatsuo,izawa,M
4,Daniel D. Gajski,1997,0,Computer Architecture,ASPDAC,,,daniel,d. gajski,M


### Save factTable

In [84]:
df.to_csv("factInvited.csv", index=False)