In [191]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import matplotlib.cm as cm
import requests
import json
import pandas as pd
import time
import datetime
import os
from unidecode import unidecode
import sys
import glob
import re
import math
import random
import pickle
import copy
import itertools
import collections
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup

In [192]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
def append_data(folder):
    all_files = glob.glob(folder + "/*.csv")
    df = pd.DataFrame()
    for file in all_files:
        df_temp = pd.read_csv(file, index_col=None, header=0)
        df_temp['File name'] = os.path.splitext(os.path.basename(file))[0]
        df = df.append(df_temp, ignore_index=True)
    df['File name'] = df['File name'].str.split('_').str[0]
    return df

df = append_data('Data')
print("Number of rows:", len(df))
df.head()

Number of rows: 31209


Unnamed: 0,Full name,Year,File name,Full Name,Key Note Speaker,Sex,Duration,Field
0,Rana Adhikari,2005,APS,,,,,
1,Kaustubh Agashe,2005,APS,,,,,
2,Felix Aharonian,2005,APS,,,,,
3,John Ahearne,2005,APS,,,,,
4,Hiroaki Aihara,2005,APS,,,,,


In [193]:
# Add two new columns, where Full name is split into first and last name and the new columns are lowercased
df['First name'] = df['Full name'].str.split(' ').str[0].str.lower()
df['Last name'] = df['Full name'].str.split(' ').str[1]

# unicode the First name column
df["First name"] = df["First name"].str.normalize('NFKD')\
       .str.encode('ascii', errors='ignore')\
       .str.decode('utf-8')


# Remove all rows from First name which has less than 3 characters
print("Number of names less that 3 characters: ", len(df[df['First name'].str.len() < 3]))
df = df[df['First name'].str.len() > 2]

df.head()

Number of names less that 3 characters:  697


Unnamed: 0,Full name,Year,File name,Full Name,Key Note Speaker,Sex,Duration,Field,First name,Last name
0,Rana Adhikari,2005,APS,,,,,,rana,Adhikari
1,Kaustubh Agashe,2005,APS,,,,,,kaustubh,Agashe
2,Felix Aharonian,2005,APS,,,,,,felix,Aharonian
3,John Ahearne,2005,APS,,,,,,john,Ahearne
4,Hiroaki Aihara,2005,APS,,,,,,hiroaki,Aihara


### Add gender data

In [184]:
# Add gender based on First name
name_df = pd.read_csv('Gender_Data/wgnd_2_0_name-gender-code_langexp.csv')
# Keep only US or DK names
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 221688


Unnamed: 0,name,code,gender
48,"""baby""",US,F
103,'aisyah,US,F
179,'anela,US,F
255,'fiyinfoluwa,US,F
310,'olioni,US,M


In [154]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code'], axis=1)
df.head()

Unnamed: 0,Full name,Year,File name,Full Name,Key Note Speaker,Sex,Duration,Field,First name,Last name,gender
0,Rana Adhikari,2005,APS,,,,,,rana,Adhikari,F
1,Kaustubh Agashe,2005,APS,,,,,,kaustubh,Agashe,M
2,Felix Aharonian,2005,APS,,,,,,felix,Aharonian,M
3,John Ahearne,2005,APS,,,,,,john,Ahearne,M
4,Hiroaki Aihara,2005,APS,,,,,,hiroaki,Aihara,M


In [155]:
print("Number of rows with null value in gender:", df["gender"].isnull().sum())
df[df["gender"].isnull()]

Number of rows with null value in gender: 3763


Unnamed: 0,Full name,Year,File name,Full Name,Key Note Speaker,Sex,Duration,Field,First name,Last name,gender
7,Moskov Amarian,2005,APS,,,,,,moskov,Amarian,
10,Hessamaddin Arfaei,2005,APS,,,,,,hessamaddin,Arfaei,
42,III Coyle,2005,APS,,,,,,iii,Coyle,
58,Eanna Flanagan,2005,APS,,,,,,eanna,Flanagan,
62,Olival Freire,2005,APS,,,,,,olival,Freire,
...,...,...,...,...,...,...,...,...,...,...,...
30388,Olivier Danvy,2008,WoLLIC,,,,,,olivier,Danvy,
30429,Erich Grädel,2002,WoLLIC,,,,,,erich,Grädel,
30430,Gopalan Nadathur,2002,WoLLIC,,,,,,gopalan,Nadathur,
30439,Jouko Väänänen.,2001,WoLLIC,,,,,,jouko,Väänänen.,


### Vi prøver lige noget andet med gender

In [194]:
# Add gender based on First name
name_df = pd.read_csv('Gender_Data/wgnd_2_0_name-gender-code.csv')
# Keep only US or DK names
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')
# for all duplicated value keep the row with the max "wgt" value
name_df = name_df.sort_values('wgt', ascending=False).drop_duplicates(subset=['name'], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 97795


Unnamed: 0,name,code,gender,wgt
758,aaban,US,M,1.0
1857914,lexxy,US,F,1.0
1857667,lexia,US,F,1.0
1857563,lexi,US,F,1.0
1857549,lexey,US,F,1.0


In [195]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code', 'wgt'], axis=1)


print("Number of rows with null value in gender:", df["gender"].isnull().sum())

# Drop all rows with nan value in gender column
df = df.dropna(subset=['gender'])
print("Number of rows:", len(df))

df.head()

Number of rows with null value in gender: 3612
Number of rows: 26765


Unnamed: 0,Full name,Year,File name,Full Name,Key Note Speaker,Sex,Duration,Field,First name,Last name,gender
0,Rana Adhikari,2005,APS,,,,,,rana,Adhikari,F
1,Kaustubh Agashe,2005,APS,,,,,,kaustubh,Agashe,M
2,Felix Aharonian,2005,APS,,,,,,felix,Aharonian,M
3,John Ahearne,2005,APS,,,,,,john,Ahearne,M
4,Hiroaki Aihara,2005,APS,,,,,,hiroaki,Aihara,M


In [196]:
df.to_csv("factConference.csv", index=False)