In [1]:
# NER
# PERSON People, including fictional. NORP Nationalities or religious or political groups.
# FAC Buildings, airports, highways, bridges, etc.
# ORG Companies, agencies, institutions, etc.
# GPE Countries, cities, states.
# LOC Non-GPE locations, mountain ranges, bodies of water.
# PRODUCT Objects, vehicles, foods, etc. (Not services.)
# EVENT Named hurricanes, battles, wars, sports events, etc.
# WORK_OF_ART Titles of books, songs, etc.
# LAW Named documents made into laws.
# LANGUAGE Any named language.
# DATE Absolute or relative dates or periods.
# TIME Times smaller than a day.
# PERCENT Percentage, including ”%“.
# MONEY Monetary values, including unit.
# QUANTITY Measurements, as of weight or distance.
# ORDINAL “first”, “second”, etc.
# CARDINAL Numerals that do not fall under another type.

#deduplicate_data =  deduplica_nome(authors, "last_name", "full_name")
#authors['names_variation_last'] = deduplicate_data['names_variation']
#authors['deduplicated_name_last'] = deduplicate_data['deduplicated_name']
#authors['n_variacoes_last'] = deduplicate_data['n_variacoes']

#count_deduplicated = pd.DataFrame(authors['deduplicated_name_last'].value_counts()).join(pd.DataFrame(authors['deduplicated_name_std'].value_counts()))
#diff_deduplicated = count_deduplicated[count_deduplicated.deduplicated_name_last != count_deduplicated.deduplicated_name_std]
#diff_deduplicated.to_csv(path + "diff_deduplicated.csv")

#diff_deduplicated

In [1]:
import csv
import json
import time
import pandas as pd
import numpy as np

import gender_guesser.detector as gender

import requests
import re
import jellyfish
import spacy


from os import makedirs
from os.path import exists, join
from unidecode import unidecode
from collections import Counter
from pandas_profiling import ProfileReport
from spacy import displacy 
from alive_progress import alive_bar

nlp = spacy.load("en_core_web_sm")

In [2]:
import geonamescache

gc = geonamescache.GeonamesCache()

# gets nested dictionary for countries
countries = gc.get_countries()

# gets nested dictionary for cities
cities = gc.get_cities()

def gen_dict_extract(var, key):
    if isinstance(var, dict):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, (dict, list)):
                yield from gen_dict_extract(v, key)
    elif isinstance(var, list):
        for d in var:
            yield from gen_dict_extract(d, key)

cities = [*gen_dict_extract(cities, 'name')]
countries = [*gen_dict_extract(countries, 'name')]

In [26]:
def get_longest_name(group):        
    names_list = group['unique'].iloc[0].tolist()
    candidate_name = unidecode(max(names_list, key=len))

    while len(candidate_name.split(' ')) == 1 and len(names_list) > 1:
        names_list.remove(candidate_name)
        candidate_name = unidecode(max(names_list, key=len))   
    return candidate_name


def deduplica_nome(inicial_data, column_group, column_agg):
    return_data = pd.DataFrame()
    deduplicated_name = []
    names_variation = []
    n_variacoes = []
    
    names_variation_unique = inicial_data.groupby(column_group)[column_agg].agg(['unique']).reset_index()

    for index, row in inicial_data.iterrows():
        group = names_variation_unique[names_variation_unique[column_group] == row[column_group]]
        candidate_name = get_longest_name(group)

        deduplicated_name += [candidate_name]
        
        names_variation += [group.unique.iloc[0].tolist()]
        n_variacoes += [len(group.unique.iloc[0].tolist())]
        
    return_data['names_variation'] = names_variation
    return_data['deduplicated_name'] = deduplicated_name
    return_data['n_variacoes'] = n_variacoes
    
    return return_data


def gess_gender_author(name):
    d = gender.Detector()
    first_name = name.split(' ')[0]
    guessed_gender = 'unknown'
    
    if len(first_name) > 1:
        guessed_gender = d.get_gender(first_name)
        
    return guessed_gender


def create_deduplicated_columns(authors):
    deduplicate_data =  deduplica_nome(authors, "standard_name", "full_name")
    authors['names_variation_std'] = deduplicate_data['names_variation']
    authors['deduplicated_name_std'] = deduplicate_data['deduplicated_name']
    authors['n_variacoes_std'] = deduplicate_data['n_variacoes']
    
    return authors

def prepare_std_name(authors):
    authors['last_name'] = [unidecode(nome.split(" ")[-1]).upper() for nome in authors['full_name']]
    authors['first_letter_name'] = [unidecode(nome.split(" ")[0][0]).upper() for nome in authors['full_name']]
    authors['standard_name'] = [linha.first_letter_name + " " + linha.last_name for index, linha in authors.iterrows()]
    
    return authors


def remove_groups_names(authors):    
    some_list = ['World Health Organization', 'WHO', 'Research', 'Reproductive', 'Study', 'Health', 'GROUP', 'NETWORK',
                'Consortium', 'committee', 'all the authors', 'for IeDEA-Southern Africa', 'Systematic', 
                 'collaborations', 'Organizacion', 'College', 'Consortium', 'Association', 'Survey', 'Expert',
                'de la Salud', 'Control']
    for nome in authors.full_name.value_counts().index:
        if any( s.upper() in nome.upper() for s in some_list):
            authors = authors[authors.full_name != nome]
            
    return authors

In [27]:
path = 'C:\\Users\\livia\\Dropbox\\HRP Alliance authorship paper\\Data 2022-04-14\\'
authors = pd.read_csv(path + 'authors.csv')

authors = authors.drop("Unnamed: 0", axis = 1)
authors.head()
print(len(authors))

print("Number of authors: {}".format(len(authors['full_name'])))
print("Number unique authors: {}".format(len(authors['full_name'].value_counts())))

authors = remove_groups_names(authors)

print(len(authors))
print("Number of authors: {}".format(len(authors['full_name'])))
print("Number unique authors: {}".format(len(authors['full_name'].value_counts())))

18824
Number of authors: 18824
Number unique authors: 10536
18619
Number of authors: 18619
Number unique authors: 10373


In [29]:
authors = prepare_std_name(authors)
authors = create_deduplicated_columns(authors)

In [30]:
print("Number of authors: {}".format(len(authors['full_name'])))
print("Number unique authors: {}".format(len(authors['full_name'].value_counts())))

print("Number of authors: {}".format(len(authors['deduplicated_name_std'])))
print("Number unique authors: {}".format(len(authors['deduplicated_name_std'].value_counts())))

Number of authors: 18619
Number unique authors: 10373
Number of authors: 18619
Number unique authors: 8839


In [31]:
authors.head()

Unnamed: 0,full_name,affiliation,index_authorship,pmid,last_name,first_letter_name,standard_name,names_variation_std,deduplicated_name_std,n_variacoes_std
0,Erika Hurtado-Salgado,"['Center for Population Health Research, Natio...",1,34921727,HURTADO-SALGADO,E,E HURTADO-SALGADO,[Erika Hurtado-Salgado],Erika Hurtado-Salgado,1.0
1,Luz Cárdenas-Cárdenas,"['Center for Population Health Research, Natio...",2,34921727,CARDENAS-CARDENAS,L,L CARDENAS-CARDENAS,[Luz Cárdenas-Cárdenas],Luz Cardenas-Cardenas,1.0
2,Jorge Salmerón,"['Political, Population and Health Research Ce...",3,34921727,SALMERON,J,J SALMERON,[Jorge Salmerón],Jorge Salmeron,1.0
3,Rufino Luna-Gordillo,"['National Perinatology Institute, Health Mini...",4,34921727,LUNA-GORDILLO,R,R LUNA-GORDILLO,[Rufino Luna-Gordillo],Rufino Luna-Gordillo,1.0
4,Eduardo Ortiz-Panozo,"['Center for Population Health Research, Natio...",5,34921727,ORTIZ-PANOZO,E,E ORTIZ-PANOZO,"[Eduardo Ortiz-Panozo, E Ortiz-Panozo]",Eduardo Ortiz-Panozo,2.0


In [32]:
authors.groupby('pmid').nth(-1)['deduplicated_name_std'].value_counts().to_csv(path + "last_author.csv", encoding="utf-8")

In [33]:
pd.DataFrame(authors['deduplicated_name_std'].value_counts())

Unnamed: 0,deduplicated_name_std
Ahmet M Metin Gulmezoglu,203
Ozge Tuncalp,151
Lale Say,116
Joshua Peter Vogel,113
Olufemi Taiwo Oladapo,109
...,...
Anabel Chade,1
Elham Manouchehri,1
Alexander W Kay,1
Amanda Gabster,1


In [34]:
gender_list = []
unique_names = authors['deduplicated_name_std'].value_counts().index
with alive_bar(len(unique_names), force_tty=True) as bar:
    for author in unique_names:
        #print(author)
        author_guessed_gender = gess_gender_author(author)
        authors.loc[authors['deduplicated_name_std'] == author, 'gender'] = author_guessed_gender
        gender_list += [author_guessed_gender]
        bar()


|███▉                                    | █▆▄ 856/8839 [10%] in 5:23 (2.7/s, eta: 50:07) ▅ 3/8839 [0%] in 2s (1.3/s, eta: 2:10:17) in 4s (1.7/s, eta: 1:29:52)  ▆█▆ 13/8839 [0%] in 6s (2.1/s, eta: 1:10:47) (2.3/s, eta: 1:04:16)  ▇▅▃ 22/8839 [0%] in 9s (2.3/s, eta: 1:02:44) ▆█▆ 29/8839 [0%] in 12s (2.5/s, eta: 59:49)                                   34/8839 [0%] in 14s (2.5/s, eta: 58:46) 36/8839 [0%] in 14s (2.5/s, eta: 58:02) █▆▄ 38/8839 [0%] in 15s (2.5/s, eta: 58:01) ▇▅▃ 38/8839 [0%] in 15s (2.5/s, eta: 58:01)  ▆▄▂ 39/8839 [0%] in 15s (2.5/s, eta: 57:53)  ▅▇▇ 52/8839 [1%] in 20s (2.6/s, eta: 55:58) (2.6/s, eta: 55:48) 57/8839 [1%] in 22s (2.6/s, eta: 55:27)  58/8839 [1%] in 22s (2.6/s, eta: 55:31) (2.5/s, eta: 56:25)  ▇▅▃ 65/8839 [1%] in 26s (2.5/s, eta: 56:25)  71/8839 [1%] in 29s (2.4/s, eta: 59:44)  73/8839 [1%] in 30s (2.4/s, eta: 59:30)  76/8839 [1%] in 31s (2.5/s, eta: 59:17)  77/8839 [1%] in 31s (2.5/s, eta: 59:18) ▃▅▇ 79/8839 [1%] in 32s (2.4/s, eta: 59:19)  81/8839 [1%] in

|████████▎                               | ▅▃▁ 1823/8839 [21%] in 11:34 (2.6/s, eta: 44:26) ▃ 858/8839 [10%] in 5:23 (2.7/s, eta: 50:07) (2.7/s, eta: 50:06)  ▅▇▇ 862/8839 [10%] in 5:25 (2.7/s, eta: 50:04)  ▄▂▂ 865/8839 [10%] in 5:26 (2.7/s, eta: 50:03) ▃▅▇ 868/8839 [10%] in 5:27 (2.7/s, eta: 50:01)  ▅▃▁ 872/8839 [10%] in 5:28 (2.7/s, eta: 49:58)  ▃▅▇ 876/8839 [10%] in 5:29 (2.7/s, eta: 49:55) (2.7/s, eta: 49:54)  ▁▃▅ 897/8839 [10%] in 5:37 (2.7/s, eta: 49:42) ▇▇▅ 906/8839 [10%] in 5:40 (2.7/s, eta: 49:39)  █▆▄ 907/8839 [10%] in 5:40 (2.7/s, eta: 49:38) ▁▃▅ 910/8839 [10%] in 5:42 (2.7/s, eta: 49:37) ▂▄▆ 911/8839 [10%] in 5:42 (2.7/s, eta: 49:36) ▅▇▇ 920/8839 [10%] in 5:45 (2.7/s, eta: 49:30)  ▅▃▁ 941/8839 [11%] in 5:53 (2.7/s, eta: 49:25)  941/8839 [11%] in 5:53 (2.7/s, eta: 49:25) (2.7/s, eta: 49:14)  ▄▂▂ 963/8839 [11%] in 6:01 (2.7/s, eta: 49:13)  ▅▇▇ 966/8839 [11%] in 6:02 (2.7/s, eta: 49:13) 969/8839 [11%] in 6:04 (2.7/s, eta: 49:13)  ▄▂▂ 974/8839 [11%] in 6:06 (2.7/s, eta: 49:13)  

|█████████████▎                          | ▂▄▆ 2920/8839 [33%] in 18:57 (2.6/s, eta: 38:26)  ▅▇▇ 1831/8839 [21%] in 11:38 (2.6/s, eta: 44:32) 1838/8839 [21%] in 11:41 (2.6/s, eta: 44:30)  1840/8839 [21%] in 11:42 (2.6/s, eta: 44:30)  ▂▄▆ 1841/8839 [21%] in 11:43 (2.6/s, eta: 44:31)  ▃▅▇ 1860/8839 [21%] in 11:54 (2.6/s, eta: 44:40)  1861/8839 [21%] in 11:55 (2.6/s, eta: 44:40) ▃▅▇ 1867/8839 [21%] in 11:57 (2.6/s, eta: 44:37)  ▂▂▄ 1878/8839 [21%] in 12:01 (2.6/s, eta: 44:33)  ▁▃▅ 1879/8839 [21%] in 12:02 (2.6/s, eta: 44:33)  1881/8839 [21%] in 12:02 (2.6/s, eta: 44:32)  1882/8839 [21%] in 12:03 (2.6/s, eta: 44:31) in 12:03 (2.6/s, eta: 44:31) (2.6/s, eta: 44:30)  1894/8839 [21%] in 12:07 (2.6/s, eta: 44:26) (2.6/s, eta: 44:24) 1899/8839 [21%] in 12:09 (2.6/s, eta: 44:23)  ▅▃▁ 1900/8839 [21%] in 12:09 (2.6/s, eta: 44:23) ▃▅▇ 1902/8839 [22%] in 12:10 (2.6/s, eta: 44:22) (2.6/s, eta: 44:21) 1913/8839 [22%] in 12:15 (2.6/s, eta: 44:19) 1917/8839 [22%] in 12:17 (2.6/s, eta: 44:19)  1921/8839 

|██████████████████▎                     | ▄▂▂ 4023/8839 [46%] in 26:48 (2.5/s, eta: 32:05) (2.6/s, eta: 38:24)  ▃▅▇ 2927/8839 [33%] in 19:00 (2.6/s, eta: 38:23)  2935/8839 [33%] in 19:03 (2.6/s, eta: 38:20) 2936/8839 [33%] in 19:04 (2.6/s, eta: 38:20)  ▄▂▂ 2937/8839 [33%] in 19:04 (2.6/s, eta: 38:19)  2937/8839 [33%] in 19:04 (2.6/s, eta: 38:19)  ▂▄▆ 2939/8839 [33%] in 19:05 (2.6/s, eta: 38:18) ▂▂▄ 2945/8839 [33%] in 19:07 (2.6/s, eta: 38:16) █▆▄ 2949/8839 [33%] in 19:08 (2.6/s, eta: 38:14)  ▃▅▇ 2954/8839 [33%] in 19:10 (2.6/s, eta: 38:12) 2960/8839 [33%] in 19:12 (2.6/s, eta: 38:09)  2961/8839 [33%] in 19:13 (2.6/s, eta: 38:08)  2964/8839 [34%] in 19:14 (2.6/s, eta: 38:07) 2972/8839 [34%] in 19:17 (2.6/s, eta: 38:04)  2988/8839 [34%] in 19:23 (2.6/s, eta: 37:58)  ▁▃▅ 2999/8839 [34%] in 19:27 (2.6/s, eta: 37:53)  ▂▄▆ 2999/8839 [34%] in 19:27 (2.6/s, eta: 37:53)  3001/8839 [34%] in 19:28 (2.6/s, eta: 37:52) 3004/8839 [34%] in 19:29 (2.6/s, eta: 37:51) 3008/8839 [34%] in 19:30 (2.6/s, e

|████████████████████████▍               | ▃▁▃ 5396/8839 [61%] in 36:27 (2.5/s, eta: 23:16)  4028/8839 [46%] in 26:50 (2.5/s, eta: 32:03)  4031/8839 [46%] in 26:51 (2.5/s, eta: 32:02) (2.5/s, eta: 32:00)  4035/8839 [46%] in 26:52 (2.5/s, eta: 32:00) (2.5/s, eta: 31:58)  ▇▅▃ 4069/8839 [46%] in 27:09 (2.5/s, eta: 31:49) in 27:09 (2.5/s, eta: 31:49) (2.5/s, eta: 31:47) (2.5/s, eta: 31:43)  ▅▇▇ 4089/8839 [46%] in 27:17 (2.5/s, eta: 31:42)  4102/8839 [46%] in 27:22 (2.5/s, eta: 31:37)  ▄▂▂ 4103/8839 [46%] in 27:23 (2.5/s, eta: 31:37)  ▂▄▆ 4116/8839 [47%] in 27:28 (2.5/s, eta: 31:32)  ▅▇▇ 4122/8839 [47%] in 27:31 (2.5/s, eta: 31:30)  4132/8839 [47%] in 27:36 (2.5/s, eta: 31:26) 4134/8839 [47%] in 27:36 (2.5/s, eta: 31:26)  4138/8839 [47%] in 27:38 (2.5/s, eta: 31:24)  4139/8839 [47%] in 27:38 (2.5/s, eta: 31:24)  ▆█▆ 4151/8839 [47%] in 27:43 (2.5/s, eta: 31:19)  4152/8839 [47%] in 27:44 (2.5/s, eta: 31:18) in 27:48 (2.5/s, eta: 31:13) (2.5/s, eta: 31:12) (2.5/s, eta: 31:07) (2.5/s, eta: 31:0

|██████████████████████████████          | ▁▃▅ 6617/8839 [75%] in 44:46 (2.5/s, eta: 15:02)  5403/8839 [61%] in 36:30 (2.5/s, eta: 23:13)  ▃▁▃ 5409/8839 [61%] in 36:32 (2.5/s, eta: 23:11) ▃▁▃ 5416/8839 [61%] in 36:35 (2.5/s, eta: 23:08)  ▆█▆ 5419/8839 [61%] in 36:36 (2.5/s, eta: 23:06) ▄▆█ 5435/8839 [61%] in 36:43 (2.5/s, eta: 23:00) ▃▁▃ 5445/8839 [62%] in 36:47 (2.5/s, eta: 22:56)  5462/8839 [62%] in 36:55 (2.5/s, eta: 22:50)  5465/8839 [62%] in 36:56 (2.5/s, eta: 22:48) 5469/8839 [62%] in 36:58 (2.5/s, eta: 22:47)  ▁▃▅ 5470/8839 [62%] in 36:58 (2.5/s, eta: 22:47) 5481/8839 [62%] in 37:03 (2.5/s, eta: 22:42)  ▃▅▇ 5482/8839 [62%] in 37:04 (2.5/s, eta: 22:42) in 37:05 (2.5/s, eta: 22:41)  ▇▇▅ 5495/8839 [62%] in 37:09 (2.5/s, eta: 22:37) in 37:11 (2.5/s, eta: 22:35)  5501/8839 [62%] in 37:11 (2.5/s, eta: 22:34)  ▂▄▆ 5514/8839 [62%] in 37:17 (2.5/s, eta: 22:29) in 37:18 (2.5/s, eta: 22:29)  ▅▇▇ 5515/8839 [62%] in 37:18 (2.5/s, eta: 22:29)  ▆▄▂ 5518/8839 [62%] in 37:19 (2.5/s, eta: 22:28) 

|███████████████████████████████████▉    | ▄▂▂ 7927/8839 [90%] in 55:04 (2.4/s, eta: 6:21)                              .5/s, eta: 15:01)  ▇▇▅ 6626/8839 [75%] in 44:49 (2.5/s, eta: 14:59) (2.5/s, eta: 14:55) in 44:58 (2.5/s, eta: 14:51)  ▄▆█ 6671/8839 [75%] in 45:11 (2.5/s, eta: 14:41) ▇▇▅ 6673/8839 [75%] in 45:12 (2.5/s, eta: 14:41)  6674/8839 [76%] in 45:12 (2.5/s, eta: 14:40)  6680/8839 [76%] in 45:15 (2.5/s, eta: 14:38) (2.5/s, eta: 14:35)  ▆▄▂ 6687/8839 [76%] in 45:17 (2.5/s, eta: 14:35)  ▄▆█ 6702/8839 [76%] in 45:23 (2.5/s, eta: 14:29)  6703/8839 [76%] in 45:24 (2.5/s, eta: 14:28)  ▂▄▆ 6707/8839 [76%] in 45:26 (2.5/s, eta: 14:27) ▇▇▅ 6710/8839 [76%] in 45:26 (2.5/s, eta: 14:25)  ▄▆█ 6716/8839 [76%] in 45:29 (2.5/s, eta: 14:23) ▁▃▅ 6728/8839 [76%] in 45:33 (2.5/s, eta: 14:18) ▂▄▆ 6728/8839 [76%] in 45:33 (2.5/s, eta: 14:18) (2.5/s, eta: 14:15)  ▂▄▆ 6735/8839 [76%] in 45:36 (2.5/s, eta: 14:15) in 45:36 (2.5/s, eta: 14:15)  6738/8839 [76%] in 45:37 (2.5/s, eta: 14:14)  █▆▄ 6744/8839

|████████████████████████████████████████| 8839/8839 [100%] in 1:01:41.4 (2.39/s)                                       (2.4/s, eta: 6:13)  ▇▇▅ 7951/8839 [90%] in 55:15 (2.4/s, eta: 6:11)  █▆▄ 7965/8839 [90%] in 55:22 (2.4/s, eta: 6:05) ▃▅▇ 7969/8839 [90%] in 55:24 (2.4/s, eta: 6:03)  ▄▆█ 7969/8839 [90%] in 55:24 (2.4/s, eta: 6:03)  7984/8839 [90%] in 55:31 (2.4/s, eta: 5:57) (2.4/s, eta: 5:54) ▇▅▃ 8000/8839 [91%] in 55:39 (2.4/s, eta: 5:51) (2.4/s, eta: 5:48) 8013/8839 [91%] in 55:46 (2.4/s, eta: 5:45) in 55:50 (2.4/s, eta: 5:42)  ▂▄▆ 8033/8839 [91%] in 55:58 (2.4/s, eta: 5:37)  ▃▅▇ 8033/8839 [91%] in 55:58 (2.4/s, eta: 5:37)  ▄▂▂ 8042/8839 [91%] in 56:02 (2.4/s, eta: 5:34) (2.4/s, eta: 5:30)  8057/8839 [91%] in 56:09 (2.4/s, eta: 5:27)  8058/8839 [91%] in 56:10 (2.4/s, eta: 5:27)  8067/8839 [91%] in 56:14 (2.4/s, eta: 5:23) (2.4/s, eta: 5:22)  ▄▂▂ 8077/8839 [91%] in 56:18 (2.4/s, eta: 5:19) (2.4/s, eta: 5:04)  8144/8839 [92%] in 56:48 (2.4/s, eta: 4:51) (2.4/s, eta: 4:51) (2.4/s, eta

In [35]:
#unique_authors = pd.DataFrame(index = unique_names)
#unique_authors["gender"] = gender_list
#unique_authors[unique_authors.gender == 'unknown'].sort_index().to_csv(path + "author_unknown_gender.csv", encoding="utf-8")

unknown_gender_authors = authors[authors.gender == 'unknown'].groupby('deduplicated_name_std')[['pmid']].count().sort_values(by='pmid', ascending=False)
unknown_gender_authors['first_name_is_letter'] = [1 if len(first_name.split(' ')[0]) == 1 else 0 for first_name in unique_authors.index]
unknown_gender_authors.to_csv(path + "author_unknown_gender.csv", encoding="utf-8")
unknown_gender_authors.head()

NameError: name 'unique_authors' is not defined

In [None]:
authors.groupby('deduplicated_name_std').nth(0)

In [None]:
print(len(authors['full_name'].value_counts().index))
print(len(unique_names))
print(authors['gender'].value_counts())
Counter(gender_list)

Unnamed: 0_level_0,pmid,first_name_is_letter
deduplicated_name_std,Unnamed: 1_level_1,Unnamed: 2_level_1
Ozge Tuncalp,151,0
Olufemi Taiwo Oladapo,110,0
Venkatraman C Chandra-Mouli,73,0
Pisake Lumbiganon,63,0
Manjulaa L Narasimhan,42,0
Bukola Fawole,41,0
Rajat Khosla,34,0
Rintaro Mori,29,0
Jianwei Zhang,29,0
World Health Organization Calcium Supplementation for the Prevention of Preeclampsia Trial Group,27,0


In [84]:
authors.to_csv(path+'authors_gender.csv', encoding="utf-8", index=False)

In [98]:
authors = pd.read_csv(path + 'authors_gender.csv')
authors.head()

Unnamed: 0,full_name,affiliation,index_authorship,pmid,last_name,first_letter_name,standard_name,names_variation_std,deduplicated_name_std,n_variacoes_std,gender
0,Erika Hurtado-Salgado,"['Center for Population Health Research, Natio...",1,34921727,HURTADO-SALGADO,E,E HURTADO-SALGADO,['Erika Hurtado-Salgado'],Erika Hurtado-Salgado,1,female
1,Luz Cárdenas-Cárdenas,"['Center for Population Health Research, Natio...",2,34921727,CARDENAS-CARDENAS,L,L CARDENAS-CARDENAS,['Luz Cárdenas-Cárdenas'],Luz Cardenas-Cardenas,1,mostly_female
2,Jorge Salmerón,"['Political, Population and Health Research Ce...",3,34921727,SALMERON,J,J SALMERON,['Jorge Salmerón'],Jorge Salmeron,1,male
3,Rufino Luna-Gordillo,"['National Perinatology Institute, Health Mini...",4,34921727,LUNA-GORDILLO,R,R LUNA-GORDILLO,['Rufino Luna-Gordillo'],Rufino Luna-Gordillo,1,male
4,Eduardo Ortiz-Panozo,"['Center for Population Health Research, Natio...",5,34921727,ORTIZ-PANOZO,E,E ORTIZ-PANOZO,"['Eduardo Ortiz-Panozo', 'E Ortiz-Panozo']",Eduardo Ortiz-Panozo,2,male


In [86]:
authors[authors.gender == 'unknown'].deduplicated_name_std.value_counts()

Ozge Tuncalp                   151
Olufemi Taiwo Oladapo          110
Venkatraman C Chandra-Mouli     73
Pisake Lumbiganon               63
Manjulaa L Narasimhan           42
                              ... 
S Vargas                         1
Benido Impouma                   1
Liknaw Bewket                    1
J Hussein                        1
Ragaa Mansour                    1
Name: deduplicated_name_std, Length: 2754, dtype: int64

In [87]:
org = []
gpe = []
city = []
country = []
other_gpe = []
loc = []
for affiliation in authors['affiliation']:
    doc = nlp(affiliation)

    list_org = []
    list_gpe = []
    list_city = []
    list_country = []
    list_other_gpe = []
    list_loc = []
    for ent in doc.ents:
        if ent.label_== 'ORG':
            list_org += [unidecode(ent.text).upper()]
        elif ent.label_== 'GPE':
            list_gpe += [unidecode(ent.text).upper()]
            if ent.text in countries:
                list_country += [unidecode(ent.text).upper()]
            elif ent.text in cities:
                list_city += [unidecode(ent.text).upper()]
            else:
                list_other_gpe += [unidecode(ent.text).upper()]
        elif ent.label_== 'LOC':
            list_loc += [unidecode(ent.text).upper()]
            
    org += [", ".join(set(list_org))]
    gpe += [", ".join(set(list_gpe))]
    country += [", ".join(set(list_country))]
    city += [", ".join(set(list_city))]
    other_gpe += [", ".join(set(list_other_gpe))]
    loc += [", ".join(set(list_loc))]
    
authors['org'] = org
authors['gpe'] = gpe
authors['loc'] = loc
authors['city'] = city
authors['country'] = country
authors['other_gpe'] = other_gpe


In [88]:
pd.DataFrame(authors[['org','affiliation']].value_counts().sort_index()).to_csv(path + "organizations_affiliation.csv", encoding="utf-8")
pd.DataFrame(authors[['gpe','affiliation']].value_counts().sort_index()).to_csv(path + "gpe_affiliation.csv", encoding="utf-8")
pd.DataFrame(authors['org'].value_counts().sort_index()).to_csv(path + "organizations.csv", encoding="utf-8")
pd.DataFrame(authors['gpe'].value_counts().sort_index()).to_csv(path + "gpe.csv", encoding="utf-8")
pd.DataFrame(authors['loc'].value_counts().sort_index()).to_csv(path + "loc.csv", encoding="utf-8")
pd.DataFrame(authors['affiliation'].value_counts().sort_index()).to_csv(path + "affiliation.csv", encoding="utf-8")

In [89]:
authors['country'].value_counts()

                         9334
SWITZERLAND              2815
SOUTH AFRICA              506
BRAZIL                    439
AUSTRALIA                 327
                         ... 
FRANCE, CANADA              1
TAIWAN, GAMBIA              1
GERMANY, SOUTH AFRICA       1
SWITZERLAND, JAPAN          1
ETHIOPIA, ITALY             1
Name: country, Length: 290, dtype: int64

In [90]:
authors['city'].value_counts()

                                 8883
GENEVA                           2361
LONDON                            428
BALTIMORE                         235
BIRMINGHAM                        183
                                 ... 
ADDIS ABABA, LONDON, BRIGHTON       1
MINSK                               1
KAMPALA, OSLO                       1
BIRNIN KUDU                         1
EVANSVILLE                          1
Name: city, Length: 896, dtype: int64

In [91]:
pd.DataFrame(authors[['gpe','affiliation']].value_counts().sort_index())

Unnamed: 0_level_0,Unnamed: 1_level_0,0
gpe,affiliation,Unnamed: 2_level_1
,"[""Aga Khan University Center for Women's Health and Research. Electronic address: marleen.temmerman@aku.edu.""]",1
,"[""Baylor College of Medicine Children's Foundation, Mbabane, Eswatini.""]",2
,"[""Center for Perinatology, Women's Health, and Reproduction (CLAP/PAHO), Montevideo, Uruguay.""]",2
,"[""Centre de Prise en Charge de Recherche et de Formation Enfants, Abidjan, Côte d'Ivoire.""]",1
,"[""Children's Hospital Colorado, Aurora.""]",1
...,...,...
"ZIMBABWE, SAN FRANCISCO, HARARE","['University of Zimbabwe-University of California, San Francisco, Harare, Zimbabwe.']",3
"ZIMBABWE, SOUTHERTON, HARARE","['National Microbiology Reference Laboratory, Southerton, Harare, Zimbabwe.']",3
ZINDER,"['Maternité Centrale Zinder, Zinder, Niger.']",2
ZOMBA,"['College of Medicine, Department of Obstetrics and Gynaecology, University of Malawi, Zomba, Malawi.']",2


In [92]:
authors.to_csv(path+'authors_gender_loc.csv', index=False, encoding="utf-8")

In [93]:
authors
profile = ProfileReport(authors, minimal=True)
profile.to_file(path + "authors.html")

Summarize dataset:   0%|          | 0/25 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]