In [1]:
import csv
import json
import time
import pandas as pd
import numpy as np

import gender_guesser.detector as gender

import requests
import re
import jellyfish
import spacy
import geonamescache

from functions import *

from os import makedirs
from os.path import exists, join
from unidecode import unidecode
from collections import Counter
from pandas_profiling import ProfileReport
from spacy import displacy 
from alive_progress import alive_bar

nlp = spacy.load("en_core_web_lg")
d = gender.Detector()
gc = geonamescache.GeonamesCache()
# gets nested dictionary for countries
countries = gc.get_countries()
# gets nested dictionary for cities
cities = gc.get_cities()

import sys  
sys.path.insert(0, 'D:\Google Drive\Analises\Codigos python')
from build_features import *


C:\Users\livia\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\livia\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [2]:
def gen_dict_extract(var, key):
    if isinstance(var, dict):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, (dict, list)):
                yield from gen_dict_extract(v, key)
    elif isinstance(var, list):
        for d in var:
            yield from gen_dict_extract(d, key)

cities = [*gen_dict_extract(cities, 'name')]
countries = [*gen_dict_extract(countries, 'name')]

countries = [clean_string(country) for country in countries]
cities = [clean_string(city) for city in cities]

In [3]:
institutions_list = ['World Health Organization', 'WHO', 'Research', 'Reproductive', 'Study', 'Health', 
             'GROUP', 'NETWORK', 'Team', 'University', 'Project', 'IEDEA', 
             'Consortium', 'committee', 'all the authors', 'for IeDEA-Southern Africa', 'Systematic', 
             'collaborations', 'Organizacion', 'College', 'Consortium', 'Association', 'Survey', 'Expert',
             'de la Salud', 'Control','collaborations', 'Collaboration', 'committee', 'Universite', 'BURKINA FASO']
institutions_list = [institutions.upper() for institutions in institutions_list]


def prepare_std_name(authors):
    last_name = []
    first_letter_name = []
    standard_name = []
    for nome in authors['full_name']:
        nome = re.sub(' +', ' ', unidecode(nome).replace('-', ' ').strip().upper())
        last_name += [nome.split(" ")[-1]]
        first_letter_name += [nome.split(" ")[0][0]]
        standard_name += [nome.split(" ")[0][0] + " " + nome.split(" ")[-1]]
    
    authors['last_name'] = last_name
    authors['first_letter_name'] = first_letter_name
    authors['standard_name'] = standard_name
    
    return authors

def get_longest_name(group):        
    names_list = group['unique'].iloc[0].tolist()
    candidate_name = unidecode(max(names_list, key=len))

    while len(candidate_name.split(' ')) == 1 and len(names_list) > 1:
        names_list.remove(candidate_name)
        candidate_name = unidecode(max(names_list, key=len))   
    return candidate_name


def deduplica_nome(inicial_data, column_group, column_agg):
    return_data = pd.DataFrame(inicial_data)
    names_variation_unique = inicial_data.groupby(column_group)[column_agg].agg(['unique']).reset_index()
    #authors.to_csv(path+'names_variation_unique.csv', encoding="utf-8", index=False)

    return_data = return_data.reset_index()
    for index, row in return_data.iterrows():
        group = names_variation_unique[names_variation_unique[column_group] == row[column_group]]
        candidate_name = get_longest_name(group)
        
        return_data.loc[index, 'deduplicated_name_std'] = candidate_name
        return_data.loc[index, 'names_variation_std'] = ", ".join(group.unique.iloc[0].tolist())
        return_data.loc[index, 'n_variacoes_std'] = len(group.unique.iloc[0].tolist())
        #print(row[column_group], candidate_name, ", ".join(group.unique.iloc[0].tolist()), len(group.unique.iloc[0].tolist()))
            
    return return_data

def gess_gender_author(name):
    first_name = name.split(' ')[0]
    guessed_gender = 'unknown'
    
    if len(first_name) > 1:
        guessed_gender = d.get_gender(first_name.title())
    
    if guessed_gender =='mostly_female':
        guessed_gender = 'female'
    if guessed_gender =='mostly_male':
        guessed_gender = 'male'
    
    return guessed_gender


def create_deduplicated_columns(authors):
    deduplicate_data =  deduplica_nome(authors, "standard_name", "full_name")    
    return deduplicate_data


def is_groups_name(nome):
    if any(s.upper() in nome.upper() for s in institutions_list):
        return True
    else:
        return False

def remove_groups_names(authors):  
    institutions = []
    for nome in authors.full_name.value_counts().index:
        if is_groups_name(nome):
            authors = authors[authors.full_name != nome]
            if nome not in institutions:
                institutions += [nome]
                
    return authors, institutions


def get_list_names(authors, author):
    list_names = authors[authors['deduplicated_name_std'] == author].iloc[0]['names_variation_std'].split(", ")    
    return list_names

def guess_gender_in_list(list_names, last=False):
    #print(list_names)
    author_guessed_gender = 'unknown'
    index_name = 0
    while (author_guessed_gender == 'unknown' and len(list_names) > index_name):
        if last:
            name = list_names[index_name].split(" ")[-1]
        else:
            name = list_names[index_name].split(" ")[0]
            
        author_guessed_gender = gess_gender_author(name)
        #print(name, author_guessed_gender, index_name)
        index_name = index_name + 1
    
    return author_guessed_gender

def remap_affiliation(authors):   
    secondary_list = []
    authors['countries'] = [[] for i in authors['affiliation']]  
    authors['countries'] = authors['countries'].astype('object')
    affiliations_unique = pd.read_csv(path+'..\\Reviewed Files\\01_affiliations_unique_with_countries_v5.csv')
    affiliations_unique.original = [clean_string(affiliation, True) for affiliation in affiliations_unique.original]
    with alive_bar(len(authors['affiliation']), force_tty=True) as bar:
        for index, row in authors.iterrows():
            affiliation_list = row['affiliation']
            remaped_list = []
            country_list = []
            if affiliation_list != "Collaborators":  
                #print(affiliation_list)
                affiliations = affiliation_list
                
                for affiliation in (affiliations):
                    if not pd.isnull(affiliation):
                        remaps = affiliations_unique.loc[affiliations_unique.original == clean_string(affiliation, True)].reset_index()
                        if len(remaps) > 0:
                            remaped_list += [remaps.iloc[0]['remap']]
                            country_list += [remaps.iloc[0]['country']]
                        else:
                            print(affiliation)
                            remaped_list += [affiliation]
                            secondary_list += [clean_string(str(affiliation), True)]
                authors.at[index, 'affiliation'] = remaped_list
                authors.at[index, 'countries'] = country_list
            bar()
    df = pd.DataFrame(secondary_list, columns=["affiliations_secondary"])
    df.to_csv(path+'secondary_list.csv', index=False)

# Preparação do banco

In [28]:
path = 'C:\\Users\\livia\\Dropbox\\HRP Alliance authorship paper\\Data 2022-12-16\\'

papers = pd.read_csv(path + 'DatabaseCompleted_v2_VB.csv')
if "Unnamed: 0" in papers.columns:
    papers = papers.drop("Unnamed: 0", axis = 1)
papers.reset_index(inplace=True)
papers = papers.drop("index", axis = 1)

print(len(papers), "\tCount total of retrieved papers")
print(sum(papers.pmid.isna()), "\tTotal of papers without PMID")
print(sum(papers['pmid'].duplicated()), "\tTotal of papers with duplicated PMID")
print(sum(papers['doi'].isna()), "\tTotal of papers without DOI")
print(sum(papers['doi'].dropna().duplicated()), "\tTotal of papers with duplicated DOI")

papers = papers[~(papers['pmid'].duplicated()) | (papers['pmid'].isna())]
print(len(papers), "\tTotal of papers after removed nans and duplicated pmids")


print("{}\tNumber of unique pmids".format(len(papers.pmid.unique())))
print("{}\tNumber rows".format(len(papers)))

4382 	Count total of retrieved papers
0 	Total of papers without PMID
0 	Total of papers with duplicated PMID
1 	Total of papers without DOI
0 	Total of papers with duplicated DOI
4382 	Total of papers after removed nans and duplicated pmids
4382	Number of unique pmids
4382	Number rows


In [29]:
papers["Tags"].value_counts(dropna=False)
papers["VB check"].value_counts(dropna=False)

NaN               3250
No HRP authors     929
Include            111
Not found           91
Unclear              1
Name: VB check, dtype: int64

In [33]:
papers["Inclusion"] = ["Include" if row["Tags"] in ["Included by VB", "Has HRP authors"] else
                         "Not included" if row["Tags"] in ["Not included"] else
                         "Not included" if row["VB check"] in ["No HRP authors", "Not found", "Unclear"] else 
                         "Include" if row["VB check"] in ["Include"] else "Unclassifed" for index, row in papers.iterrows()]
papers["Inclusion"].value_counts(dropna=False)

Not included    2873
Include         1509
Name: Inclusion, dtype: int64

In [37]:
print("{}\tNumber of papers from 2022+".format(len(papers[papers.publication_year >= 2022].pmid)))
print("{}\tNumber of papers with type Published Erratum".format(len(papers[papers.publication_type == "Published Erratum"].pmid)))
print("{}\tNumber of papers with type Interview".format(len(papers[papers.publication_type == "Interview"].pmid)))
print("{}\tNumber of papers not included".format(len(papers[papers.Inclusion == "Not included"].pmid)))

##pmid_to_delete = list(papers[papers.doi == "10.1159/000343054"].pmid) -> interview
#pmid_to_delete += [27227232]
pmid_to_delete = []
pmid_to_delete += list(papers[papers.publication_type == "Published Erratum"].pmid)
pmid_to_delete += list(papers[papers.publication_type == "Interview"].pmid)
pmid_to_delete += list(papers[papers.publication_year >= 2022].pmid)
pmid_to_delete += list(papers[papers.Inclusion == "Not included"].pmid)


print("{}\tPmid to delete".format(len(set(pmid_to_delete))))

63	Number of papers from 2022+
17	Number of papers with type Published Erratum
1	Number of papers with type Interview
2873	Number of papers not included
2873	Pmid to delete


In [38]:
papers = papers[~papers.pmid.isin(pmid_to_delete)]

print("{} Remained elegible pmids: ".format(len(papers.pmid.unique())))
print("{} Number rows".format(len(papers)))

1509 Remained elegible pmids: 
1509 Number rows


In [40]:
## Read authors file, reset the index, drop duplicated rows
authors = pd.read_csv(path + 'authors.csv').reset_index()
authors = authors.drop("index", axis = 1)

authors = authors.reset_index()
if "Unnamed: 0" in authors.columns:
    authors = authors.drop("Unnamed: 0", axis = 1)
authors = authors.drop("index", axis = 1)

print(len(authors), "\tTotal of authors *before* drop duplicated")
authors.drop_duplicates(inplace=True)
print(len(authors), "\tTotal of authors *after* drop duplicated")


## Remove the authors from papers deleted from the excluded criteria
authors = authors[~authors.pmid.isin(pmid_to_delete)]
print(len(authors), "\tTotal of authors *after* drop excluded pmid")


## Keep only authors with pmid on list of elegible papers
elegible_papers_pmid = list(papers.pmid)
authors = authors[authors.pmid.isin(elegible_papers_pmid)]
print(len(authors), "\tTotal of authors *after* drop excluded pmid")

print("{}\tTotal of pmids in authors file".format(len(authors.pmid.value_counts())))
print("{}\tTotal of pmids in papers file".format(len(papers.pmid.value_counts(dropna=False))))
print("{}\tPMIDs diff between authors and papers".format(len(papers.pmid.value_counts()) - len(authors.pmid.value_counts())))

## Check if the author is a institution or a correpondent group (not an individual)
authors["is_institutions_name"] = [False for a in authors.full_name]
for index, row in authors.iterrows():
    affiliation_list= row['affiliation']
    remaped_list = []
    country_list = []
    if affiliation_list != "Collaborators": 
        affiliations = eval(str(affiliation_list))      
    authors.at[index, 'affiliation'] = affiliations
    authors.at[index, "is_institutions_name"] = is_groups_name(row.full_name)

42606 	Total of authors *before* drop duplicated
38678 	Total of authors *after* drop duplicated
15803 	Total of authors *after* drop excluded pmid
14607 	Total of authors *after* drop excluded pmid
1492	Total of pmids in authors file
1509	Total of pmids in papers file
17	PMIDs diff between authors and papers


## Create auxiliar columns to count authorship type

In [8]:
papers["count_authors"] = [0 for a in papers.authors_list]
papers["count_authors_institutions"] = [0 for a in papers.authors_list]
papers["count_authors_individual"] = [0 for a in papers.authors_list]

papers["count_collaborators"] = [0 for a in papers.authors_list]
papers["count_collaborators_institutions"] = [0 for a in papers.authors_list]
papers["count_collaborators_individual"] = [0 for a in papers.authors_list]

##For each paper, get the author list and count authors
for index, row in papers.iterrows():
    if not pd.isnull(row.authors_list) and str(row.authors_list) != "" and str(row.authors_list).lower() != "nan":
        authors_list = str(row.authors_list).split(",")
        papers.at[index, "count_authors_1"] = len(authors_list)    
        papers.at[index, "count_authors_institutions_1"] = sum([1 if any(institutions in authors_name for institutions in institutions_list) else 0 for authors_name in authors_list])
        papers.at[index, "count_authors_individual_1"] = papers.at[index, "count_authors"] - papers.at[index, "count_authors_institutions"]
    else:
        papers.at[index, "count_authors_1"] = 0
        papers.at[index, "count_authors_institutions_1"] = 0
        papers.at[index, "count_authors_individual_1"] = 0
        
    authors_list = list(authors[(authors.index_authorship != 0) & (authors.pmid == row.pmid)].full_name)
    papers.at[index, "count_authors"] = len(authors_list)    
    papers.at[index, "count_authors_institutions"] = sum([1 if any(institutions in authors_name for institutions in institutions_list) else 0 for authors_name in authors_list])
    papers.at[index, "count_authors_individual"] = papers.at[index, "count_authors"] - papers.at[index, "count_authors_institutions"]

    collaborators_list = list(authors[(authors.index_authorship == 0) & (authors.pmid == row.pmid)].full_name)
    papers.at[index, "count_collaborators"] = len(collaborators_list)
    papers.at[index, "count_collaborators_institutions"] = sum([1 if any(institutions in collaborators_name for institutions in institutions_list) else 0 for collaborators_name in collaborators_list])
    papers.at[index, "count_collaborators_individual"] = papers.at[index, "count_collaborators"] - papers.at[index, "count_collaborators_institutions"]
        
print("{}\tNumber of pmids".format(len(papers.pmid.unique())))
papers.drop_duplicates(inplace=True)
print("{}\tNumber rows".format(len(papers)))
#papers.to_csv(path + "papers_final.csv", index=False)

2529	Number of pmids
2529	Number rows


In [9]:
print("Total of authors:", sum(papers["count_authors"]), 
        "\nTotal of collaborators:", sum(papers["count_collaborators"]),
        "\nTotal:", sum(papers["count_authors"]) + sum(papers["count_collaborators"]),
        "\nTotal:", sum(papers["count_authors_1"]) + sum(papers["count_collaborators"])
     )

Total of authors: 18940 
Total of collaborators: 3540 
Total: 22480 
Total: 22480.0


## Validating numbers of authorship

In [10]:
only_collaboration_groups = papers[(papers["count_authors"] >= 1) & (papers["count_authors_individual"] == 0) & (papers["count_authors_institutions"] >= 1)]
only_individual_authors = papers[(papers["count_authors"] >= 1) & (papers["count_authors_individual"] >= 1) & (papers["count_authors_institutions"] == 0)]
both = papers[(papers["count_authors"] >= 1) & (papers["count_authors_individual"] >= 1) & (papers["count_authors_institutions"] >= 1)]
no_authors = papers[(papers["count_authors"] == 0)]

print("Papers with only collaborations groups authors: ", len(only_collaboration_groups))
print("Total of individual collaborators on papers with only collaborations groups authors: ", sum(only_collaboration_groups["count_collaborators_individual"]))
print("Total of collaborators on papers with only collaborations groups authors: ", sum(only_collaboration_groups["count_collaborators"]))
print("\nPapers with only individual authors: ", len(only_individual_authors))
print("Total of individual authors on papers with only individual authors: ", sum(only_individual_authors["count_authors_individual"]))
print("Total of authors on papers with only individual authors: ", sum(only_individual_authors["count_authors"]))
print("\nPapers of individual and collaboration authors: ", len(both))
print("Total of individual authors on papers with both types of authors: ", sum(both["count_authors_individual"]))
print("Total of authors on papers with both types of authors: ", sum(both["count_authors"]))
print("Total of colaborators groups on papers with both types of authors: ", sum(both["count_authors_institutions"]))
print("Total of individual collaborators on papers with both types of authors: ", sum(both["count_collaborators_individual"]))
print("Total of collaborators on papers with both types of authors: ", sum(both["count_collaborators"]))
print("\nPapers with no authors: ", len(no_authors))
#papers[(papers["count_authors"] == 0)].doi

Papers with only collaborations groups authors:  17
Total of individual collaborators on papers with only collaborations groups authors:  453
Total of collaborators on papers with only collaborations groups authors:  454

Papers with only individual authors:  2314
Total of individual authors on papers with only individual authors:  15733
Total of authors on papers with only individual authors:  15733

Papers of individual and collaboration authors:  178
Total of individual authors on papers with both types of authors:  3005
Total of authors on papers with both types of authors:  3187
Total of colaborators groups on papers with both types of authors:  182
Total of individual collaborators on papers with both types of authors:  3013
Total of collaborators on papers with both types of authors:  3014

Papers with no authors:  20


In [11]:
institutions_pmid = papers[(papers["count_authors"] >= 1) & (papers["count_authors_individual"] == 0) & 
                           (papers["count_authors_institutions"] >= 1) & (papers["count_collaborators"] == 0)].pmid
#authors[authors.pmid.isin(institutions_pmid)]
len(institutions_pmid)

11

In [12]:
n_papers = len(authors.pmid.value_counts())
n_authors = len(authors['full_name'])
print("{}\tNumber of pmids".format(len(authors.pmid.unique())))
print("{}\tNumber of authors".format(n_authors))
print("{}\tNumber unique authors".format(len(authors['full_name'].value_counts())))

authors, institutions = remove_groups_names(authors)

print("\n\tRemoving institutions")
print("{}\tNumber of pmids with authors (here is excluded the papers authored only by an institution and without collaborators)".format(len(authors.pmid.unique())))
print("{}\tRemoved papers".format(n_papers - len(authors.pmid.unique())))
print("{}\tNumber of authors".format(len(authors['full_name'])))
print("{}\tNumber unique authors".format(len(authors['full_name'].value_counts())))


print("{}\tNumber of institutions".format(n_authors - len(authors['full_name'])))
#authors.pmid.value_counts()

2509	Number of pmids
22480	Number of authors
13361	Number unique authors

	Removing institutions
2498	Number of pmids with authors (here is excluded the papers authored only by an institution and without collaborators)
11	Removed papers
22276	Number of authors
13213	Number unique authors
204	Number of institutions


In [13]:
authors

Unnamed: 0,full_name,affiliation,index_authorship,pmid,is_institutions_name
0,SUSANNE JACOBSSON,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,1,30456870,False
1,IRYNA BOIKO,[CLINICAL LABORATORY DEPARTMENT TERNOPIL REGIO...,2,30456870,False
2,DANIEL GOLPARIAN,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,3,30456870,False
3,KAREL BLONDEEL,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,4,30456870,False
4,JAMES KIARIE,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,5,30456870,False
...,...,...,...,...,...
42597,DORIS CHOU,[],3,23584466,False
42598,NOBUKO MIZOGUCHI,[],4,23584466,False
42599,LALE SAY,[],5,23584466,False
42600,EMI SUZUKI,[],6,23584466,False


In [14]:
authors = prepare_std_name(authors)
authors.tail(2)

Unnamed: 0,full_name,affiliation,index_authorship,pmid,is_institutions_name,last_name,first_letter_name,standard_name
42600,EMI SUZUKI,[],6,23584466,False,SUZUKI,E,E SUZUKI
42601,JOHN WILMOTH,[],7,23584466,False,WILMOTH,J,J WILMOTH


In [15]:
authors = create_deduplicated_columns(authors)

In [16]:
print("Number of authors: {}".format(len(authors)))
print("Number unique authors (after deduplication): {}\n".format(len(authors['deduplicated_name_std'].value_counts())))

Number of authors: 22276
Number unique authors (after deduplication): 10865



In [17]:
authors

Unnamed: 0,index,full_name,affiliation,index_authorship,pmid,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std
0,0,SUSANNE JACOBSSON,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,1,30456870,False,JACOBSSON,S,S JACOBSSON,SUSANNE JACOBSSON,SUSANNE JACOBSSON,1.0
1,1,IRYNA BOIKO,[CLINICAL LABORATORY DEPARTMENT TERNOPIL REGIO...,2,30456870,False,BOIKO,I,I BOIKO,IRYNA BOIKO,IRYNA BOIKO,1.0
2,2,DANIEL GOLPARIAN,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,3,30456870,False,GOLPARIAN,D,D GOLPARIAN,DANIEL GOLPARIAN,DANIEL GOLPARIAN,1.0
3,3,KAREL BLONDEEL,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,4,30456870,False,BLONDEEL,K,K BLONDEEL,KAREL BLONDEEL,KAREL BLONDEEL,1.0
4,4,JAMES KIARIE,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,5,30456870,False,KIARIE,J,J KIARIE,JAMES NJOGU KIARIE,"JAMES KIARIE, JAMES N KIARIE, JAMES NJOGU KIAR...",5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
22271,42597,DORIS CHOU,[],3,23584466,False,CHOU,D,D CHOU,DORIS CHOU,"DORIS CHOU, D CHOU",2.0
22272,42598,NOBUKO MIZOGUCHI,[],4,23584466,False,MIZOGUCHI,N,N MIZOGUCHI,NOBUKO MIZOGUCHI,NOBUKO MIZOGUCHI,1.0
22273,42599,LALE SAY,[],5,23584466,False,SAY,L,L SAY,LALE SAY,"LALE SAY, L SAY",2.0
22274,42600,EMI SUZUKI,[],6,23584466,False,SUZUKI,E,E SUZUKI,EMI SUZUKI,EMI SUZUKI,1.0


In [18]:
len(papers)

2529

In [19]:
sum(papers.authors_list.isna())

20

In [20]:
papers.index = list(papers.pmid)
authors.index = list(authors.pmid)
authors.head()

Unnamed: 0,index,full_name,affiliation,index_authorship,pmid,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std
30456870,0,SUSANNE JACOBSSON,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,1,30456870,False,JACOBSSON,S,S JACOBSSON,SUSANNE JACOBSSON,SUSANNE JACOBSSON,1.0
30456870,1,IRYNA BOIKO,[CLINICAL LABORATORY DEPARTMENT TERNOPIL REGIO...,2,30456870,False,BOIKO,I,I BOIKO,IRYNA BOIKO,IRYNA BOIKO,1.0
30456870,2,DANIEL GOLPARIAN,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,3,30456870,False,GOLPARIAN,D,D GOLPARIAN,DANIEL GOLPARIAN,DANIEL GOLPARIAN,1.0
30456870,3,KAREL BLONDEEL,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,4,30456870,False,BLONDEEL,K,K BLONDEEL,KAREL BLONDEEL,KAREL BLONDEEL,1.0
30456870,4,JAMES KIARIE,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,5,30456870,False,KIARIE,J,J KIARIE,JAMES NJOGU KIARIE,"JAMES KIARIE, JAMES N KIARIE, JAMES NJOGU KIAR...",5.0


In [21]:
publication_year = papers#[['pmid', 'publication_year']]#, 'first_author','last_author']]
publication_year = publication_year.join(authors, how="right", lsuffix="_l", rsuffix="").reset_index()
print(len(publication_year))
publication_year.head(2)#.affiliation.value_counts()

22276


Unnamed: 0,level_0,Term,Title,Tags,Origin,pmid_l,title,authors_list,citation,journal_book,...,affiliation,index_authorship,pmid,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std
0,1280686,1280686,Light and electron microscopic immunolocalizat...,Unverified,HRPPublicationsDatabase_UpdateDecember2014,1280686,Light and electron microscopic immunolocalizat...,"M HOMYK, J C HERR","Homyk M, Herr JC. Light and electron microscop...",Journal of reproductive immunology,...,[DEPARTMENT OF ANATOMY AND CELL BIOLOGY UNIVER...,1,1280686,False,HOMYK,M,M HOMYK,M HOMYK,M HOMYK,1.0
1,1280686,1280686,Light and electron microscopic immunolocalizat...,Unverified,HRPPublicationsDatabase_UpdateDecember2014,1280686,Light and electron microscopic immunolocalizat...,"M HOMYK, J C HERR","Homyk M, Herr JC. Light and electron microscop...",Journal of reproductive immunology,...,[],2,1280686,False,HERR,J,J HERR,J C HERR,J C HERR,1.0


In [22]:
publication_year.affiliation

0        [DEPARTMENT OF ANATOMY AND CELL BIOLOGY UNIVER...
1                                                       []
2        [DEPARTMENT OF PHYSIOLOGY FACULTY OF MEDICINE ...
3                                                       []
4        [CENTRAL VETERINARY INSTITUTE LELYSTAD THE NET...
                               ...                        
22271    [UCL INSTITUTE FOR GLOBAL HEALTH FACULTY OF PO...
22272    [UNDP UNFPA UNICEF WHO WORLD BANK SPECIAL PROG...
22273    [DEPARTMENT OF INFECTIOUS DISEASE EPIDEMIOLOGY...
22274    [UNDP UNFPA UNICEF WHO WORLD BANK SPECIAL PROG...
22275    [UNDP UNFPA UNICEF WHO WORLD BANK SPECIAL PROG...
Name: affiliation, Length: 22276, dtype: object

In [23]:
authors_without_affiliation = publication_year[publication_year.affiliation.map(len) == 0]
print(len(authors_without_affiliation),len(authors_without_affiliation)/len(publication_year)*100)

for index, row in authors_without_affiliation.iterrows():
    affiliation = publication_year[(publication_year.deduplicated_name_std == row.deduplicated_name_std) & 
                     ((publication_year.publication_year >= row.publication_year - 0) &
                      (publication_year.publication_year <= row.publication_year + 0)
                     ) & 
                     #(publication_year.affiliation != "[]") &
                     (publication_year.affiliation.map(len) > 0) &
                          (publication_year.affiliation != "Collaborators")]#.affiliation
    if len(affiliation) > 0:
        #print (index, publication_year.iloc[index].publication_year, "\n",affiliation[['publication_year', 'affiliation']])#.iloc[0])
        #print(affiliation.sort_values(by='publication_year', ascending=True).publication_year.iloc[0])
        publication_year.at[index, 'affiliation'] = affiliation.sort_values(by='publication_year', ascending=True).affiliation.iloc[0]
        
authors_without_affiliation = publication_year[publication_year.affiliation.map(len) == 0]
print(len(authors_without_affiliation), len(authors_without_affiliation)/len(publication_year)*100)

10262 46.06751660980427
8934 40.1059436164482


In [24]:
authors = publication_year
#authors.rename(columns={"pmid_l": "pmid"}, inplace=True)

In [25]:
authors.affiliation

0        [DEPARTMENT OF ANATOMY AND CELL BIOLOGY UNIVER...
1        [DEPARTMENT OF ANATOMY AND CELL BIOLOGY UNIVER...
2        [DEPARTMENT OF PHYSIOLOGY FACULTY OF MEDICINE ...
3                                                       []
4        [CENTRAL VETERINARY INSTITUTE LELYSTAD THE NET...
                               ...                        
22271    [UCL INSTITUTE FOR GLOBAL HEALTH FACULTY OF PO...
22272    [UNDP UNFPA UNICEF WHO WORLD BANK SPECIAL PROG...
22273    [DEPARTMENT OF INFECTIOUS DISEASE EPIDEMIOLOGY...
22274    [UNDP UNFPA UNICEF WHO WORLD BANK SPECIAL PROG...
22275    [UNDP UNFPA UNICEF WHO WORLD BANK SPECIAL PROG...
Name: affiliation, Length: 22276, dtype: object

In [26]:
type(authors.affiliation)

pandas.core.series.Series

In [27]:
remap_affiliation(authors)
#authors.head()

on 27: SPECIAL PROGRAMME OF RESEARCH DEVELOPMENT AND RESEARCH TRAINING IN HUMAN REPRODUCTION WORLD HEALTH ORGANISATION GENEVA SWITZERLAND.
on 37: WHO COLLABORATING CENTRE FOR RESEARCH IN HUMAN REPRODUCTION DEPARTMENT OF OBSTETRICS AND GYNECOLOGY ALBERT SZENT GYÖRGYI MEDICAL UNIVERSITY SZEGED HUNGARY.
on 174: UNIVERSITY OF EDINBURGH DEPARTMENT OF OBSTETRICS AND GYNAECOLOGY UK.                                            
on 475: UNIVERSIDADE FEDERAL DE JUIZ DE FORA BRAZIL.                                                                    ta: 31s) 
on 486: SPECIAL PROGRAMME OF RESEARCH ON HUMAN REPRODUCTION WORLD HEALTH ORGANIZATION GENEVA SWITZERLAND.               
on 640: UNIVERSITY OF VIRGINIA CHARLOTTESVILLE.                                                                         
on 660: UNIVERSITY OF VIRGINIA CHARLOTTESVILLE.                                                                         
on 1512: UNIDADE DE GONADAS E INTERSEXO FACULDADE DE MEDICINA HOSPITAL DAS CLINICAS 

on 4980: WORLD HEALTH ORGANIZATION COLLABORATING CENTER IN REPRODUCTIVE HEALTH DIVISION OF REPRODUCTIVE HEALTH NATIONAL CENTER FOR CHRONIC DISEASE PREVENTION AND HEALTH PROMOTION CENTERS FOR DISEASE CONTROL AND PREVENTION ATLANTA GA 30341 USA. KMC6@CDC.GOV
on 4985: WORLD HEALTH ORGANIZATION COLLABORATING CENTER IN REPRODUCTIVE HEALTH DIVISION OF REPRODUCTIVE HEALTH NATIONAL CENTER FOR CHRONIC DISEASE PREVENTION AND HEALTH PROMOTION CENTERS FOR DISEASE CONTROL AND PREVENTION ATLANTA GA 30341 USA. KMC6@CDC.GOV
on 4988: WORLD HEALTH ORGANIZATION COLLABORATING CENTER IN REPRODUCTIVE HEALTH DIVISION OF REPRODUCTIVE HEALTH NATIONAL CENTER FOR CHRONIC DISEASE PREVENTION AND HEALTH PROMOTION CENTERS FOR DISEASE CONTROL AND PREVENTION ATLANTA GA 30341 USA. KMC6@CDC.GOV
on 4991: WHO COLLABORATING CENTER IN REPRODUCTIVE HEALTH DIVISION OF REPRODUCTIVE HEALTH CENTERS FOR DISEASE CONTROL AND PREVENTION ATLANTA GA 30341 USA.
on 4992: WORLD HEALTH ORGANIZATION COLLABORATING CENTER IN REPRODUCTIVE HEA

In [28]:
for index, aff in authors.iterrows():
    if (aff.pmid in [17308725, 23240739, 27227224]) & (aff.index_authorship==1):
        print(aff.affiliation)
#affiliations_unique.loc[affiliations_unique.original == clean_string("DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND.", True)].reset_index().remap

for index, aff in authors.iterrows():
    if ("DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND." in aff.affiliation) \
    & (aff.pmid in [17308725, 23240739, 27227224]) & (aff.index_authorship==1):
        print(aff.full_name)

['DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND.']
['DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND.']
['DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND.']
IQBAL H SHAH
MARLEEN TEMMERMAN
A METIN GÜLMEZOGLU


In [29]:
authors.to_csv(path+'deduplicated_authors.csv', encoding="utf-8", index=False)

In [30]:
print("Number of authors: {}".format(len(authors['full_name'])))
print("Number unique authors: {}".format(len(authors['full_name'].value_counts())))

print("Number of authors: {}".format(len(authors['deduplicated_name_std'])))
print("Number unique authors: {}".format(len(authors['deduplicated_name_std'].value_counts())))

Number of authors: 22276
Number unique authors: 13213
Number of authors: 22276
Number unique authors: 10865


In [31]:
gender_list = []
unique_names = authors['deduplicated_name_std'].value_counts().index

with alive_bar(len(unique_names), force_tty=True) as bar:
    for author in unique_names:
        list_names_variation = get_list_names(authors, author)
        author_guessed_gender = guess_gender_in_list(list_names_variation)
        
        ## For last name
        if author_guessed_gender == 'unknown':            
            author_guessed_gender = guess_gender_in_list(list_names_variation, last=True)
        
        authors.loc[authors['deduplicated_name_std'] == author, 'gender'] = author_guessed_gender
        gender_list += [author_guessed_gender]
        #print(author, author_guessed_gender)
        bar()
        

|████████████████████████████████████████| 10865/10865 [100%] in 1:31.7 (118.53/s)                                      : 2:24) 94/10865 [1%] in 1s (79.5/s, eta: 2:19)  111/10865 [1%] in 1s (81.8/s, eta: 2:13) 191/10865 [2%] in 2s (91.5/s, eta: 1:59) 639/10865 [6%] in 6s (111.1/s, eta: 1:32) (112.7/s, eta: 1:31)  742/10865 [7%] in 6s (114.4/s, eta: 1:29)  ▆▄▂ 1023/10865 [9%] in 9s (119.8/s, eta: 1:23) in 10s (115.6/s, eta: 1:24) ▂▄▆ 1236/10865 [11%] in 12s (100.5/s, eta: 1:35)   ▃▅▇ 1338/10865 [12%] in 14s (98.9/s, eta: 1:36)  1364/10865 [13%] in 14s (98.6/s, eta: 1:36)  ▃▅▇ 1560/10865 [14%] in 16s (99.3/s, eta: 1:34)  ▁▃▅ 1653/10865 [15%] in 17s (99.8/s, eta: 1:32)  ▂▂▄ 1760/10865 [16%] in 18s (100.5/s, eta: 1:31) (101.2/s, eta: 1:28)  ▁▃▅ 1979/10865 [18%] in 20s (101.1/s, eta: 1:28) ▄▆█ 1997/10865 [18%] in 20s (100.6/s, eta: 1:28) ▄▂▂ 2136/10865 [20%] in 22s (98.8/s, eta: 1:28) ▁▃▅ 2159/10865 [20%] in 22s (98.5/s, eta: 1:28)  2267/10865 [21%] in 23s (98.3/s, eta: 1:28) (98.4/s, eta: 

In [32]:
authors['constant'] =["N={}".format(len(authors)) for i in authors['gender']]
tabelaEstiloArtigoCategoricas(authors, 'constant', 'gender', probabilidade='col')

  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))
  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))


constant,N=22276,N=22276,All,All
distribuição,N,%,N,%
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
andy,1593,(7.15%),1593,(7.15%)
female,7897,(35.45%),7897,(35.45%)
male,7867,(35.32%),7867,(35.32%)
unknown,4919,(22.08%),4919,(22.08%)
All,22276,(100.0%),22276,(100.0%)


In [33]:
unknown_gender_authors = authors[(authors.gender == 'unknown') | (authors.gender == 'andy')]
print("Total of unique names unknown gender: {} ({:.2%})".
      format(len(unknown_gender_authors.deduplicated_name_std.unique()), 
            len(unknown_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names unknown gender: {} ({:.2%})\n".format(len(unknown_gender_authors.deduplicated_name_std),
            len(unknown_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))

known_gender_authors = authors[authors.gender != 'unknown']
print("Total of unique names suggested gender: {} ({:.2%})".
      format(len(known_gender_authors.deduplicated_name_std.unique()), 
            len(known_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names suggested gender: {} ({:.2%})".format(len(known_gender_authors.deduplicated_name_std),
            len(known_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))

Total of unique names unknown gender: 3644 (33.54%)
Total of names unknown gender: 6512 (29.23%)

Total of unique names suggested gender: 7985 (73.49%)
Total of names suggested gender: 17357 (77.92%)


In [34]:
#len(unknown_gender_authors.value_counts())
#unknown_gender_authors.deduplicated_name_std.value_counts()

In [35]:
names_checked = pd.read_csv(path + '..\\Reviewed Files\\01_names_checked_v2.csv')
names_checked.deduplicated_name_std = names_checked.deduplicated_name_std.str.title()
#names_checked.head(5)
names_checked.gender.value_counts()

female     367
male       315
unknown    305
andy        15
Name: gender, dtype: int64

In [36]:
for index, row in unknown_gender_authors.iterrows():
    gender = names_checked[names_checked.deduplicated_name_std == row.deduplicated_name_std.title()]
    if len(gender) > 0:
        #print(row.deduplicated_name_std.title())
        authors.loc[index, 'gender'] = gender.iloc[0].gender
        #print(row.deduplicated_name_std.title(), gender)

In [37]:
tabelaEstiloArtigoCategoricas(authors, 'constant', 'gender', probabilidade='col')

  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))
  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))


constant,N=22276,N=22276,All,All
distribuição,N,%,N,%
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
andy,1601,(7.19%),1601,(7.19%)
female,8508,(38.19%),8508,(38.19%)
male,8707,(39.09%),8707,(39.09%)
unknown,3460,(15.53%),3460,(15.53%)
All,22276,(100.0%),22276,(100.0%)


In [38]:
unknown_gender_authors = authors[(authors.gender == 'unknown') | (authors.gender == 'andy')]
print("Total of unique names unknown gender: {} ({:.2%})".
      format(len(unknown_gender_authors.deduplicated_name_std.unique()), 
            len(unknown_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names unknown gender: {} ({:.2%})\n".format(len(unknown_gender_authors.deduplicated_name_std),
            len(unknown_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))

known_gender_authors = authors[authors.gender != 'unknown']
print("Total of unique names suggested gender: {} ({:.2%})".
      format(len(known_gender_authors.deduplicated_name_std.unique()), 
            len(known_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names suggested gender: {} ({:.2%})".format(len(known_gender_authors.deduplicated_name_std),
            len(known_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))

Total of unique names unknown gender: 3196 (29.42%)
Total of names unknown gender: 5061 (22.72%)

Total of unique names suggested gender: 8444 (77.72%)
Total of names suggested gender: 18816 (84.47%)


In [39]:
authors.gender.replace('andy', 'unknown', inplace=True)
tabelaEstiloArtigoCategoricas(authors, 'constant', 'gender', probabilidade='col')

  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))
  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))


constant,N=22276,N=22276,All,All
distribuição,N,%,N,%
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,8508,(38.19%),8508,(38.19%)
male,8707,(39.09%),8707,(39.09%)
unknown,5061,(22.72%),5061,(22.72%)
All,22276,(100.0%),22276,(100.0%)


In [40]:
authors.tail(2)

Unnamed: 0,level_0,Term,Title,Tags,Origin,pmid_l,title,authors_list,citation,journal_book,...,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std,countries,gender,constant
22274,35590230,10.1016/j.medj.2021.04.023,The role of the health sector in contributing ...,Included by VB,covidence-doi,35590230,The role of the health sector in contributing ...,"CHRISTINA C PALLITTO, WISAL AHMED","Pallitto CC, Ahmed W. The role of the health s...","Med (New York, N.Y.)",...,False,PALLITTO,C,C PALLITTO,CHRISTINA CATHERINE PALLITTO,"CHRISTINA PALLITTO, CHRISTINA CATHERINE PALLIT...",5.0,[SWITZERLAND],female,N=22276
22275,35590230,10.1016/j.medj.2021.04.023,The role of the health sector in contributing ...,Included by VB,covidence-doi,35590230,The role of the health sector in contributing ...,"CHRISTINA C PALLITTO, WISAL AHMED","Pallitto CC, Ahmed W. The role of the health s...","Med (New York, N.Y.)",...,False,AHMED,W,W AHMED,WISAL AHMED,WISAL AHMED,1.0,[SWITZERLAND],male,N=22276


In [41]:
names_checked[names_checked.deduplicated_name_std == "I K WARRINER".title()]

Unnamed: 0,deduplicated_name_std,first_name,gender
429,I K Warriner,I,female


In [42]:
names_checked.tail()

Unnamed: 0,deduplicated_name_std,first_name,gender
997,Z Fekete,Z,unknown
998,Zafiro Andrade Romo,ZAFIRO,male
999,C Abouzahr,C,female
1000,Bidia D Deperthes,BIDIA,female
1001,Qian Long,QIAN,female


In [43]:
authors[authors.deduplicated_name_std == "I K WARRINER"]#, 'affiliation']

Unnamed: 0,level_0,Term,Title,Tags,Origin,pmid_l,title,authors_list,citation,journal_book,...,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std,countries,gender,constant
5318,17141703,17141703,Rates of complication in first-trimester manua...,Included by VB,covidence-doi,17141703,Rates of complication in first-trimester manua...,"I K WARRINER, O MEIRIK, M HOFFMAN, C MORRONI, ...","Warriner IK, Meirik O, Hoffman M, et al. Rates...","Lancet (London, England)",...,False,WARRINER,I,I WARRINER,I K WARRINER,I K WARRINER,1.0,[SWITZERLAND],female,N=22276
12802,29246235,10.1186/s12978-017-0438-7,Comparative satisfaction of receiving medical ...,Included by VB,covidence-doi,29246235,Comparative satisfaction of receiving medical ...,"ANAND TAMANG, IQBAL H SHAH, PRAGYA SHRESTHA, I...","Tamang A, Shah IH, Shrestha P, et al. Comparat...",Reproductive health,...,False,WARRINER,I,I WARRINER,I K WARRINER,I K WARRINER,1.0,[UNITED STATES OF AMERICA],female,N=22276


In [44]:
unique_authors = authors[["deduplicated_name_std", "gender", "countries"]].drop_duplicates(subset=["deduplicated_name_std"])
unique_authors['constant'] =["N={}".format(len(unique_authors)) for i in unique_authors['gender']]
tabelaEstiloArtigoCategoricas(unique_authors, 'constant', 'gender', probabilidade='col')

  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))
  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))


constant,N=10865,N=10865,All,All
distribuição,N,%,N,%
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,3703,(34.08%),3703,(34.08%)
male,3966,(36.5%),3966,(36.5%)
unknown,3196,(29.42%),3196,(29.42%)
All,10865,(100.0%),10865,(100.0%)


In [45]:
unique_authors.head(2)

Unnamed: 0,deduplicated_name_std,gender,countries,constant
0,M HOMYK,unknown,[UNITED STATES OF AMERICA],N=10865
1,J C HERR,unknown,[UNITED STATES OF AMERICA],N=10865


In [46]:
unknown_gender_authors = authors[authors.gender == 'unknown']
#authors.gender.replace('andy', 'unknown', inplace=True)
#authors.gender.replace('unknown', np.nan, inplace=True)

authors.to_csv(path+'authors_gender.csv', encoding="utf-8", index=False)

In [47]:
#authors.gender.value_counts()

In [48]:
#authors.pmid.value_counts()

In [49]:
print(authors.gender.isna().sum())
tabelaEstiloArtigoCategoricas(authors, 'constant', 'gender', probabilidade='col')

0


  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))
  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))


constant,N=22276,N=22276,All,All
distribuição,N,%,N,%
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,8508,(38.19%),8508,(38.19%)
male,8707,(39.09%),8707,(39.09%)
unknown,5061,(22.72%),5061,(22.72%)
All,22276,(100.0%),22276,(100.0%)


In [50]:
## Check if there are names without gender classifications
unknown_gender_authors = authors[authors.gender == np.nan]
print("Total of unique names unknown gender: {} ({:.2%})".
      format(len(unknown_gender_authors.deduplicated_name_std.unique()), 
            len(unknown_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names unknown gender: {} ({:.2%})\n".format(len(unknown_gender_authors.deduplicated_name_std),
            len(unknown_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))

known_gender_authors = authors[authors.gender != 'unknown']
print("Total of unique names suggested gender: {} ({:.2%})".
      format(len(known_gender_authors.deduplicated_name_std.unique()), 
            len(known_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names suggested gender: {} ({:.2%})".format(len(known_gender_authors.deduplicated_name_std),
            len(known_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))


Total of unique names unknown gender: 0 (0.00%)
Total of names unknown gender: 0 (0.00%)

Total of unique names suggested gender: 7669 (70.58%)
Total of names suggested gender: 17215 (77.28%)


In [51]:
unknown_gender_authors = authors[authors.gender == 'unknown'].groupby('deduplicated_name_std')[['pmid']].count().sort_values(by='pmid', ascending=False)
unknown_gender_authors['first_name_is_letter'] = [1 if len(first_name.split(' ')[0]) == 1 else 0 for first_name in unknown_gender_authors.index]
unknown_gender_authors['first_name'] = [first_name.split(' ')[0] for first_name in unknown_gender_authors.index]


unknown_gender_authors.to_csv(path + "author_unknown_gender.csv", encoding="utf-8")
unknown_gender_authors.head()

Unnamed: 0_level_0,pmid,first_name_is_letter,first_name
deduplicated_name_std,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YONGJIAN LIU,35,0,YONGJIAN
DEBABRATA GHOSH,28,0,DEBABRATA
Y W LOKE,28,1,Y
JAYASREE SENGUPTA,28,0,JAYASREE
BIRAN AFFANDI,26,0,BIRAN


In [52]:
unknown_gender_authors.first_name.value_counts().to_csv(path+"first_name_unknow_gender_count.csv", encoding="utf-8")
unknown_gender_authors#.first_name.value_counts()

Unnamed: 0_level_0,pmid,first_name_is_letter,first_name
deduplicated_name_std,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YONGJIAN LIU,35,0,YONGJIAN
DEBABRATA GHOSH,28,0,DEBABRATA
Y W LOKE,28,1,Y
JAYASREE SENGUPTA,28,0,JAYASREE
BIRAN AFFANDI,26,0,BIRAN
...,...,...,...
J A CLEMENTS,1,1,J
J A GRISSO,1,1,J
J A HORCAJADAS,1,1,J
J A LINDGREN,1,1,J


In [53]:
papers
profile = ProfileReport(papers.drop("abstract", axis=1))#, minimal=True)
profile.to_file(path + "papers.html")

Summarize dataset:   0%|          | 0/41 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
authors
profile = ProfileReport(authors, minimal=True)
profile.to_file(path + "authors.html")

In [None]:
profile = ProfileReport(authors)#, minimal=True)
profile.to_file(path + "authors_max.html")

profile = ProfileReport(authors[authors.index_authorship > 0], minimal=True)
profile.to_file(path + "authors_with_index.html")

profile = ProfileReport(authors[authors.index_authorship == 0], minimal=True)
profile.to_file(path + "colaborators.html")

In [None]:
unknown_gender_authors

profile = ProfileReport(unknown_gender_authors, minimal=True)
profile.to_file(path + "unknown_gender_authors.html")