In [1]:
import csv
import json
import time
import pandas as pd
import numpy as np

import gender_guesser.detector as gender

import requests
import re
import jellyfish
import spacy
import geonamescache

from functions import *

from os import makedirs
from os.path import exists, join
from unidecode import unidecode
from collections import Counter
from pandas_profiling import ProfileReport
from spacy import displacy 
from alive_progress import alive_bar

nlp = spacy.load("en_core_web_lg")
d = gender.Detector()
gc = geonamescache.GeonamesCache()
# gets nested dictionary for countries
countries = gc.get_countries()
# gets nested dictionary for cities
cities = gc.get_cities()

import sys  
sys.path.insert(0, 'D:\Google Drive\Analises\Codigos python')
from build_features import *


In [2]:
def gen_dict_extract(var, key):
    if isinstance(var, dict):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, (dict, list)):
                yield from gen_dict_extract(v, key)
    elif isinstance(var, list):
        for d in var:
            yield from gen_dict_extract(d, key)

cities = [*gen_dict_extract(cities, 'name')]
countries = [*gen_dict_extract(countries, 'name')]

countries = [clean_string(country) for country in countries]
cities = [clean_string(city) for city in cities]

In [3]:
institutions_list = ['World Health Organization', 'WHO', 'Research', 'Reproductive', 'Study', 'Health', 
             'GROUP', 'NETWORK', 'Team', 'University', 'Project', 'IEDEA', 
             'Consortium', 'committee', 'all the authors', 'for IeDEA-Southern Africa', 'Systematic', 
             'collaborations', 'Organizacion', 'College', 'Consortium', 'Association', 'Survey', 'Expert',
             'de la Salud', 'Control','collaborations', 'Collaboration', 'committee', 'Universite', 'BURKINA FASO']
institutions_list = [institutions.upper() for institutions in institutions_list]


def prepare_std_name(authors):
    last_name = []
    first_letter_name = []
    standard_name = []
    for nome in authors['full_name']:
        nome = re.sub(' +', ' ', unidecode(nome).replace('-', ' ').strip().upper())
        last_name += [nome.split(" ")[-1]]
        first_letter_name += [nome.split(" ")[0][0]]
        standard_name += [nome.split(" ")[0][0] + " " + nome.split(" ")[-1]]
    
    authors['last_name'] = last_name
    authors['first_letter_name'] = first_letter_name
    authors['standard_name'] = standard_name
    
    return authors

def get_longest_name(group):        
    names_list = group['unique'].iloc[0].tolist()
    candidate_name = unidecode(max(names_list, key=len))

    while len(candidate_name.split(' ')) == 1 and len(names_list) > 1:
        names_list.remove(candidate_name)
        candidate_name = unidecode(max(names_list, key=len))   
    return candidate_name


def deduplica_nome(inicial_data, column_group, column_agg):
    return_data = pd.DataFrame(inicial_data)
    names_variation_unique = inicial_data.groupby(column_group)[column_agg].agg(['unique']).reset_index()
    authors.to_csv(path+'names_variation_unique.csv', encoding="utf-8", index=False)

    return_data = return_data.reset_index()
    for index, row in return_data.iterrows():
        group = names_variation_unique[names_variation_unique[column_group] == row[column_group]]
        candidate_name = get_longest_name(group)
        
        return_data.loc[index, 'deduplicated_name_std'] = candidate_name
        return_data.loc[index, 'names_variation_std'] = ", ".join(group.unique.iloc[0].tolist())
        return_data.loc[index, 'n_variacoes_std'] = len(group.unique.iloc[0].tolist())
        #print(row[column_group], candidate_name, ", ".join(group.unique.iloc[0].tolist()), len(group.unique.iloc[0].tolist()))
            
    return return_data

def gess_gender_author(name):
    first_name = name.split(' ')[0]
    guessed_gender = 'unknown'
    
    if len(first_name) > 1:
        guessed_gender = d.get_gender(first_name.title())
    
    if guessed_gender =='mostly_female':
        guessed_gender = 'female'
    if guessed_gender =='mostly_male':
        guessed_gender = 'male'
    
    return guessed_gender


def create_deduplicated_columns(authors):
    deduplicate_data =  deduplica_nome(authors, "standard_name", "full_name")    
    return deduplicate_data


def is_groups_name(nome):
    if any(s.upper() in nome.upper() for s in institutions_list):
        return True
    else:
        return False

def remove_groups_names(authors):  
    institutions = []
    for nome in authors.full_name.value_counts().index:
        if is_groups_name(nome):
            authors = authors[authors.full_name != nome]
            if nome not in institutions:
                institutions += [nome]
                
    return authors, institutions


def get_list_names(authors, author):
    list_names = authors[authors['deduplicated_name_std'] == author].iloc[0]['names_variation_std'].split(", ")    
    return list_names

def guess_gender_in_list(list_names, last=False):
    #print(list_names)
    author_guessed_gender = 'unknown'
    index_name = 0
    while (author_guessed_gender == 'unknown' and len(list_names) > index_name):
        if last:
            name = list_names[index_name].split(" ")[-1]
        else:
            name = list_names[index_name].split(" ")[0]
            
        author_guessed_gender = gess_gender_author(name)
        #print(name, author_guessed_gender, index_name)
        index_name = index_name + 1
    
    return author_guessed_gender

def remap_affiliation(authors):   
    secondary_list = []
    authors['countries'] = [[] for i in authors['affiliation']]  
    authors['countries'] = authors['countries'].astype('object')
    affiliations_unique = pd.read_csv(path+'..\\Reviewed Files\\01_affiliations_unique_with_countries_v4.csv')
    affiliations_unique.original = [clean_string(affiliation, True) for affiliation in affiliations_unique.original]
    with alive_bar(len(authors['affiliation']), force_tty=True) as bar:
        for index, row in authors.iterrows():
            affiliation_list= row['affiliation']
            remaped_list = []
            country_list = []
            if affiliation_list != "Collaborators":  
                #print(affiliation_list)
                affiliations = affiliation_list
                
                for affiliation in (affiliations):
                    if not pd.isnull(affiliation):
                        remaps = affiliations_unique.loc[affiliations_unique.original == clean_string(affiliation, True)].reset_index()
                        if len(remaps) > 0:
                            remaped_list += [remaps.iloc[0]['remap']]
                            country_list += [remaps.iloc[0]['country']]
                        else:
                            print(affiliation)
                            remaped_list += [affiliation]
                            secondary_list += [clean_string(str(affiliation), True)]
                authors.at[index, 'affiliation'] = remaped_list
                authors.at[index, 'countries'] = country_list
            bar()
    df = pd.DataFrame(secondary_list, columns=["affiliations_secondary"])
    df.to_csv(path+'secondary_list.csv', index=False)

# Preparação do banco

In [4]:
path = 'C:\\Users\\livia\\Dropbox\\HRP Alliance authorship paper\\Data 2022-06-15\\'

papers = pd.read_csv(path + 'papers_pubmed.csv')
print(len(papers))
papers_reports = pd.read_csv(path + "papers_pubmed_reports_v2.csv")
print(len(papers_reports))

papers.drop_duplicates(subset="pmid", inplace=True)
print(len(papers))

papers = pd.concat([papers, papers_reports])
print(len(papers))

#print(papers[papers.duplicated(subset="pmid")])#[["title", "pmid", "doi", "authors_list"]])
papers.drop_duplicates(subset="pmid", inplace=True)
print(len(papers))
papers = papers.drop("Unnamed: 0", axis = 1)
papers.reset_index(inplace=True)
papers = papers.drop("index", axis = 1)

print("Number of pmids: {}".format(len(papers.pmid.unique())))
print("Number rows: {}".format(len(papers)))

843
208
843
1051
1051
Number of pmids: 1051
Number rows: 1051


In [5]:
len(papers[papers.publication_year >= 2022].pmid)

29

In [6]:
pmid_to_delete = list(papers[papers.doi == "10.1159/000343054"].pmid)
pmid_to_delete += [27227232]
pmid_to_delete += list(papers[papers.publication_type == "Published Erratum"].pmid)
pmid_to_delete += list(papers[papers.publication_year >= 2022].pmid)
papers = papers[~papers.pmid.isin(pmid_to_delete)]


print("Pmid to delete: {}".format(len(pmid_to_delete)))
print("Number of pmids: {}".format(len(papers.pmid.unique())))
print("Number rows: {}".format(len(papers)))

Pmid to delete: 44
Number of pmids: 1007
Number rows: 1007


In [7]:
authors = pd.read_csv(path + 'authors.csv').reset_index()
authors = authors.drop("index", axis = 1)
print(len(authors))

authors_reports = pd.read_csv(path + 'authors_reports_v2.csv').reset_index()
authors_reports = authors_reports.drop("index", axis = 1)
print(len(authors_reports))

authors = pd.concat([authors, authors_reports])
authors = authors.reset_index()
authors = authors.drop("Unnamed: 0", axis = 1)
authors = authors.drop("index", axis = 1)
print(len(authors))
authors.drop_duplicates(inplace=True)
print(len(authors))


authors = authors[~authors.pmid.isin(pmid_to_delete)]

print(len(authors.pmid.value_counts()))

authors["is_institutions_name"] = [False for a in authors.full_name]
for index, row in authors.iterrows():
    affiliation_list= row['affiliation']
    remaped_list = []
    country_list = []
    if affiliation_list != "Collaborators": 
        affiliations = eval(str(affiliation_list))      
    authors.at[index, 'affiliation'] = affiliations
    authors.at[index, "is_institutions_name"] = is_groups_name(row.full_name)


8444
2906
11350
11001
987


In [8]:
papers["count_authors"] = [0 for a in papers.authors_list]
papers["count_authors_institutions"] = [0 for a in papers.authors_list]
papers["count_authors_individual"] = [0 for a in papers.authors_list]

papers["count_collaborators"] = [0 for a in papers.authors_list]
papers["count_collaborators_institutions"] = [0 for a in papers.authors_list]
papers["count_collaborators_individual"] = [0 for a in papers.authors_list]

for index, row in papers.iterrows():
    if not pd.isnull(row.authors_list) and str(row.authors_list) != "" and str(row.authors_list).lower() != "nan":
        authors_list = str(row.authors_list).split(",")
        papers.at[index, "count_authors"] = len(authors_list)    
        papers.at[index, "count_authors_institutions"] = sum([1 if any(institutions in authors_name for institutions in institutions_list) else 0 for authors_name in authors_list])
        papers.at[index, "count_authors_individual"] = papers.at[index, "count_authors"] - papers.at[index, "count_authors_institutions"]
        
    collaborators_list = list(authors[(authors.index_authorship == 0) & (authors.pmid == row.pmid)].full_name)
    papers.at[index, "count_collaborators"] = len(collaborators_list)
    papers.at[index, "count_collaborators_institutions"] = sum([1 if any(institutions in collaborators_name for institutions in institutions_list) else 0 for collaborators_name in collaborators_list])
    papers.at[index, "count_collaborators_individual"] = papers.at[index, "count_collaborators"] - papers.at[index, "count_collaborators_institutions"]
        
print("Number of pmids: {}".format(len(papers.pmid.unique())))
papers.drop_duplicates(inplace=True)
print("Number rows: {}".format(len(papers)))
papers.to_csv(path + "..\\papers_final.csv", index=False)

Number of pmids: 1007
Number rows: 1007


In [9]:
sum(papers["count_authors"])
sum(papers["count_collaborators"])

2894

In [10]:
only_collaboration_groups = papers[(papers["count_authors"] >= 1) & (papers["count_authors_individual"] == 0) & (papers["count_authors_institutions"] >= 1)]
only_individual_authors = papers[(papers["count_authors"] >= 1) & (papers["count_authors_individual"] >= 1) & (papers["count_authors_institutions"] == 0)]
both = papers[(papers["count_authors"] >= 1) & (papers["count_authors_individual"] >= 1) & (papers["count_authors_institutions"] >= 1)]
no_authors = papers[(papers["count_authors"] == 0)]

print("Papers with only collaborations groups authors: ", len(only_collaboration_groups))
print("Total of individual collaborators on papers with only collaborations groups authors: ", sum(only_collaboration_groups["count_collaborators_individual"]))
print("Total of collaborators on papers with only collaborations groups authors: ", sum(only_collaboration_groups["count_collaborators"]))
print("\nPapers with only individual authors: ", len(only_individual_authors))
print("Total of individual authors on papers with only individual authors: ", sum(only_individual_authors["count_authors_individual"]))
print("Total of authors on papers with only individual authors: ", sum(only_individual_authors["count_authors"]))
print("\nPapers of individual and collaboration authors: ", len(both))
print("Total of individual authors on papers with both types of authors: ", sum(both["count_authors_individual"]))
print("Total of authors on papers with both types of authors: ", sum(both["count_authors"]))
print("Total of colaborators groups on papers with both types of authors: ", sum(both["count_authors_institutions"]))
print("Total of individual collaborators on papers with both types of authors: ", sum(both["count_collaborators_individual"]))
print("Total of collaborators on papers with both types of authors: ", sum(both["count_collaborators"]))
print("\nPapers with no authors: ", len(no_authors))
#papers[(papers["count_authors"] == 0)].doi

Papers with only collaborations groups authors:  6
Total of individual collaborators on papers with only collaborations groups authors:  409
Total of collaborators on papers with only collaborations groups authors:  410

Papers with only individual authors:  866
Total of individual authors on papers with only individual authors:  6042
Total of authors on papers with only individual authors:  6042

Papers of individual and collaboration authors:  115
Total of individual authors on papers with both types of authors:  1419
Total of authors on papers with both types of authors:  1538
Total of colaborators groups on papers with both types of authors:  119
Total of individual collaborators on papers with both types of authors:  2483
Total of collaborators on papers with both types of authors:  2484

Papers with no authors:  20


In [11]:
papers[(papers["count_authors"] == 1) & (papers["count_authors_individual"] == 0) & (papers["count_authors_institutions"] >= 1)]

Unnamed: 0,pmid,title,authors_list,citation,journal_book,publication_year,pmcid,doi,publication_type,mesh_terms,term,is_systematic_review,abstract,count_authors,count_authors_institutions,count_authors_individual,count_collaborators,count_collaborators_institutions,count_collaborators_individual
550,20543706,Eighteen-month follow-up of HIV-1-infected mot...,KESHO BORA STUDY GROUP,Kesho Bora Study Group. Eighteen-month follow-...,Journal of acquired immune deficiency syndrome...,2010,,10.1097/QAI.0b013e3181e36634,"Randomized Controlled Trial,Research Support, ...","['Adult', 'Anti-HIV Agents / adverse effects',...",10.1097/qai.0b013e3181e36634,0.0,Abstract\n \n \n\n\n\n Ob...,1,1,0,52,0,52
570,31420064,The World Health Organization ACTION-I (Antena...,WHO ACTION TRIALS COLLABORATORS,WHO ACTION Trials Collaborators. The World Hea...,Trials,2019,PMC6698040,10.1186/s13063-019-3488-z,"Multicenter Study,Randomized Controlled Trial","['Dexamethasone / therapeutic use', 'Double-Bl...",10.1186/s13063-019-3488-z,0.0,Abstract\n \n \n\n\n\n Ba...,1,1,0,151,0,151
646,28215849,The Global Maternal and Neonatal Sepsis Initia...,GLOBAL MATERNAL AND NEONATAL SEPSIS INITIATIVE...,Global Maternal and Neonatal Sepsis Initiative...,The Lancet. Global health,2017,,10.1016/S2214-109X(17)30020-7,,"['Cooperative Behavior', 'Female', 'Global Hea...",10.1016/s2214-109x(17)30020-7,0.0,,1,1,0,0,0,0
713,22573845,Maternal HIV-1 disease progression 18-24 month...,KESHO BORA STUDY GROUP,Kesho Bora Study Group. Maternal HIV-1 disease...,Clinical infectious diseases : an official pub...,2012,PMC3393708,10.1093/cid/cis461,Randomized Controlled Trial,"['Adult', 'Anti-Retroviral Agents / administra...",10.1093/cid/cis461,0.0,Abstract\n \n \n\n\n\n Ba...,1,1,0,40,1,39
970,18400840,Intrauterine devices and intrauterine systems,ESHRE CAPRI WORKSHOP GROUP,ESHRE Capri Workshop Group. Intrauterine devic...,Human reproduction update,2008,,10.1093/humupd/dmn003,"Research Support, Non-U.S. Gov't,Review","['Contraceptive Agents, Female / administratio...",18400840,0.0,Abstract\n \n \n\n\n \n ...,1,1,0,0,0,0
1003,32353314,Frequency and management of maternal infection...,WHO GLOBAL MATERNAL SEPSIS STUDY (GLOSS) RESEA...,WHO Global Maternal Sepsis Study (GLOSS) Resea...,The Lancet. Global health,2020,PMC7196885,10.1016/S2214-109X(20)30109-1,"Research Support, Non-U.S. Gov't,Research Supp...","['Adult', 'Cohort Studies', 'Female', 'Global ...",32353314,0.0,Abstract\n \n \n\n\n\n Ba...,1,1,0,167,0,167


In [12]:
authors.head()
print(len(authors.pmid.value_counts()))
n_authors = len(authors['full_name'])
print("Number of pmids: {}".format(len(authors.pmid.unique())))
print("Number of authors: {}".format(n_authors))
print("Number unique authors: {}\n".format(len(authors['full_name'].value_counts())))

authors, institutions = remove_groups_names(authors)

print("Number of pmids without group names: {}".format(len(authors.pmid.unique())))
print("Removing institutions")
print("Number of authors: {}".format(len(authors['full_name'])))
print("Number unique authors: {}\n".format(len(authors['full_name'].value_counts())))


print("Number of institutions: {}".format(n_authors - len(authors['full_name'])))
authors.pmid.value_counts()
authors.head()


987
Number of pmids: 987
Number of authors: 10480
Number unique authors: 5445

Number of pmids without group names: 985
Removing institutions
Number of authors: 10353
Number unique authors: 5350

Number of institutions: 127


Unnamed: 0,full_name,affiliation,index_authorship,pmid,is_institutions_name
0,SUSANNE JACOBSSON,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,1,30456870,False
1,IRYNA BOIKO,[CLINICAL LABORATORY DEPARTMENT TERNOPIL REGIO...,2,30456870,False
2,DANIEL GOLPARIAN,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,3,30456870,False
3,KAREL BLONDEEL,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,4,30456870,False
4,JAMES KIARIE,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,5,30456870,False


In [13]:
authors

Unnamed: 0,full_name,affiliation,index_authorship,pmid,is_institutions_name
0,SUSANNE JACOBSSON,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,1,30456870,False
1,IRYNA BOIKO,[CLINICAL LABORATORY DEPARTMENT TERNOPIL REGIO...,2,30456870,False
2,DANIEL GOLPARIAN,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,3,30456870,False
3,KAREL BLONDEEL,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,4,30456870,False
4,JAMES KIARIE,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,5,30456870,False
...,...,...,...,...,...
11345,ALEXANDER MADDAMS,[INTERNATIONAL CENTRE FOR EVIDENCE IN DISABILI...,2,33060095,False
11346,HATTIE LOWE,[INTERNATIONAL CENTRE FOR EVIDENCE IN DISABILI...,3,33060095,False
11347,LOWRI DAVIES,[INTERNATIONAL CENTRE FOR EVIDENCE IN DISABILI...,4,33060095,False
11348,RAJAT KHOSLA,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,5,33060095,False


In [14]:
authors = prepare_std_name(authors)
authors.head(2)

Unnamed: 0,full_name,affiliation,index_authorship,pmid,is_institutions_name,last_name,first_letter_name,standard_name
0,SUSANNE JACOBSSON,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,1,30456870,False,JACOBSSON,S,S JACOBSSON
1,IRYNA BOIKO,[CLINICAL LABORATORY DEPARTMENT TERNOPIL REGIO...,2,30456870,False,BOIKO,I,I BOIKO


In [15]:
authors = create_deduplicated_columns(authors)

In [16]:
print("Number of authors: {}".format(len(authors)))
print("Number unique authors (after deduplication): {}\n".format(len(authors['deduplicated_name_std'].value_counts())))

Number of authors: 10353
Number unique authors (after deduplication): 4434



In [17]:
authors

Unnamed: 0,index,full_name,affiliation,index_authorship,pmid,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std
0,0,SUSANNE JACOBSSON,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,1,30456870,False,JACOBSSON,S,S JACOBSSON,SUSANNE JACOBSSON,SUSANNE JACOBSSON,1.0
1,1,IRYNA BOIKO,[CLINICAL LABORATORY DEPARTMENT TERNOPIL REGIO...,2,30456870,False,BOIKO,I,I BOIKO,IRYNA BOIKO,IRYNA BOIKO,1.0
2,2,DANIEL GOLPARIAN,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,3,30456870,False,GOLPARIAN,D,D GOLPARIAN,DANIEL GOLPARIAN,DANIEL GOLPARIAN,1.0
3,3,KAREL BLONDEEL,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,4,30456870,False,BLONDEEL,K,K BLONDEEL,KAREL BLONDEEL,KAREL BLONDEEL,1.0
4,4,JAMES KIARIE,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,5,30456870,False,KIARIE,J,J KIARIE,JAMES N. KIARIE,"JAMES KIARIE, JAMES N. KIARIE, JAMES N KIARIE",3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10348,11345,ALEXANDER MADDAMS,[INTERNATIONAL CENTRE FOR EVIDENCE IN DISABILI...,2,33060095,False,MADDAMS,A,A MADDAMS,ALEXANDER MADDAMS,ALEXANDER MADDAMS,1.0
10349,11346,HATTIE LOWE,[INTERNATIONAL CENTRE FOR EVIDENCE IN DISABILI...,3,33060095,False,LOWE,H,H LOWE,HATTIE LOWE,HATTIE LOWE,1.0
10350,11347,LOWRI DAVIES,[INTERNATIONAL CENTRE FOR EVIDENCE IN DISABILI...,4,33060095,False,DAVIES,L,L DAVIES,LOWRI DAVIES,LOWRI DAVIES,1.0
10351,11348,RAJAT KHOSLA,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,5,33060095,False,KHOSLA,R,R KHOSLA,RAJAT KHOSLA,RAJAT KHOSLA,1.0


In [18]:
len(papers)

1007

In [19]:
sum(papers.authors_list.isna())

20

In [20]:
papers.index = list(papers.pmid)
authors.index = list(authors.pmid)
authors.head()

Unnamed: 0,index,full_name,affiliation,index_authorship,pmid,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std
30456870,0,SUSANNE JACOBSSON,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,1,30456870,False,JACOBSSON,S,S JACOBSSON,SUSANNE JACOBSSON,SUSANNE JACOBSSON,1.0
30456870,1,IRYNA BOIKO,[CLINICAL LABORATORY DEPARTMENT TERNOPIL REGIO...,2,30456870,False,BOIKO,I,I BOIKO,IRYNA BOIKO,IRYNA BOIKO,1.0
30456870,2,DANIEL GOLPARIAN,[WHO COLLABORATING CENTRE FOR GONORRHOEA AND O...,3,30456870,False,GOLPARIAN,D,D GOLPARIAN,DANIEL GOLPARIAN,DANIEL GOLPARIAN,1.0
30456870,3,KAREL BLONDEEL,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,4,30456870,False,BLONDEEL,K,K BLONDEEL,KAREL BLONDEEL,KAREL BLONDEEL,1.0
30456870,4,JAMES KIARIE,[DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARC...,5,30456870,False,KIARIE,J,J KIARIE,JAMES N. KIARIE,"JAMES KIARIE, JAMES N. KIARIE, JAMES N KIARIE",3.0


In [21]:
publication_year = papers#[['pmid', 'publication_year']]#, 'first_author','last_author']]
publication_year = publication_year.join(authors, how="right", lsuffix="_l", rsuffix="_r").reset_index()
print(len(publication_year))
publication_year.head(2)#.affiliation.value_counts()

10353


Unnamed: 0,level_0,pmid_l,title,authors_list,citation,journal_book,publication_year,pmcid,doi,publication_type,...,affiliation,index_authorship,pmid_r,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std
0,13374,13374,"The W.H.O. expanded programme of research, dev...","A KESSLER, C C STANDLEY","Kessler A, Standley CC. The W.H.O. expanded pr...",Proceedings of the Royal Society of London. Se...,1976,,10.1098/rspb.1976.0103,,...,[],1,13374,False,KESSLER,A,A KESSLER,A KESSLER,A KESSLER,1.0
1,13374,13374,"The W.H.O. expanded programme of research, dev...","A KESSLER, C C STANDLEY","Kessler A, Standley CC. The W.H.O. expanded pr...",Proceedings of the Royal Society of London. Se...,1976,,10.1098/rspb.1976.0103,,...,[],2,13374,False,STANDLEY,C,C STANDLEY,C C STANDLEY,C C STANDLEY,1.0


In [22]:
authors_without_affiliation = publication_year[publication_year.affiliation.map(len) == 0]
print(len(authors_without_affiliation),len(authors_without_affiliation)/len(publication_year)*100)

for index, row in authors_without_affiliation.iterrows():
    affiliation = publication_year[(publication_year.deduplicated_name_std == row.deduplicated_name_std) & 
                     ((publication_year.publication_year >= row.publication_year - 0) &
                      (publication_year.publication_year <= row.publication_year + 0)
                     ) & 
                     #(publication_year.affiliation != "[]") &
                     (publication_year.affiliation.map(len) > 0) &
                          (publication_year.affiliation != "Collaborators")]#.affiliation
    if len(affiliation) > 0:
        #print (index, publication_year.iloc[index].publication_year, "\n",affiliation[['publication_year', 'affiliation']])#.iloc[0])
        #print(affiliation.sort_values(by='publication_year', ascending=True).publication_year.iloc[0])
        publication_year.at[index, 'affiliation'] = affiliation.sort_values(by='publication_year', ascending=True).affiliation.iloc[0]
        
authors_without_affiliation = publication_year[publication_year.affiliation.map(len) == 0]
print(len(authors_without_affiliation), len(authors_without_affiliation)/len(publication_year)*100)

4303 41.56283202936347
3632 35.08161885443833


In [23]:
authors = publication_year
authors.rename(columns={"pmid_l": "pmid"}, inplace=True)

In [24]:
remap_affiliation(authors)
#authors.head()

|████████████████████████████████████████| 10353/10353 [100%] in 15.3s (674.81/s)                                       %] in 1s (2071.5/s, eta: 4s)  ▅▃▁ 2379/10353 [23%] in 1s (1648.3/s, eta: 5s) ▄▂▂ 2429/10353 [23%] in 2s (1563.9/s, eta: 5s) ▂▂▄ 2493/10353 [24%] in 2s (1425.7/s, eta: 5s)  ▃▅▇ 2612/10353 [25%] in 2s (1296.7/s, eta: 6s) 2990/10353 [29%] in 3s (1119.4/s, eta: 7s) 3607/10353 [35%] in 4s (1018.9/s, eta: 7s)  ▅▇▇ 3809/10353 [37%] in 4s (952.3/s, eta: 7s) (891.3/s, eta: 7s)  ▄▂▂ 4327/10353 [42%] in 5s (876.7/s, eta: 7s)  ▄▆█ 6310/10353 [61%] in 8s (811.0/s, eta: 5s) 7284/10353 [70%] in 9s (769.4/s, eta: 4s) (729.1/s, eta: 3s) in 12s (729.0/s, eta: 3s)  ▅▃▁ 9194/10353 [89%] in 13s (707.2/s, eta: 2s) ▄▂▂ 9225/10353 [89%] in 13s (704.2/s, eta: 2s) in 13s (701.6/s, eta: 2s) (680.2/s, eta: 1s) in 15s (685.7/s, eta: 1s) 


In [25]:
for index, aff in authors.iterrows():
    if (aff.pmid in [17308725, 23240739, 27227224]) & (aff.index_authorship==1):
        print(aff.affiliation)
#affiliations_unique.loc[affiliations_unique.original == clean_string("DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND.", True)].reset_index().remap

for index, aff in authors.iterrows():
    if ("DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND." in aff.affiliation) \
    & (aff.pmid in [17308725, 23240739, 27227224]) & (aff.index_authorship==1):
        print(aff.full_name)

['DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND.']
['DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND.']
['DEPARTMENT OF REPRODUCTIVE HEALTH AND RESEARCH (RHR) WORLD HEALTH ORGANIZATION (WHO) GENEVA SWITZERLAND.']
IQBAL H SHAH
MARLEEN TEMMERMAN
A METIN GÜLMEZOGLU


In [26]:
authors.to_csv(path+'deduplicated_authors.csv', encoding="utf-8", index=False)

In [27]:
print("Number of authors: {}".format(len(authors['full_name'])))
print("Number unique authors: {}".format(len(authors['full_name'].value_counts())))

print("Number of authors: {}".format(len(authors['deduplicated_name_std'])))
print("Number unique authors: {}".format(len(authors['deduplicated_name_std'].value_counts())))

Number of authors: 10353
Number unique authors: 5350
Number of authors: 10353
Number unique authors: 4434


In [28]:
gender_list = []
unique_names = authors['deduplicated_name_std'].value_counts().index

with alive_bar(len(unique_names), force_tty=True) as bar:
    for author in unique_names:
        list_names_variation = get_list_names(authors, author)
        author_guessed_gender = guess_gender_in_list(list_names_variation)
        
        ## For last name
        if author_guessed_gender == 'unknown':            
            author_guessed_gender = guess_gender_in_list(list_names_variation, last=True)
        
        authors.loc[authors['deduplicated_name_std'] == author, 'gender'] = author_guessed_gender
        gender_list += [author_guessed_gender]
        #print(author, author_guessed_gender)
        bar()
        

|████████████████████████████████████████| 4434/4434 [100%] in 21.2s (209.56/s)                                         , eta: 23s) (183.2/s, eta: 22s) 557/4434 [13%] in 3s (183.5/s, eta: 21s)  ▆▄▂ 607/4434 [14%] in 3s (184.8/s, eta: 21s)  ▇▇▅ 741/4434 [17%] in 4s (188.7/s, eta: 20s) (189.2/s, eta: 19s) ▃▁▃ 1916/4434 [43%] in 10s (197.4/s, eta: 13s)  2027/4434 [46%] in 10s (197.7/s, eta: 12s)  ▄▂▂ 2077/4434 [47%] in 11s (197.4/s, eta: 12s)  ▃▅▇ 2147/4434 [48%] in 11s (198.2/s, eta: 12s)  ▅▇▇ 2338/4434 [53%] in 12s (197.6/s, eta: 11s) in 15s (198.5/s, eta: 7s) 3273/4434 [74%] in 16s (201.4/s, eta: 6s) ▂▄▆ 3500/4434 [79%] in 17s (204.0/s, eta: 5s)  ▅▃▁ 3855/4434 [87%] in 19s (206.5/s, eta: 3s) 3928/4434 [89%] in 19s (207.4/s, eta: 3s)  ▇▇▅ 4057/4434 [91%] in 19s (210.0/s, eta: 2s)  4261/4434 [96%] in 20s (210.9/s, eta: 1s) 


In [29]:
authors['constant'] =["N={}".format(len(authors)) for i in authors['gender']]
tabelaEstiloArtigoCategoricas(authors, 'constant', 'gender', probabilidade='col')

  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))
  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))


constant,N=10353,N=10353,All,All
distribuição,N,%,N,%
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
andy,321,(3.1%),321,(3.1%)
female,4614,(44.57%),4614,(44.57%)
male,3554,(34.33%),3554,(34.33%)
unknown,1864,(18.0%),1864,(18.0%)
All,10353,(100.0%),10353,(100.0%)


In [30]:
unknown_gender_authors = authors[(authors.gender == 'unknown') | (authors.gender == 'andy')]
print("Total of unique names unknown gender: {} ({:.2%})".
      format(len(unknown_gender_authors.deduplicated_name_std.unique()), 
            len(unknown_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names unknown gender: {} ({:.2%})\n".format(len(unknown_gender_authors.deduplicated_name_std),
            len(unknown_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))

known_gender_authors = authors[authors.gender != 'unknown']
print("Total of unique names suggested gender: {} ({:.2%})".
      format(len(known_gender_authors.deduplicated_name_std.unique()), 
            len(known_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names suggested gender: {} ({:.2%})".format(len(known_gender_authors.deduplicated_name_std),
            len(known_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))

Total of unique names unknown gender: 1149 (25.91%)
Total of names unknown gender: 2185 (21.10%)

Total of unique names suggested gender: 3496 (78.85%)
Total of names suggested gender: 8489 (82.00%)


In [31]:
#len(unknown_gender_authors.value_counts())
#unknown_gender_authors.deduplicated_name_std.value_counts()

In [32]:
names_checked = pd.read_csv(path + '..\\01_names_checked_v2.csv')
names_checked.deduplicated_name_std = names_checked.deduplicated_name_std.str.title()
#names_checked.head(5)
names_checked.gender.value_counts()

female     367
male       315
unknown    305
andy        15
Name: gender, dtype: int64

In [33]:
for index, row in unknown_gender_authors.iterrows():
    gender = names_checked[names_checked.deduplicated_name_std == row.deduplicated_name_std.title()]
    if len(gender) > 0:
        #print(row.deduplicated_name_std.title())
        authors.loc[index, 'gender'] = gender.iloc[0].gender
        #print(row.deduplicated_name_std.title(), gender)

In [34]:
unknown_gender_authors.head(2)
unknown_gender_authors[unknown_gender_authors.deduplicated_name_std == "QIAN LONG"]

Unnamed: 0,level_0,pmid,title,authors_list,citation,journal_book,publication_year,pmcid,doi,publication_type,...,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std,countries,gender,constant
4024,27885772,27885772,Clinical practice patterns on the use of magne...,"Q LONG, O T OLADAPO, S LEATHERSICH, J P VOGEL,...","Long Q, Oladapo OT, Leathersich S, et al. Clin...",BJOG : an international journal of obstetrics ...,2017,PMC5697690,10.1111/1471-0528.14400,,...,False,LONG,Q,Q LONG,QIAN LONG,"QIAN LONG, Q LONG",2.0,[SWITZERLAND],andy,N=10353
4371,28588944,28588944,Onsite midwife-led birth units (OMBUs) for car...,"QIAN LONG, EMMA R ALLANSON, JENNIFER PONTRE, Ö...","Long Q, Allanson ER, Pontre J, Tunçalp Ö, Hofm...",BMJ global health,2016,PMC5321346,10.1136/bmjgh-2016-000096,,...,False,LONG,Q,Q LONG,QIAN LONG,"QIAN LONG, Q LONG",2.0,[SWITZERLAND],andy,N=10353
5043,29081997,29081997,Implementation of effective practices in healt...,"EMMA R ALLANSON, ÖZGE TUNÇALP, JOSHUA P VOGEL,...","Allanson ER, Tunçalp Ö, Vogel JP, et al. Imple...",BMJ global health,2017,PMC5656132,10.1136/bmjgh-2016-000266,,...,False,LONG,Q,Q LONG,QIAN LONG,"QIAN LONG, Q LONG",2.0,[SWITZERLAND],andy,N=10353
6917,31157410,31157410,Alternative Magnesium Sulfate Dosing Regimens ...,"LIHONG DU, LARISSA A WENNING, BRENDAN CARVALHO...","Du L, Wenning LA, Carvalho B, et al. Alternati...",Journal of clinical pharmacology,2019,PMC6790709,10.1002/jcph.1448,"Research Support, Non-U.S. Gov't",...,False,LONG,Q,Q LONG,QIAN LONG,"QIAN LONG, Q LONG",2.0,"[SWITZERLAND, CHINA]",andy,N=10353


In [35]:
authors[authors.deduplicated_name_std == "QIAN LONG"]

Unnamed: 0,level_0,pmid,title,authors_list,citation,journal_book,publication_year,pmcid,doi,publication_type,...,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std,countries,gender,constant
4024,27885772,27885772,Clinical practice patterns on the use of magne...,"Q LONG, O T OLADAPO, S LEATHERSICH, J P VOGEL,...","Long Q, Oladapo OT, Leathersich S, et al. Clin...",BJOG : an international journal of obstetrics ...,2017,PMC5697690,10.1111/1471-0528.14400,,...,False,LONG,Q,Q LONG,QIAN LONG,"QIAN LONG, Q LONG",2.0,[SWITZERLAND],female,N=10353
4371,28588944,28588944,Onsite midwife-led birth units (OMBUs) for car...,"QIAN LONG, EMMA R ALLANSON, JENNIFER PONTRE, Ö...","Long Q, Allanson ER, Pontre J, Tunçalp Ö, Hofm...",BMJ global health,2016,PMC5321346,10.1136/bmjgh-2016-000096,,...,False,LONG,Q,Q LONG,QIAN LONG,"QIAN LONG, Q LONG",2.0,[SWITZERLAND],female,N=10353
5043,29081997,29081997,Implementation of effective practices in healt...,"EMMA R ALLANSON, ÖZGE TUNÇALP, JOSHUA P VOGEL,...","Allanson ER, Tunçalp Ö, Vogel JP, et al. Imple...",BMJ global health,2017,PMC5656132,10.1136/bmjgh-2016-000266,,...,False,LONG,Q,Q LONG,QIAN LONG,"QIAN LONG, Q LONG",2.0,[SWITZERLAND],female,N=10353
6917,31157410,31157410,Alternative Magnesium Sulfate Dosing Regimens ...,"LIHONG DU, LARISSA A WENNING, BRENDAN CARVALHO...","Du L, Wenning LA, Carvalho B, et al. Alternati...",Journal of clinical pharmacology,2019,PMC6790709,10.1002/jcph.1448,"Research Support, Non-U.S. Gov't",...,False,LONG,Q,Q LONG,QIAN LONG,"QIAN LONG, Q LONG",2.0,"[SWITZERLAND, CHINA]",female,N=10353


In [36]:
authors.gender.replace('andy', 'unknown', inplace=True)
tabelaEstiloArtigoCategoricas(authors, 'constant', 'gender', probabilidade='col')

  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))
  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))


constant,N=10353,N=10353,All,All
distribuição,N,%,N,%
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,5134,(49.59%),5134,(49.59%)
male,4280,(41.34%),4280,(41.34%)
unknown,939,(9.07%),939,(9.07%)
All,10353,(100.0%),10353,(100.0%)


In [37]:
authors.head(2)

Unnamed: 0,level_0,pmid,title,authors_list,citation,journal_book,publication_year,pmcid,doi,publication_type,...,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std,countries,gender,constant
0,13374,13374,"The W.H.O. expanded programme of research, dev...","A KESSLER, C C STANDLEY","Kessler A, Standley CC. The W.H.O. expanded pr...",Proceedings of the Royal Society of London. Se...,1976,,10.1098/rspb.1976.0103,,...,False,KESSLER,A,A KESSLER,A KESSLER,A KESSLER,1.0,[],male,N=10353
1,13374,13374,"The W.H.O. expanded programme of research, dev...","A KESSLER, C C STANDLEY","Kessler A, Standley CC. The W.H.O. expanded pr...",Proceedings of the Royal Society of London. Se...,1976,,10.1098/rspb.1976.0103,,...,False,STANDLEY,C,C STANDLEY,C C STANDLEY,C C STANDLEY,1.0,[],unknown,N=10353


In [38]:
names_checked[names_checked.deduplicated_name_std == "I K WARRINER".title()]

Unnamed: 0,deduplicated_name_std,first_name,gender
429,I K Warriner,I,female


In [39]:
names_checked.tail()

Unnamed: 0,deduplicated_name_std,first_name,gender
997,Z Fekete,Z,unknown
998,Zafiro Andrade Romo,ZAFIRO,male
999,C Abouzahr,C,female
1000,Bidia D Deperthes,BIDIA,female
1001,Qian Long,QIAN,female


In [40]:
authors[authors.deduplicated_name_std == "I K WARRINER"]#, 'affiliation']

Unnamed: 0,level_0,pmid,title,authors_list,citation,journal_book,publication_year,pmcid,doi,publication_type,...,is_institutions_name,last_name,first_letter_name,standard_name,deduplicated_name_std,names_variation_std,n_variacoes_std,countries,gender,constant
419,17141703,17141703,Rates of complication in first-trimester manua...,"I K WARRINER, O MEIRIK, M HOFFMAN, C MORRONI, ...","Warriner IK, Meirik O, Hoffman M, et al. Rates...","Lancet (London, England)",2006,,10.1016/S0140-6736(06)69742-0,"Multicenter Study,Randomized Controlled Trial,...",...,False,WARRINER,I,I WARRINER,I K WARRINER,I K WARRINER,1.0,[SWITZERLAND],female,N=10353
5465,29246235,29246235,Comparative satisfaction of receiving medical ...,"ANAND TAMANG, IQBAL H SHAH, PRAGYA SHRESTHA, I...","Tamang A, Shah IH, Shrestha P, et al. Comparat...",Reproductive health,2017,PMC5732435,10.1186/s12978-017-0438-7,Randomized Controlled Trial,...,False,WARRINER,I,I WARRINER,I K WARRINER,I K WARRINER,1.0,[UNITED STATES OF AMERICA],female,N=10353


In [41]:
unique_authors = authors[["deduplicated_name_std", "gender", "countries"]].drop_duplicates(subset=["deduplicated_name_std"])
unique_authors['constant'] =["N={}".format(len(unique_authors)) for i in unique_authors['gender']]
tabelaEstiloArtigoCategoricas(unique_authors, 'constant', 'gender', probabilidade='col')

  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))
  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))


constant,N=4434,N=4434,All,All
distribuição,N,%,N,%
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,2112,(47.63%),2112,(47.63%)
male,1633,(36.83%),1633,(36.83%)
unknown,689,(15.54%),689,(15.54%)
All,4434,(100.0%),4434,(100.0%)


In [42]:
unique_authors

Unnamed: 0,deduplicated_name_std,gender,countries,constant
0,A KESSLER,male,[],N=4434
1,C C STANDLEY,unknown,[],N=4434
2,L S PERSIANINOV,unknown,[],N=4434
3,I A MANUILOVA,unknown,[],N=4434
4,HOWARD S FRIEDMAN,male,[SWITZERLAND],N=4434
...,...,...,...,...
10338,ISHA BERRY,male,[CANADA],N=4434
10340,KALONDE MALAMA,unknown,[CANADA],N=4434
10341,HOLLY DONKERS,female,[CANADA],N=4434
10344,JEEVA JOHN,male,[UNITED KINGDOM],N=4434


In [43]:
unknown_gender_authors = authors[authors.gender == 'unknown']
#authors.gender.replace('andy', 'unknown', inplace=True)
#authors.gender.replace('unknown', np.nan, inplace=True)

authors.to_csv(path+'authors_gender.csv', encoding="utf-8", index=False)

In [44]:
authors.gender.value_counts()

female     5134
male       4280
unknown     939
Name: gender, dtype: int64

In [45]:
authors.pmid.value_counts()

29178847    244
28643241    229
34273300    181
34475107    169
32444432    168
           ... 
1390784       1
30734448      1
1396266       1
23240739      1
17531612      1
Name: pmid, Length: 985, dtype: int64

In [46]:
print(authors.gender.isna().sum())
tabelaEstiloArtigoCategoricas(authors, 'constant', 'gender', probabilidade='col')

0


  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))
  V = numpy.sqrt(test_val / (n * min((num_row - 1), (num_col - 1))))


constant,N=10353,N=10353,All,All
distribuição,N,%,N,%
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,5134,(49.59%),5134,(49.59%)
male,4280,(41.34%),4280,(41.34%)
unknown,939,(9.07%),939,(9.07%)
All,10353,(100.0%),10353,(100.0%)


In [47]:
unknown_gender_authors = authors[authors.gender == np.nan]
print("Total of unique names unknown gender: {} ({:.2%})".
      format(len(unknown_gender_authors.deduplicated_name_std.unique()), 
            len(unknown_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names unknown gender: {} ({:.2%})\n".format(len(unknown_gender_authors.deduplicated_name_std),
            len(unknown_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))

known_gender_authors = authors[authors.gender != 'unknown']
print("Total of unique names suggested gender: {} ({:.2%})".
      format(len(known_gender_authors.deduplicated_name_std.unique()), 
            len(known_gender_authors.deduplicated_name_std.unique())/len(authors.deduplicated_name_std.unique())))
print("Total of names suggested gender: {} ({:.2%})".format(len(known_gender_authors.deduplicated_name_std),
            len(known_gender_authors.deduplicated_name_std)/len(authors.deduplicated_name_std)))


Total of unique names unknown gender: 0 (0.00%)
Total of names unknown gender: 0 (0.00%)

Total of unique names suggested gender: 3745 (84.46%)
Total of names suggested gender: 9414 (90.93%)


In [48]:
unknown_gender_authors = authors[authors.gender == 'unknown'].groupby('deduplicated_name_std')[['pmid']].count().sort_values(by='pmid', ascending=False)
unknown_gender_authors['first_name_is_letter'] = [1 if len(first_name.split(' ')[0]) == 1 else 0 for first_name in unknown_gender_authors.index]
unknown_gender_authors['first_name'] = [first_name.split(' ')[0] for first_name in unknown_gender_authors.index]


unknown_gender_authors.to_csv(path + "author_unknown_gender.csv", encoding="utf-8")
unknown_gender_authors.head()

Unnamed: 0_level_0,pmid,first_name_is_letter,first_name
deduplicated_name_std,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B M LANDGREN,13,1,B
E JOHANNISSON,11,1,E
SOO DOWNE,10,0,SOO
PING TERESA YEH,9,0,PING
TRAN MINH THIEN NGO,7,0,TRAN


In [49]:
unknown_gender_authors.first_name.value_counts().to_csv("first_name_unknow_gender_count.csv", encoding="utf-8")
unknown_gender_authors#.first_name.value_counts()

Unnamed: 0_level_0,pmid,first_name_is_letter,first_name
deduplicated_name_std,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B M LANDGREN,13,1,B
E JOHANNISSON,11,1,E
SOO DOWNE,10,0,SOO
PING TERESA YEH,9,0,PING
TRAN MINH THIEN NGO,7,0,TRAN
...,...,...,...
J FORTNEY,1,1,J
J GE,1,1,J
J GUEROLA,1,1,J
J HUATUCO,1,1,J


In [50]:
papers
profile = ProfileReport(papers.drop("abstract", axis=1))#, minimal=True)
profile.to_file(path + "papers.html")

Summarize dataset:   0%|          | 0/32 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [51]:
authors
profile = ProfileReport(authors, minimal=True)
profile.to_file(path + "authors.html")

Summarize dataset:   0%|          | 0/43 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [52]:
profile = ProfileReport(authors)#, minimal=True)
profile.to_file(path + "authors_max.html")

profile = ProfileReport(authors[authors.index_authorship > 0], minimal=True)
profile.to_file(path + "authors_with_index.html")

profile = ProfileReport(authors[authors.index_authorship == 0], minimal=True)
profile.to_file(path + "colaborators.html")

Summarize dataset:   0%|          | 0/48 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/44 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/44 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [53]:
unknown_gender_authors

profile = ProfileReport(unknown_gender_authors, minimal=True)
profile.to_file(path + "unknown_gender_authors.html")

Summarize dataset:   0%|          | 0/12 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]