In [1]:
import csv
import json
import time
from os import makedirs
from os.path import exists, join
import pandas as pd
import numpy as np

import gender_guesser.detector as gender
d = gender.Detector()

import requests
import re

from pandas_profiling import ProfileReport

try: 
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

In [2]:
# Folder that holds the input PMID list and receives the profiling report.
# Set the HRP_DATA_DIR environment variable to run the notebook on another
# machine; the original author's Dropbox folder remains the default.
import os

path = os.environ.get('HRP_DATA_DIR') or 'C:\\Users\\livia\\Dropbox\\HRP Alliance authorship paper\\Data 2022-04-14\\'
# Other cells build file names with `path + name`, so a trailing separator is required.
if not path.endswith(('\\', '/')):
    path += os.sep


def get_title(parsed_html):
    """Return the article title found in a parsed page.

    Looks up the first element whose class is ``heading-title`` and returns
    its text with surrounding whitespace removed. Returns an empty string
    when no such element is found.
    """
    heading = parsed_html.find(attrs={'class': 'heading-title'})
    if not heading:
        return ""
    return heading.get_text().strip()

def get_authors_list(parsed_html):
    """Build a table with one row per author listed on a parsed article page.

    Parameters
    ----------
    parsed_html : parsed page object
        Must expose ``find``; authors are read from the ``span`` elements of
        class ``authors-list-item`` inside the element with id
        ``full-view-heading``.

    Returns
    -------
    pandas.DataFrame
        Columns ``full_name``, ``affiliation`` (list of the ``title``
        attributes of the author's ``affiliation-link`` elements),
        ``index_authorship`` (1-based position in the author list) and
        ``pmid`` (text of the ``strong.current-id`` element, stripped).
        The frame is empty, but keeps these columns, when the page has no
        ``full-view-heading`` element.
    """
    columns = ['full_name', 'affiliation', 'index_authorship', 'pmid']

    heading = parsed_html.find(attrs={'id': 'full-view-heading'})
    if not heading:
        print("No 'full-view-heading' element found; returning an empty author list")
        return pd.DataFrame(columns=columns)

    # A missing identifier or name element yields "" instead of aborting the scrape.
    pmid_tag = parsed_html.find('strong', attrs={'class': 'current-id'})
    pmid = pmid_tag.text.strip() if pmid_tag else ""

    # Collect plain records and build the frame once: filling a pre-allocated
    # frame through `df.iloc[i][col] = value` is chained assignment, which
    # pandas does not guarantee to write back to the original frame.
    records = []
    authors = heading.find_all('span', attrs={'class': 'authors-list-item'})
    for position, author in enumerate(authors, start=1):
        name_tag = author.find(attrs={'class': 'full-name'})
        affiliation_links = author.find_all(attrs={'class': 'affiliation-link'})
        records.append({
            'full_name': name_tag.text if name_tag else "",
            'affiliation': [link['title'] for link in affiliation_links],
            'index_authorship': position,
            'pmid': pmid,
        })

    return pd.DataFrame(records, columns=columns)

def get_last_author_before_instituion(list_authors):
    """Return the last listed author that does not look like a group or institution.

    A name is treated as a group when it contains, case-insensitively, any of
    the keywords below. The list is scanned from the end towards the start so
    that several consecutive trailing group entries are all skipped; the scan
    never goes past the first entry, which is returned when every later entry
    looks like a group.

    Parameters
    ----------
    list_authors : sequence of str (e.g. a pandas Series of full names)

    Returns
    -------
    str
        The selected name, or "" when ``list_authors`` is empty.
    """
    group_keywords = ['World Health Organization', 'WHO', 'Research', 'Reproductive', 'Study', 'Health',
                      'GROUP', 'NETWORK', 'Consortium', 'committee', 'all the authors',
                      'for IeDEA-Southern Africa', 'Systematic', 'collaborations', 'Organizacion',
                      'College', 'Association', 'Survey', 'Expert', 'de la Salud', 'Control']
    # Upper-case the keywords once instead of once per comparison.
    upper_keywords = [keyword.upper() for keyword in group_keywords]

    names = list(list_authors)
    if not names:
        return ""

    def _looks_like_group(name):
        upper_name = name.upper()
        return any(keyword in upper_name for keyword in upper_keywords)

    # Walk backwards over everything except the first entry.
    for name in reversed(names[1:]):
        if not _looks_like_group(name):
            return name
    return names[0]

def get_citation(pmid, timeout=30, retry_delay=60):
    """Fetch the AMA-style citation string for an article.

    Requests ``https://pubmed.ncbi.nlm.nih.gov/<pmid>/citations/`` and returns
    the ``['ama']['orig']`` entry of the JSON body.

    Parameters
    ----------
    pmid : value formatted into the URL (article identifier)
    timeout : seconds to wait for the server on each attempt
    retry_delay : seconds to sleep before the single retry

    Raises
    ------
    requests.exceptions.RequestException
        If the retry also fails.
    ValueError, KeyError
        If the response is not JSON or lacks the ``ama``/``orig`` entries.
    """
    url = 'https://pubmed.ncbi.nlm.nih.gov/{}/citations/'.format(pmid)
    # The context manager closes the session (and its pooled connections)
    # instead of leaking one session per call.
    with requests.Session() as s:
        try:
            response = s.get(url, timeout=timeout)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # One retry after a pause; without a timeout a stalled connection
            # would block the whole scrape indefinitely.
            time.sleep(retry_delay)
            response = s.get(url, timeout=timeout)
    return json.loads(response.text)['ama']['orig']


def get_journal_book(parsed_html):
    """Return the journal/book name shown on a parsed page.

    Reads the ``title`` attribute of the element with id
    ``full-view-journal-trigger`` and strips surrounding whitespace.
    Returns an empty string when that element is not found.
    """
    trigger = parsed_html.find(attrs={'id': 'full-view-journal-trigger'})
    if trigger:
        return trigger['title'].strip()
    return ""

def get_publication_year(parsed_html):
    """Return the publication year as a 4-digit string.

    Takes the first run of four digits in the ``content`` attribute of the
    ``<meta name="citation_date">`` element. Returns an empty string when the
    element, its ``content`` attribute, or a 4-digit run is missing, so a
    page without a date does not abort the caller.
    """
    meta = parsed_html.find('meta', attrs={'name': 'citation_date'})
    if meta is None:
        return ""
    match = re.search(r'[0-9]{4}', meta.get('content') or "")
    return match.group(0) if match else ""

def get_doi(parsed_html):
    """Return the DOI declared on a parsed page.

    Reads the ``content`` attribute of the ``<meta name="citation_doi">``
    element. Returns an empty string when the element or the attribute is
    missing, so a page without a DOI does not abort the caller.
    """
    meta = parsed_html.find('meta', attrs={'name': 'citation_doi'})
    if meta is None:
        return ""
    return meta.get('content') or ""

def get_pmc_id(parsed_html):
    """Return the PMC identifier shown on a parsed page.

    Takes the stripped text of the first ``a`` element inside the ``span``
    whose class is ``identifier pmc``. Returns an empty string when either
    the span or the link is not found.
    """
    pmc_span = parsed_html.find('span', attrs={'class': 'identifier pmc'})
    pmc_link = pmc_span.find('a') if pmc_span else None
    if not pmc_link:
        return ''
    return pmc_link.text.strip()

def gess_gender_author(name):
    """Guess an author's gender from the first word of their full name.

    Everything before the first space in ``name`` is passed to the
    module-level gender detector ``d``, and its answer is returned unchanged.
    """
    given_name, _, _ = name.partition(' ')
    return d.get_gender(given_name)

def pub_type(parsed_html):
    """List the publication types declared on a parsed page.

    Returns the ``data-ga-label`` attribute of every ``a`` element whose
    ``data-ga-action`` attribute is ``pub_type_link``, in the order returned
    by ``find_all``.
    """
    type_links = parsed_html.find_all('a', attrs={'data-ga-action': 'pub_type_link'})
    return [link['data-ga-label'] for link in type_links]

def is_systematic_review(list_pub_types):
    """Return 1 when 'Systematic Review' is in ``list_pub_types``, else 0.

    The check uses ``in``, so a list is tested by membership and a string is
    tested by substring.
    """
    return int('Systematic Review' in list_pub_types)


def get_parsed_html_from_pmid(PMID, timeout=30, retry_delay=60):
    """Download and parse the PubMed page of one article.

    Parameters
    ----------
    PMID : value formatted into ``https://pubmed.ncbi.nlm.nih.gov/<PMID>/``
    timeout : seconds to wait for the server on each attempt
    retry_delay : seconds to sleep before the single retry

    Returns
    -------
    BeautifulSoup or False
        The parsed page, or ``False`` (after printing the URL) when the page
        contains an element of class ``usa-grid error-page``.

    Raises
    ------
    requests.exceptions.RequestException
        If the retry also fails.
    """
    url = 'https://pubmed.ncbi.nlm.nih.gov/{}/'.format(PMID)
    # The context manager closes the session instead of leaking one per call.
    with requests.Session() as s:
        try:
            response = s.get(url, timeout=timeout)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # One retry after a pause; the timeout keeps a stalled connection
            # from blocking the whole scrape.
            time.sleep(retry_delay)
            response = s.get(url, timeout=timeout)

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, so results could differ between machines.
    parsed_html = BeautifulSoup(response.text, 'html.parser')
    if parsed_html.find(attrs={'class': 'usa-grid error-page'}):
        # Report which article could not be loaded and signal failure to the caller.
        print(url)
        return False
    return parsed_html


In [4]:
# Coding notes kept from the original cell (they read like headers copied from
# a tracking sheet; TODO confirm against that sheet):
#   Link to open access publication - "Type of publication: Systematic review=1; Scientific paper=2"
#   HRP thematic area - "Type of support: Direct support=1; Indirect support=2"

# One row per PMID listed in the input file; every field starts out empty (NaN).
pmid_list = pd.read_csv(path + 'pmid_list_v3.txt')
# Each name needs its own comma: adjacent string literals are silently
# concatenated, which previously merged 'fa_is_staff' and 'last_author' into a
# single bogus column.
columns = ['pmid', 'title', 'authors_list', 'citation', 'journal_book', 'publication_year', 'create_date',
           'pmcid', 'nih_ms', 'doi', 'publication_type', 'hrp_thematic_area', 'type_support',
           'first_author', 'fa_country', 'fa_gender', 'fa_is_hub', 'fa_is_staff',
           'last_author', 'la_country', 'la_gender', 'la_is_hub', 'la_is_staff',
           # Filled by the scraping loop; declared here so the schema does not
           # depend on at least one page being fetched successfully.
           'is_systematic_review']
database = pd.DataFrame(columns=columns, index=pmid_list['PMID'])



In [5]:
# Scrape every article in `pmid_list`: fill one row of `database` per PMID and
# accumulate one row per author in `complete_list_authors`.
author_frames = []
# Start from an empty frame with the author columns so the checkpoint below can
# be written even when the first pages cannot be fetched (previously this was
# None and `.to_csv` failed in that case).
complete_list_authors = pd.DataFrame(columns=['full_name', 'affiliation', 'index_authorship', 'pmid'])

for pmid in pmid_list['PMID']:
    parsed_html = get_parsed_html_from_pmid(pmid)

    if parsed_html:
        authors_df = get_authors_list(parsed_html)
        if len(authors_df) > 0:
            author_frames.append(authors_df)
            complete_list_authors = pd.concat(author_frames)

        list_authors = authors_df['full_name']
        # Keep the list form so the systematic-review check is an exact
        # membership test rather than a substring match on the joined text.
        publication_types = pub_type(parsed_html)
        first_author = list_authors.iloc[0] if len(list_authors) > 0 else ""
        last_author = get_last_author_before_instituion(list_authors)

        database.loc[pmid, 'title'] = get_title(parsed_html)
        database.loc[pmid, 'authors_list'] = ", ".join(list_authors)
        database.loc[pmid, 'citation'] = get_citation(pmid)
        database.loc[pmid, 'journal_book'] = get_journal_book(parsed_html)
        database.loc[pmid, 'publication_year'] = get_publication_year(parsed_html)
        database.loc[pmid, 'pmcid'] = get_pmc_id(parsed_html)
        database.loc[pmid, 'doi'] = get_doi(parsed_html)
        database.loc[pmid, 'publication_type'] = ",".join(publication_types)
        database.loc[pmid, 'is_systematic_review'] = is_systematic_review(publication_types)
        database.loc[pmid, 'first_author'] = first_author
        database.loc[pmid, 'fa_gender'] = gess_gender_author(first_author)
        database.loc[pmid, 'last_author'] = last_author
        database.loc[pmid, 'la_gender'] = gess_gender_author(last_author)
        # Not filled by this loop (left as NaN): create_date, nih_ms,
        # hrp_thematic_area, type_support, fa_country, fa_is_hub, fa_is_staff,
        # la_country, la_is_hub, la_is_staff, key_words.
    # When the page could not be loaded, the PMID's row simply keeps its NaN values.

    # Checkpoint after every article so an interrupted run keeps its progress.
    database.to_csv('artigos_partial_HRP_3.csv', encoding='utf-8')
    complete_list_authors.to_csv('authors_partial_3.csv', encoding='utf-8', index=False)

database.to_csv('artigos_HRP_v3.csv', encoding='utf-8')
complete_list_authors.to_csv('authors_v3.csv', encoding='utf-8', index=False)

https://pubmed.ncbi.nlm.nih.gov/34594989/


In [69]:

# Build a profiling report of the scraped table in minimal mode and save it as
# an HTML file in the data folder.
report_file = path + "papers.html"
profile = ProfileReport(database, minimal=True)
profile.to_file(report_file)

Summarize dataset:   0%|          | 0/33 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]