In [45]:
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, SoupStrainer
import urllib
import os
from argparse import Namespace
from urllib.request import urlopen
import time
from datetime import datetime, timedelta
from dateutil.relativedelta import *

In [46]:
paper_df = pd.DataFrame(columns=["PMID", "TI", "PT", "JT", "DP", "OT", "MH", "RN"])
site_df = pd.DataFrame(columns=['PMID','AUTHOR','UNIVERSITY','DEPARTMENT','CITY','COUNTRY'])

In [47]:
def get_soup(url):
    count = 0
    soup = None
    while soup is None:
        try:
            page = requests.get("https://pubmed.ncbi.nlm.nih.gov/?term=" + url, allow_redirects=False, timeout=100)
            soup = BeautifulSoup(page.content, features="lxml")
        except Exception as e: print("get_soup: ", e) 
        count += 1
        if (count >= 20): break
    return soup

In [48]:
from collections import OrderedDict

def parse_location(string):   
    university, department, city, country = None, None, None, None 
    string = string.replace(".", "").replace("(", "").replace(")", "")
    if " - " in string:
        string = string.split(" - ")[0]
    components = string.strip().split(',')
    components = list(OrderedDict.fromkeys(components))
    components = [item for item in components if not any(char.isdigit() for char in item)]
    if len(components) == 1:
        university = components[0]
    elif len(components) == 2:
        university = components[0]
        country = components[1]
    elif len(components) == 3:
        university = components[0]
        city = components[1]
        country = components[2]
    elif len(components) == 4:
        department = components[0]
        university = components[1]
        city = components[2]
        country = components[3]
    elif len(components) > 4:
        university = components[0]
        city = components[len(components)-2]
        country = components[len(components)-1]

    return university, department, city, country

In [49]:
def write_to_df(records):
    paper = []
    site = []
    desired_fields = ["PMID", "TI", "PT", "JT", "DP", "OT", "MH", "RN"]
    key = ""
    value = ""
    for record in records[:]:
        entry = {}
        lines = record.strip().split('\n')
        pmid = lines[0].strip()
        entry['PMID'] = pmid
        for line in lines[1:]:
            parts = re.split(r'\s*-\s*', line, maxsplit=1)
            if(len(parts) == 2):
                key, value = parts
                value = value.strip()
                if key in desired_fields:
                    try:
                        if (entry[key]):
                            entry[key] = entry[key] + " / " + value
                    except:
                        entry[key] = value
            else:
                if key in desired_fields:
                    line = line.strip()
                    entry[key] = entry[key] + line
        paper.append(entry)
        authors = re.split(r'FAU - ', record)
        for author in authors:
            entry = {}   
            ad_match = re.findall(r'AD  - (.+)', author, flags=re.DOTALL)
            if ad_match:
                line = author.strip().split('\n')
                fau = line[0].strip()
                ad = ' / '.join(ad_match)
                university, department, city, country = parse_location(ad)
                entry['PMID'] = pmid
                entry['AUTHOR'] = fau
                entry['UNIVERSITY'] = ad
                entry['DEPARTMENT'] = ad
                entry['CITY'] = ad
                entry['COUNTRY'] = ad
                site.append(entry)
        records.remove(record)

    paper_df_temp = pd.DataFrame(paper)
    site_df_temp = pd.DataFrame(site)
    return paper_df_temp, site_df_temp

In [50]:
leftovers = []
i = 2
r1 = 0
r2 = 0
try:

    start_date = datetime(2024, 1, 1)
    end_date = datetime(1993, 1, 1)
    current_date = start_date
    
    while current_date >= end_date:
        print(current_date.strftime("%Y-%m-%d"), end = ' ')
        current_date_next = current_date - relativedelta(months=+1)
        
        url1 = "%28%28"+str(current_date_next.year)+"%2F"+str(current_date_next.month)+"%2F"+str(current_date_next.day)+"%5BDate+-+Create%5D+%3A+"+str(current_date.year)+"%2F"+str(current_date.month)+"%2F"+str(current_date.day)+"%5BDate+-+Create%5D%29%29+AND+%28alzheimer%29"
        
        soup = get_soup(url1)
        log_resultcount_tag  = soup.find('meta', {'name': 'log_resultcount'})
        resultcount_value = log_resultcount_tag.get('content')
        resultcount = (int(resultcount_value) // 200)+1
        if (resultcount <= 50):
            r1 = resultcount
            r2 = 1
        elif (resultcount <= 100):
            r1 = 50
            r2 = resultcount - 49
        else: 
            print("******** problem on url: ", url)
            leftovers.append(current_date)
            
        for x in range(1,r1+1):
            url_end = "&size=200&sort=fauth&sort_order=asc&format=pubmed&page="+str(x)
            url= url1 + url_end
            soup = get_soup(url)
            temp_list = re.split(r'\bPMID- ', soup.text)[1:]
            paper_df_temp, site_df_temp = write_to_df(temp_list) 
            paper_df = pd.concat([paper_df, paper_df_temp])
            site_df = pd.concat([site_df, site_df_temp])
        for x in range(1,r2+1):
            url_end = "&size=200&sort=fauth&sort_order=desc&format=pubmed&page="+str(x)
            url= url1 + url_end
            soup = get_soup(url)
            temp_list = re.split(r'\bPMID- ', soup.text)[1:]
            paper_df_temp, site_df_temp = write_to_df(temp_list) 
            paper_df = pd.concat([paper_df, paper_df_temp])
            site_df = pd.concat([site_df, site_df_temp])
            if(site_df.shape[0]> 3000000):
                paper_csv = "paper_" + str(i)+".csv"
                site_csv = "site_" + str(i)+".csv"
                i = i + 1
                paper_df.to_csv(paper_csv, sep=';')
                site_df.to_csv(site_csv, sep=';')
                paper_df = pd.DataFrame(columns=["PMID", "TI", "PT", "JT", "DP", "OT", "MH", "RN"])
                site_df = pd.DataFrame(columns=['PMID','AUTHOR','UNIVERSITY','DEPARTMENT','CITY','COUNTRY']) 
        current_date = current_date_next
except Exception as e: 
    print("*", e)

2024-01-01 2023-12-01 2023-11-01 2023-10-01 2023-09-01 2023-08-01 2023-07-01 2023-06-01 2023-05-01 2023-04-01 2023-03-01 2023-02-01 2023-01-01 2022-12-01 2022-11-01 2022-10-01 2022-09-01 2022-08-01 2022-07-01 2022-06-01 2022-05-01 2022-04-01 2022-03-01 2022-02-01 2022-01-01 2021-12-01 2021-11-01 2021-10-01 2021-09-01 2021-08-01 2021-07-01 2021-06-01 2021-05-01 2021-04-01 2021-03-01 2021-02-01 2021-01-01 2020-12-01 2020-11-01 2020-10-01 2020-09-01 2020-08-01 2020-07-01 2020-06-01 2020-05-01 2020-04-01 2020-03-01 2020-02-01 2020-01-01 2019-12-01 2019-11-01 2019-10-01 2019-09-01 2019-08-01 2019-07-01 2019-06-01 2019-05-01 2019-04-01 2019-03-01 2019-02-01 2019-01-01 2018-12-01 2018-11-01 2018-10-01 2018-09-01 2018-08-01 2018-07-01 2018-06-01 2018-05-01 2018-04-01 2018-03-01 2018-02-01 2018-01-01 2017-12-01 2017-11-01 2017-10-01 2017-09-01 2017-08-01 2017-07-01 2017-06-01 2017-05-01 2017-04-01 2017-03-01 2017-02-01 2017-01-01 2016-12-01 2016-11-01 2016-10-01 2016-09-01 2016-08-01 2016-07-01

In [43]:
paper_df

Unnamed: 0,PMID,TI,PT,JT,DP,OT,MH,RN
0,38070385,New bithiophene derivative attenuated Alzheime...,Journal Article,Journal of trace elements in medicine and biol...,2023 Dec 8,AChE / Antioxidants / GSK3-β / MAO / Neurotran...,,
1,38061270,Unraveling the role of miRNAs in the diagnosis...,Journal Article / Review,"Pathology, research and practice",2023 Dec 4,Alzheimer’s disease / Diagnosis / Mental healt...,,
2,38083415,Effect of Comorbidities Features in Machine Le...,Journal Article,Annual International Conference of the IEEE En...,2023 Jul,,,
3,38087743,Preoperative electroencephalographic alpha-pow...,Journal Article,British journal of anaesthesia,2023 Dec 11,alpha attenuation / attention / delirium / ele...,,
4,38053545,A glance through the effects of CD4(+) T cells...,Journal Article / Review,Computational and structural biotechnology jou...,2023,Alzheimer's disease / Amyloid β-protein / CD4+...,,
...,...,...,...,...,...,...,...,...
195,37961246,Human neural stem cells restore spatial memory...,Preprint,bioRxiv : the preprint server for biology,2023 Nov 4,,,
196,37961679,Regional interneuron transcriptional changes r...,Preprint,bioRxiv : the preprint server for biology,2023 Nov 4,,,
197,37939533,A comparative study of GNN and MLP based machi...,Journal Article,Neural networks : the official journal of the ...,2023 Oct 26,Alzheimer’s Disease (AD) / Data fusion / Graph...,,
198,37931808,Chronic pain accelerates cognitive impairment ...,Journal Article,Brain research bulletin,2023 Dec,Alzheimer's disease / CCL2 / Chronic pain / Co...,,


In [51]:
site_df.shape

(1217152, 6)

In [52]:
paper_df.to_csv("alzheimers_paper", sep=';')
site_df.to_csv("alzheimers_site", sep=';')