This notebook downloads the current PubMed baseline data and stores it in a Google Drive.

There should be no reason to run this code unless you wish to download PubMed to your own Google Drive folder or locally.

In [1]:
import urllib
import re
import shutil
import contextlib

from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient import discovery
from googleapiclient import errors
from googleapiclient.http import MediaFileUpload

In [2]:
Return = urllib.request.urlopen('ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline')
Text = str(Return.read())

In [3]:
Text.split('\\n')

["b'-r--r--r--   1 ftp      anonymous     4187 Mar 23 16:50 README.txt\\r",
 '-r--r--r--   1 ftp      anonymous 19593618 Dec 16 11:47 pubmed20n0001.xml.gz\\r',
 '-r--r--r--   1 ftp      anonymous       60 Dec 16 11:47 pubmed20n0001.xml.gz.md5\\r',
 '-r--r--r--   1 ftp      anonymous 17719379 Dec 16 11:47 pubmed20n0002.xml.gz\\r',
 '-r--r--r--   1 ftp      anonymous       60 Dec 16 11:47 pubmed20n0002.xml.gz.md5\\r',
 '-r--r--r--   1 ftp      anonymous 16411608 Dec 16 11:47 pubmed20n0003.xml.gz\\r',
 '-r--r--r--   1 ftp      anonymous       60 Dec 16 11:47 pubmed20n0003.xml.gz.md5\\r',
 '-r--r--r--   1 ftp      anonymous 18055779 Dec 16 11:47 pubmed20n0004.xml.gz\\r',
 '-r--r--r--   1 ftp      anonymous       60 Dec 16 11:47 pubmed20n0004.xml.gz.md5\\r',
 '-r--r--r--   1 ftp      anonymous 17360235 Dec 16 11:48 pubmed20n0005.xml.gz\\r',
 '-r--r--r--   1 ftp      anonymous       60 Dec 16 11:48 pubmed20n0005.xml.gz.md5\\r',
 '-r--r--r--   1 ftp      anonymous 20367276 Dec 16 11:48 pubmed

In [4]:
# This is a little regex that finds all the strings of the form pubmed20n####.xml.gz but that don't end in . (as the .md5 files do).
ZipFiles = re.findall(r"\bpubmed20n\d{4}.xml.gz(?!\.)\b", Text)
print(len(ZipFiles))
ZipFiles

1015


['pubmed20n0001.xml.gz',
 'pubmed20n0002.xml.gz',
 'pubmed20n0003.xml.gz',
 'pubmed20n0004.xml.gz',
 'pubmed20n0005.xml.gz',
 'pubmed20n0006.xml.gz',
 'pubmed20n0007.xml.gz',
 'pubmed20n0008.xml.gz',
 'pubmed20n0009.xml.gz',
 'pubmed20n0010.xml.gz',
 'pubmed20n0011.xml.gz',
 'pubmed20n0012.xml.gz',
 'pubmed20n0013.xml.gz',
 'pubmed20n0014.xml.gz',
 'pubmed20n0015.xml.gz',
 'pubmed20n0016.xml.gz',
 'pubmed20n0017.xml.gz',
 'pubmed20n0018.xml.gz',
 'pubmed20n0019.xml.gz',
 'pubmed20n0020.xml.gz',
 'pubmed20n0021.xml.gz',
 'pubmed20n0022.xml.gz',
 'pubmed20n0023.xml.gz',
 'pubmed20n0024.xml.gz',
 'pubmed20n0025.xml.gz',
 'pubmed20n0026.xml.gz',
 'pubmed20n0027.xml.gz',
 'pubmed20n0028.xml.gz',
 'pubmed20n0029.xml.gz',
 'pubmed20n0030.xml.gz',
 'pubmed20n0031.xml.gz',
 'pubmed20n0032.xml.gz',
 'pubmed20n0033.xml.gz',
 'pubmed20n0034.xml.gz',
 'pubmed20n0035.xml.gz',
 'pubmed20n0036.xml.gz',
 'pubmed20n0037.xml.gz',
 'pubmed20n0038.xml.gz',
 'pubmed20n0039.xml.gz',
 'pubmed20n0040.xml.gz',


In [5]:
import os
import shutil
import glob

In [6]:
OutputFolder = './PubMed/'
if os.path.exists(OutputFolder):
    shutil.rmtree(OutputFolder)
os.mkdir(OutputFolder)

In [7]:
for ZipFile in ZipFiles:
    with contextlib.closing(urllib.request.urlopen('ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/{}'.format(ZipFile))) as r:
        with open(OutputFolder+ZipFile, 'wb') as f:
            shutil.copyfileobj(r, f)

In [8]:
scope = ['https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name('./GoogleDriveCredentials/IntegrationCredentials.json', scope)
drive_service = discovery.build('drive', 'v3', credentials=credentials)

In [9]:
TopLevelFolderID = '1I8uM1gWa9J93au8fuRZ-RTd3n15DnBsi'
PubMedFolderName = 'PubMed_Data'

FoundFolders = []

page_token = None
while True:
    response = drive_service.files().list(q="name = '"+PubMedFolderName+"' and '"+TopLevelFolderID+"' in parents", spaces='drive', 
                                          fields='nextPageToken, files(id, name)', pageToken=page_token).execute()
    for file in response.get('files', []):
        # Process change
        FoundFolders.append(file.get('id'))
    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break

if len(FoundFolders) == 1:
    PubMedFolderID = FoundFolders[0]
else:
    print('Found more than 1 folder with name {}'.format(PubMedFolderName))
    print(FoundFolders)

In [10]:
def UploadFileToDriveFolder(fileName, _folderID, _mimeType,_drive_service):
    file_metadata = {'name': fileName, 'parents': [_folderID]}
    media = MediaFileUpload(fileName,
                            mimetype=_mimeType)
    file = _drive_service.files().create(body=file_metadata,
                                        media_body=media,
                                        fields='id').execute()
    return file.get('id')

In [12]:
for File in glob.glob(OutputFolder+'*'):
    UploadFileToDriveFolder(File, PubMedFolderID, 'application/x-gzip', drive_service)