In [31]:
from ftplib import FTP
import utils
from pathlib import Path
import xml.etree.ElementTree as ET
import glob

# FTP host that serves the pill image discs.
server='lhcftp.nlm.nih.gov'
# Inclusive range of PillProjectDisc<N> directory numbers to visit.
dirs_from_to=(1, 110)
source_dirs=['Open-Access-Datasets/Pills/PillProjectDisc' + str(x) +'/images' for x in range(dirs_from_to[0], dirs_from_to[1]+1)]

# Local root folder; each remote directory gets a subfolder named after its
# 0-based position in source_dirs (so PillProjectDisc1 -> Dataset/0).
dest_base_dir=Path('Dataset')

with FTP(server) as ftp:
    print('Login into {}'.format(server))
    ftp.login()

    # Remember the starting directory: source_dirs are relative paths, so we
    # must return here before changing into the next one.
    base_dir = ftp.pwd()

    for idx, source_dir in enumerate(source_dirs):
        ftp.cwd(base_dir)
        ftp.cwd(source_dir)

        dest_dir = dest_base_dir / str(idx)
        dest_dir.mkdir(parents=True, exist_ok=True)

        print('Current woring directory:', ftp.pwd())
        print('Downloading tmp xml')
        filename = 'images.xml'
        dest_file = dest_dir / filename
        with open(dest_file, 'wb') as f:
            ftp.retrbinary('RETR ' + filename, f.write)

        # A malformed index must not abort the remaining directories. Report it
        # and move on, in the same way the later cells treat an unparsable
        # images.xml.
        try:
            tree = ET.parse(dest_file)
        except ET.ParseError:
            print('Parse error on {}'.format(dest_file))
            continue
        root = tree.getroot()

        # The image entries are the children of the root's first child.
        se = list(root)[0]
        # Snapshot the children so removing from `se` while looping is safe.
        entries = list(se)

        print('Filtering xml (' + str(len(entries)) + ' images)')
        images = []
        for e in entries:
            layout = e.find('Layout')
            shadow = e.find('RatingShadow')
            # Keep an entry when its Layout is the reference layout string or
            # its RatingShadow is 'Soft'; drop every other entry from the tree.
            if (layout is not None and layout.text == "MC_C3PI_REFERENCE_SEG_V1.6") or \
               (shadow is not None and shadow.text == 'Soft'):
                images.append(e.find('File').find('Name').text)
            else:
                se.remove(e)

        # Overwrite the downloaded index with the filtered version.
        print('saving xml in:', dest_file)
        tree.write(dest_file)

        print("final images:", len(images))

Login into lhcftp.nlm.nih.gov
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc1/images
Downloading tmp xml
Filtering xml (1271 images)
saving xml in: Dataset/0/images.xml
final images: 666
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc2/images
Downloading tmp xml
Filtering xml (1109 images)
saving xml in: Dataset/1/images.xml
final images: 554
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc3/images
Downloading tmp xml
Filtering xml (1283 images)
saving xml in: Dataset/2/images.xml
final images: 646
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc4/images
Downloading tmp xml
Filtering xml (1268 images)
saving xml in: Dataset/3/images.xml
final images: 661
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc5/images
Downloading tmp xml
Filtering xml (1242 images)
saving xml in: Dataset/4/images.xml
final images: 607
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc6

Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc46/images
Downloading tmp xml
Filtering xml (1174 images)
saving xml in: Dataset/45/images.xml
final images: 591
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc47/images
Downloading tmp xml
Filtering xml (1279 images)
saving xml in: Dataset/46/images.xml
final images: 650
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc48/images
Downloading tmp xml
Filtering xml (1281 images)
saving xml in: Dataset/47/images.xml
final images: 637
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc49/images
Downloading tmp xml
Filtering xml (1229 images)
saving xml in: Dataset/48/images.xml
final images: 626
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc50/images
Downloading tmp xml
Filtering xml (1207 images)
saving xml in: Dataset/49/images.xml
final images: 602
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc51/images
Downloading

Filtering xml (1200 images)
saving xml in: Dataset/90/images.xml
final images: 599
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc92/images
Downloading tmp xml
Filtering xml (1164 images)
saving xml in: Dataset/91/images.xml
final images: 561
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc93/images
Downloading tmp xml
Filtering xml (1153 images)
saving xml in: Dataset/92/images.xml
final images: 564
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc94/images
Downloading tmp xml
Filtering xml (1113 images)
saving xml in: Dataset/93/images.xml
final images: 550
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc95/images
Downloading tmp xml
Filtering xml (1230 images)
saving xml in: Dataset/94/images.xml
final images: 603
Current woring directory: /Open-Access-Datasets/Pills/PillProjectDisc96/images
Downloading tmp xml
Filtering xml (1133 images)
saving xml in: Dataset/95/images.xml
final images: 565
Current

In [79]:
# Group the filtered image file names by drug identifier to see how many
# images exist for each drug, and add up the declared file sizes on the way.
dirs = [entry for entry in dest_base_dir.iterdir() if entry.is_dir()]

ids = dict()
expected_size = 0

for disc_dir in dirs:
    xml_path = disc_dir / 'images.xml'
    try:
        tree = ET.parse(xml_path)
    except ET.ParseError:
        print('Parse error on {}'.format(xml_path))
        continue
    se = list(tree.getroot())[0]

    for image in list(se):
        file_node = image.find('File')
        expected_size += int(file_node.find('Size').text)

        # Grouping key is the NDC9 field. Alternatives tried earlier were the
        # lower-cased ProprietaryName and characters 5:9 of NDC11.
        ids.setdefault(image.find('NDC9').text, []).append(file_node.find('Name').text)

# Invert the mapping: number of images -> list of ids having that many images.
sizes = dict()

for ndc, names in ids.items():
    sizes.setdefault(len(names), []).append(ndc)

In [112]:
# Print the images-per-id histogram, largest group size first, followed by
# the overall totals and the summed size of all listed images.
sorted_sizes = sorted(sizes.keys(), reverse=True)

total_ids = 0
total_images = 0
for n_images in sorted_sizes:
    n_ids = len(sizes[n_images])
    print('{:4} ids with {:4} images'.format(n_ids, n_images))
    total_ids += n_ids
    total_images += n_ids * n_images

print('Total ids: {}'.format(total_ids))
print('Total images: {}'.format(total_images))

print(utils.bytes2human(expected_size), 'will be needed to download all the images')

   2 ids with   96 images
   1 ids with   70 images
   1 ids with   64 images
   1 ids with   62 images
   1 ids with   61 images
   5 ids with   60 images
   2 ids with   58 images
   1 ids with   56 images
   1 ids with   54 images
   1 ids with   53 images
   6 ids with   52 images
   1 ids with   51 images
   3 ids with   50 images
   5 ids with   49 images
  18 ids with   48 images
   8 ids with   47 images
   9 ids with   46 images
   3 ids with   45 images
  10 ids with   44 images
   8 ids with   43 images
   9 ids with   42 images
   2 ids with   41 images
  21 ids with   40 images
   5 ids with   39 images
  21 ids with   38 images
  13 ids with   37 images
  23 ids with   36 images
   4 ids with   35 images
  13 ids with   34 images
   5 ids with   33 images
  29 ids with   32 images
   4 ids with   31 images
  30 ids with   30 images
   5 ids with   29 images
  42 ids with   28 images
   6 ids with   27 images
  17 ids with   26 images
   1 ids with   25 images
 359 ids wit

In [120]:
# Download only the images of the ids that have exactly 10 images each.
ids_to_download = sizes[10]
# Set to False to walk the directories without transferring any image.
download_imgs = True

# The ids were dictionary keys, hence hashable: use a set so the membership
# test done for every XML entry does not scan a list.
wanted_ids = set(ids_to_download)

with FTP(server) as ftp:
    print('Login into {}'.format(server))
    ftp.login()

    # source_dirs are relative paths, so go back here before each cwd.
    base_dir = ftp.pwd()

    for idx, source_dir in enumerate(source_dirs):
        ftp.cwd(base_dir)
        ftp.cwd(source_dir)

        dest_dir = dest_base_dir / str(idx)

        print(dest_dir)

        # Skip directories whose filtered index cannot be parsed.
        try:
            tree = ET.parse(dest_dir / 'images.xml')
        except ET.ParseError:
            print('Parse error on {}'.format(dest_dir / 'images.xml'))
            continue
        se = list(tree.getroot())[0]

        # File names of the entries whose NDC9 is one of the selected ids.
        images = [
            e.find('File').find('Name').text
            for e in list(se)
            if e.find('NDC9').text in wanted_ids
        ]

        # downloading
        if download_imgs:
            for i, img in enumerate(images):
                dest_file = dest_dir / img
                if dest_file.exists():
                    print(img, 'already downloaded!')
                    continue
                # Write to a side file and rename only after the transfer has
                # finished. Otherwise an interrupted download leaves a
                # truncated image under its final name, which the exists()
                # check above would then report as already downloaded.
                part_file = dest_dir / (img + '.part')
                with open(part_file, 'wb') as f:
                    print('\rDownloading {:3}/{}'.format(i + 1, len(images)), end='')
                    ftp.retrbinary('RETR ' + img, f.write)
                part_file.replace(dest_file)
            print()

Login into lhcftp.nlm.nih.gov
Dataset/0
!-KLWKBWJ4Q-1D04O_12TR3EA3VEOY.JPG already downloaded!
!-ZDJM0JWT-CQBM-89Y50YK9XHRBWP.JPG already downloaded!
!2G2E_WL!JXB7483IW-QKAZ!IUYVWI.PNG already downloaded!
!AY5LU34QY0RNYA_ZJZED3EV0OOJ92.PNG already downloaded!
!IB!P53GML1!A8-LC9NLX0UQWG!T08.JPG already downloaded!

Dataset/1
-2J5192VAB93WA8JEEWMT6WUJNJ3DM.PNG already downloaded!
-3U3DC3LK_-U70_INP06_F3059S9OP.PNG already downloaded!
-6PEO3EIS2ADWY4PS2WWD9RVVLRQG1.PNG already downloaded!
-B!1ZJRA5DCPG7VHX9Y_UA2R!X!GQL.JPG already downloaded!
-BJK6DAFOZJ6IGO9SZO2QC!5_QXHHB.PNG already downloaded!
-B_J0HY7I6T43ESUFQZ68OM54SAZHX.PNG already downloaded!
-I00ZC!FSEJ_WNLMIZVFW1IE61E0EF.JPG already downloaded!

Dataset/2
-_KTP55APJ6G91RLYW_3J3I_YA6XO3.PNG already downloaded!
0-J5EEMLDOK-4WBJ0NYHG0Q!FBKZTQ.PNG already downloaded!
074BGFE334ICEIMLIHT1Y1Q-6QR21K.JPG already downloaded!
0I_Y65T!6FPTFC_YLIHWED86!MND-R.JPG already downloaded!

Dataset/3
0VMSGR44PS!S157Q74PTO6PAR87063.PNG already down