<a href="https://colab.research.google.com/github/olaviinha/SloppyNoto/blob/master/ftp_crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## FTP crawler

This utility lists all files of 5+ MB recursively from a set directory on an FTP server. 

This utility accompanies [Sloppy Noto](https://github.com/olaviinha/SloppyNoto): use it to find suitable data file candidates for data-to-audio conversion.

Note that it is not recommended to set `crawl_directory` to a top-level directory, or in fact any upper-level directory (short path), if that directory contains a large subdirectory structure with a large number of files, such as _all data from an entire space mission_. It is recommended to use a low-level directory (long path) instead. Should this recommendation be ignored, you are prohibited from blaming this utility for _dying of old age_ before file list retrieval is completed (seriously, it may take a number of **hours**).



In [None]:
#@title Settings
# NOTE: the `#@title`, `#@param` and `#@markdown` annotations in this cell are
# Google Colab form markup; keep each one attached to the line it annotates.
# Host name of the FTP server; passed to ftputil.FTPHost further down.
ftp_host = "" #@param {type:"string"}
#@markdown <small>Upper level directories are not recommended as `crawl_directory`.</small>
# Directory on the server where the recursive listing starts.
crawl_directory = "" #@param {type:"string"}

#@markdown <small>Save file lists as a _.txt_ file in your Google Drive. At least use this if you chose to ignore the recommendation above.</small>
save_txt = False #@param {type:"boolean"}
#@markdown <small>Path to a directory in your Google Drive where the txt will be saved.</small>
# Interpreted relative to the Drive root; it is prefixed with drive_root below.
save_path = "space_data" #@param {type:"string"}

# Alias under which the crawl code refers to the start directory.
basedir = crawl_directory

# NOTE(review): every setup step below runs only while this is False, so
# setting it to True skips the step instead of forcing it -- confirm intent.
force_setup = False
# Google Drive is only needed when the file list is saved to a txt file.
if save_txt == True:
  import os
  # inhagcutils
  # Fetch the helper notebook once; import-ipynb makes it importable as a module.
  if not os.path.isfile('/content/inhagcutils.ipynb') and force_setup == False:
    %cd /content/
    !pip -q install import-ipynb
    !curl -s -O https://raw.githubusercontent.com/olaviinha/inhagcutils/master/inhagcutils.ipynb
  import import_ipynb
  # Presumably the source of create_dirs, fix_path and rnd_str, which are used
  # in this cell but not defined in it -- TODO confirm.
  from inhagcutils import *

  # Mount Drive
  if not os.path.isdir('/content/drive') and force_setup == False:
    from google.colab import drive
    drive.mount('/content/drive')

  # Drive symlink
  # Gives the Drive root a path without a space in it.
  if not os.path.isdir('/content/mydrive') and force_setup == False:
    os.symlink('/content/drive/My Drive', '/content/mydrive')
    # Not read anywhere else in this cell.
    drive_root_set = True
  drive_root = '/content/mydrive/'

  dir_tmp = '/content/tmp/'
  create_dirs([dir_tmp])
  # Turn the user-supplied relative path into a path under the Drive symlink.
  # The result is later concatenated directly with a file name, so fix_path
  # presumably guarantees a trailing slash -- TODO confirm.
  save_path = fix_path(drive_root+save_path)

# Install ftputil only once per runtime: the flag lives in the notebook's
# globals and therefore survives re-running this cell.
if not 'ftputil_installed' in globals():
  !pip install ftputil
  ftputil_installed = True


# Every file found by the crawl, stored as [size_in_bytes, path] entries.
all_files = []
# The same kind of entries grouped by size range, largest range first:
# over 1 GB, 0.5-1 GB, 100-500 MB and 50-100 MB ...
gigs, half_gigs, texas_pennies, fiddies = [], [], [], []
# ... followed by 20-50 MB, 10-20 MB and 5-10 MB.
small, extra_small, exxxtra_small = [], [], []
# Bytes per (decimal) megabyte; size thresholds and MB figures are based on it.
b = 10**6

class c:
  """ANSI escape sequences for styling console output.

  A colour or attribute code is passed to op() as the prefix of a message;
  op() appends `endc` so the styling does not leak into later output.
  """
  # Reset: switches all colours and attributes off again.
  endc = '\033[0m'
  # Foreground colours.
  fail = '\033[31m'   # red
  ok = '\033[92m'     # bright green
  warn = '\033[93m'   # bright yellow
  okb = '\033[94m'    # bright blue
  title = '\033[96m'  # bright cyan
  # Text attributes.
  bold = '\033[1m'
  u = '\033[4m'       # underline

def op(typex, msg, value=''):
  """Print a styled message, optionally followed by an unstyled value.

  typex -- escape sequence placed in front of the message (e.g. c.warn).
  msg   -- message text; the styling is reset with c.endc right after it.
  value -- when not the empty string, printed after the message on the same
           line, separated from it by a single space.
  """
  styled = typex+msg+c.endc
  if value != '':
    # print() joins its arguments with one space, giving "<styled> <value>".
    print(styled, value)
    return
  print(styled)

import sys
import os
import ftplib
import ftputil
import fnmatch
# When saving is enabled, start the log file with a short header. log_all is
# deliberately left open here: the listing code further down writes to it and
# closes it at the end of the cell.
if save_txt == True:
  txt = save_path+'ftp_'+ftp_host+'filelist_'+rnd_str(4)+'.txt'
  log_all = open(txt, 'a+')
  log_all.write('FTP host: '+ftp_host+'\n')
  log_all.write('FTP dir: '+basedir+'\n\n')
op(c.title, 'Logging in to '+ftp_host+'...')
# Anonymous login. The context manager closes the FTP connection when the crawl
# finishes or fails, instead of leaving the session open.
with ftputil.FTPHost(ftp_host, 'anonymous', 'anonymous@domain.com') as host:
  recursive = host.walk(basedir, topdown=True, onerror=None) # recursive search
  op(c.warn, 'Retrieval in progress...')
  for root, dirs, files in recursive:
    for name in files:
      # Join with the remote host's path rules rather than the local OS's.
      fullpath = host.path.join(root, name)
      size = host.path.getsize(fullpath)
      all_files.append([size, fullpath])
      # Each file lands in exactly one bucket. Lower bounds are inclusive so a
      # file whose size sits exactly on a boundary (e.g. exactly 500 MB) is no
      # longer dropped from every bucket. Files under 5 MB are not bucketed.
      if size >= 1000*b:
        gigs.append([size, fullpath])
      elif size >= 500*b:
        half_gigs.append([size, fullpath])
      elif size >= 100*b:
        texas_pennies.append([size, fullpath])
      elif size >= 50*b:
        fiddies.append([size, fullpath])
      elif size >= 20*b:
        small.append([size, fullpath])
      elif size >= 10*b:
        extra_small.append([size, fullpath])
      elif size >= 5*b:
        exxxtra_small.append([size, fullpath])
op(c.ok, 'Retrieval complete.\n\n')

# Bytes per (decimal) megabyte; print_filelist() divides file sizes by this.
b = 10**6

op(c.title, 'Files by size range\n')

# log_all = ''
# save_txt = False
def apnd(content):
  """Append `content` to the log file whose path is stored in the global `txt`.

  The file is opened and closed again on every call. A private handle is used
  so the module-level `log_all` handle is left untouched: rebinding it to a
  handle that is closed right away would make every later `log_all.write`
  fail. The `with` block also closes the file if the write raises.
  """
  with open(txt, 'a+') as log_file:
    log_file.write(content)

def print_filelist(list, title):
  """Print one size-range section of the report and mirror it to the log.

  list  -- iterable of (size_in_bytes, path) pairs. The name shadows the
           builtin but is kept because it is part of the signature.
  title -- heading for the section.

  Each entry is printed as "<size in MB, two decimals> MB: ftp://<host><path>".
  When the global save_txt is True, the heading and every line are also
  written to the open log file log_all.
  """
  op(c.warn, '\n'+title+':')
  write_log = save_txt == True
  if write_log:
    log_all.write('\n'+title+'\n')
  for size_bytes, path in list:
    megabytes = round(size_bytes/b, 2)
    line = '{:.2f}'.format(megabytes)+' MB: ftp://'+ftp_host+path
    print(line)
    if write_log:
      log_all.write(line+'\n')

# Report sections in display order, largest files first. A section is only
# printed when its bucket actually contains files.
report_sections = (
  (gigs, 'Over 1 GB'),
  (half_gigs, '0,5-1 GB'),
  (texas_pennies, '100-500 MB'),
  (fiddies, '50-100 MB'),
  (small, '20-50 MB'),
  (extra_small, '10-20 MB'),
  (exxxtra_small, '5-10 MB'),
)
for section_files, section_title in report_sections:
  if section_files:
    print_filelist(section_files, section_title)

# The log file was opened before the crawl; everything has been written now.
if save_txt == True:
  log_all.close()