<a href="https://colab.research.google.com/github/olaviinha/MidiTurmoil/blob/main/MIDI_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#<font face="Trebuchet MS" size="6">MIDI Scraper <font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><a href="https://github.com/olaviinha" target="_blank"><font color="#999" size="4">Github</font></a>

Scrapes MIDI files from mididb.com (artist URL or genre URL), freemidi.org (artist URL), or any web page that links directly to MIDI files.

`output_dir` should be relative to your Google Drive root, e.g. `music/midis` if you have a directory called *music* in your Drive containing a subdirectory called *midis*, and that is where you want to save all scraped MIDI files.

When scraping mididb.com or freemidi.org, a subdirectory for each artist is created automatically under `output_dir` (e.g. `music/midis/aphex-twin`). For any other site, the subdirectory is named after the website's domain name.

In [65]:
#@title #Setup
#@markdown This cell needs to be run only once. It will mount your Google Drive and setup prerequisites.<br>
#@markdown <small>Mounting Drive will enable this notebook to save outputs directly to your Drive. Otherwise you will need to copy/download them manually from this notebook.</small>

# Flag consulted by the inhagcutils bootstrap step below.
force_setup = False
# Git repository URLs to clone (and pip-install, if they ship setup files) below.
repositories = []
# Extra package names appended to the `pip install` command line below.
pip_packages = ''
# Package names passed to `apt-get install`; nothing is installed when empty.
apt_packages = ''
# When True, Google Drive is mounted and used as the output root;
# otherwise a local stand-in directory is used instead.
mount_drive = True #@param {type:"boolean"}
# When True, the repositories listed above are not cloned/installed.
skip_setup = False #@ param {type:"boolean"}

# Download the repo from Github
import os
from google.colab import output
import warnings
warnings.filterwarnings('ignore')
%cd /content/

# inhagcutils
if not os.path.isfile('/content/inhagcutils.ipynb') and force_setup == False:
  !pip -q install import-ipynb {pip_packages}
  if apt_packages != '':
    !apt-get update && apt-get install {apt_packages}
  !curl -s -O https://raw.githubusercontent.com/olaviinha/inhagcutils/master/inhagcutils.ipynb
import import_ipynb
from inhagcutils import *

# Mount Drive
if mount_drive is True:
  if not os.path.isdir('/content/drive'):
    from google.colab import drive
    drive.mount('/content/drive')
  # Space-free alias for the Drive root.
  if not os.path.isdir('/content/mydrive'):
    os.symlink('/content/drive/My Drive', '/content/mydrive')
  # Always define drive_root (with a trailing slash), even when this cell is
  # re-run on a runtime where the Drive is already mounted and the symlink
  # already exists; later cells build paths by plain string concatenation.
  drive_root = '/content/mydrive/'
  drive_root_set = True
else:
  # No Drive: fall back to a local directory that plays the same role.
  create_dirs(['/content/faux_drive'])
  drive_root = '/content/faux_drive/'

if len(repositories) > 0 and skip_setup == False:
  for repo in repositories:
    %cd /content/
    install_dir = fix_path('/content/'+path_leaf(repo).replace('.git', ''))
    repo = repo if '.git' in repo else repo+'.git'
    !git clone {repo}
    if os.path.isfile(install_dir+'setup.py') or os.path.isfile(install_dir+'setup.cfg'):
      !pip install -e ./{install_dir}
    if os.path.isfile(install_dir+'requirements.txt'):
      !pip install -r {install_dir}/requirements.txt

if len(repositories) == 1:
  %cd {install_dir}

dir_tmp = '/content/tmp/'
create_dirs([dir_tmp])

import time, sys
from datetime import timedelta

import urllib.request
import requests


# Use the `BeautifulSoup` module when it is importable, otherwise fall back
# to the `bs4` package; either way the class is bound to `BeautifulSoup`.
try: 
  from BeautifulSoup import BeautifulSoup
except ImportError:
  from bs4 import BeautifulSoup

# One shared HTTP session, reused by wild_scrape() and scrape() for all
# file downloads.
session = requests.Session()

def wild_scrape(page_url, out_dir):
  """Download every MIDI file linked from an arbitrary web page.

  All `<a href>` targets ending in `.mid` or `.midi` (case-insensitive) are
  collected and saved under `out_dir + fix_path(<host without "www.">)`.
  Files that already exist on disk are skipped.

  Updates the module-level counters `tot_em` (links seen) and `got_em`
  (files actually saved).

  Args:
    page_url: Absolute URL of the page to scan for MIDI links.
    out_dir: Output root directory; the per-site directory name is appended
      by plain concatenation, so it is expected to end with a separator.
  """
  global got_em, tot_em
  from urllib.parse import urljoin, urlsplit

  host = urlsplit(page_url).netloc

  headers = {
    "Host": host,
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
  }

  with urllib.request.urlopen(page_url) as page:
    html = BeautifulSoup(page.read())
  anchors = html.body.find_all('a', href=True) if html.body is not None else []

  tracks = []
  for anchor in anchors:
    # urljoin resolves relative, root-relative and absolute hrefs alike;
    # string surgery on page_url broke for "/x.mid" and directory URLs.
    href = urljoin(page_url, str(anchor['href']))
    if href.lower().endswith(('.mid', '.midi')):
      tracks.append([path_leaf(href), href])

  dir_out = out_dir+fix_path(host.replace('www.', ''))
  if tracks:
    # Only create the site directory when there is something to save.
    os.makedirs(dir_out, exist_ok=True)

  total_tracks = len(tracks)
  for iii, (name, link) in enumerate(tracks, 1):
    iii_ndx_info = str(iii)+'/'+str(total_tracks)+' '
    filename = dir_out+str(name)
    tot_em = tot_em+1

    if os.path.isfile(filename):
      op(c.warn, iii_ndx_info+'File already exists, skipping ', filename.replace(drive_root, ''), time=True)
      continue

    try:
      # NOTE(review): the request is deliberately issued twice, as in the
      # original; presumably the first call primes session cookies -- confirm
      # before removing.
      session.get(link, headers=headers)
      req = session.get(link, headers=headers)
      # Do not store an HTTP error page under a .mid name.
      req.raise_for_status()
      with open(filename, "wb") as mid:
        mid.write(req.content)
    except (requests.RequestException, OSError):
      op(c.fail, iii_ndx_info+'ERROR saving', link, time=True)
      continue

    op(c.ok, iii_ndx_info+'Saved', filename.replace(drive_root, ''), time=True)
    got_em = got_em+1

def scrape(page_url, out_dir):
  """Download all MIDI files listed for a mididb.com or freemidi.org URL.

  The start page is inspected for pagination (freemidi.org: the page list
  inside the `artist-container` block; mididb.com: the `jumpmenu` select).
  Every resulting page is fetched and each track found on it is saved as
  `out_dir + fix_path(artist) + title + '.mid'`. Files that already exist
  are skipped.

  Updates the module-level counters `tot_em` (tracks seen) and `got_em`
  (tracks actually saved).

  Args:
    page_url: A mididb.com or freemidi.org page URL.
    out_dir: Output root directory; the artist directory name is appended by
      plain concatenation, so it is expected to end with a separator.

  Returns:
    The string 'URL not supported.' when page_url belongs to neither site,
    otherwise None.
  """
  global got_em, tot_em
  from urllib.parse import urljoin

  if 'mididb.com' in page_url:
    host = 'mididb.com'
    scraping = 'mididb'
  elif 'freemidi.org' in page_url:
    host = 'freemidi.org'
    scraping = 'freemidi'
  else:
    return 'URL not supported.'

  base_url = 'https://'+host+'/'
  base_www = 'https://www.'+host+'/'
  if scraping == 'mididb' and not page_url.endswith('/'):
    page_url = page_url+'/'

  headers = {
    "Host": host,
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
  }

  with urllib.request.urlopen(page_url) as page:
    html = BeautifulSoup(page.read())
  page_inputs = []

  if scraping == 'freemidi':
    # The start page itself always counts as the first page.
    page_inputs.append(page_url)
    # attrs must be a dict; the former {'class', '...'} was a set literal.
    container = html.body.find('div', attrs={'class': 'artist-container'})
    pager = container.find('nav') if container is not None else None
    page_list = pager.find('ul') if pager is not None else None
    # A missing pagination block simply means there is only the start page.
    nav = page_list.find_all('li') if page_list is not None else []
    for n in nav:
      anchor = n.find('a')
      if anchor is None or not anchor.has_attr('href'):
        continue
      href = str(anchor['href'])
      if href != '#':
        page_inputs.append(urljoin(base_url, href))

  if scraping == 'mididb':
    jump_menu = html.body.find('select', attrs={'class': 'jumpmenu'})
    if jump_menu is not None:
      for n in jump_menu.find_all('option'):
        tail = str(n.attrs.get('value', ''))
        tail = tail.replace('genres.asp?permalink=/', '')
        if not tail:
          # Placeholder option without a target (tail[0] used to raise here).
          continue
        if tail.startswith('/'):
          tail = tail[1:]
        page_inputs.append(base_www+tail)
    if not page_inputs:
      page_inputs = [page_url]

  total_pages = len(page_inputs)
  # The freemidi artist name comes from the start URL and is the same for
  # every track, so compute it once.
  freemidi_artist = '-'.join(page_url.split('-')[2:])

  for ii, page_input in enumerate(page_inputs, 1):
    tracks = []
    ii_ndx_info = str(ii)+'/'+str(total_pages)+' '
    op(c.title, ii_ndx_info+'Fetching from page', page_input, time=True)

    with urllib.request.urlopen(page_input) as page:
      html = BeautifulSoup(page.read())

    if scraping == 'mididb':
      cnts = html.body.find_all('span', attrs={'class': 'song-title'})
      for cnt in cnts:
        anchor = cnt.find('a')
        if anchor is None:
          continue
        href = anchor.attrs['href']
        midi_link = base_url+'midi-download/'+cnt.attrs['id'].replace('s_', 'AUD_')+'.mid'
        parts = href.split('/')
        artist = parts[3]
        filename = parts[4]
        tracks.append([artist, filename, midi_link])

    if scraping == 'freemidi':
      # Parse the page fetched above (this used to read the undefined
      # name `html2` and raised NameError for every freemidi scrape).
      cnts = html.body.find_all('span', attrs={'itemprop':'name'})
      for cnt in cnts:
        anchor = cnt.find('a')
        if anchor is None:
          # Not every itemprop="name" span necessarily wraps a track link.
          continue
        pcs = anchor.attrs['href'].split('-')
        if len(pcs) < 2:
          continue
        track_id = pcs[1]
        filename = '-'.join(pcs[2:])
        midi_link = base_url+'getter-'+track_id
        tracks.append([freemidi_artist, filename, midi_link])

    total_tracks = len(tracks)
    for iii, track in enumerate(tracks, 1):
      iii_ndx_info = str(iii)+'/'+str(total_tracks)+' '
      artist = str(track[0])
      title = str(track[1])
      link = str(track[2])
      dir_out = out_dir+fix_path(artist)
      os.makedirs(dir_out, exist_ok=True)
      filename = dir_out+title+'.mid'
      tot_em = tot_em+1

      if os.path.isfile(filename):
        op(c.warn, iii_ndx_info+'File already exists, skipping ', filename.replace(drive_root, ''), time=True)
        continue

      try:
        # NOTE(review): the request is deliberately issued twice, as in the
        # original; presumably the first call primes session cookies needed
        # by the download endpoint -- confirm before removing.
        session.get(link, headers=headers)
        req = session.get(link, headers=headers)
        # Do not store an HTTP error page under a .mid name.
        req.raise_for_status()
        with open(filename, "wb") as mid:
          mid.write(req.content)
      except (requests.RequestException, OSError):
        op(c.fail, iii_ndx_info+'ERROR saving', link, time=True)
        continue

      op(c.ok, iii_ndx_info+'Saved', filename.replace(drive_root, ''), time=True)
      got_em = got_em+1
    print()




# Clear this cell's output so that only the status line below remains.
output.clear()
# !nvidia-smi
op(c.ok, 'Setup finished.')

[92mSetup finished.[0m


In [None]:
#@title # Do stuff
# You may enter multiple URLs, use ; as separator.
url = "" #@param {type:"string"}
output_dir = "" #@param {type:"string"}
end_session_when_done = False #@ param {type: "boolean"}

uniq_id = gen_id()

# You may enter URLs to this list variable, enter 'list' to url: field to use it:
url_list = []


# Resolve the URLs to process. Blank entries (empty field, trailing ';') are
# dropped so they never reach the scrapers, which cannot parse an empty URL.
if url == 'list':
  urls = url_list
else:
  urls = url.split(';')
urls = [x.strip() for x in urls if x.strip()]

inputs = urls

# Output
# An empty output_dir now falls back to the Drive root instead of leaving
# out_dir undefined. makedirs also creates missing parent directories, so a
# nested value such as "music/midis" works.
out_dir = drive_root+fix_path(output_dir) if output_dir != '' else drive_root
os.makedirs(out_dir, exist_ok=True)

timer_start = time.time()
total = len(inputs)

# Counters updated by scrape() / wild_scrape() through `global`.
got_em = 0
tot_em = 0

if total == 0:
  op(c.warn, 'No URLs to scrape.')

# -- DO THINGS --
for i, page_url in enumerate(inputs, 1):
  ndx_info = str(i)+'/'+str(total)+' '
  print()
  op(c.title, ndx_info+'Scraping URL', page_url, time=True)
  print()
  try:
    if 'freemidi.org' in page_url or 'mididb.com' in page_url:
      scrape(page_url, out_dir)
    else:
      wild_scrape(page_url, out_dir)
  except OSError as err:
    # urllib and requests network errors as well as filesystem errors all
    # derive from OSError; report and carry on with the remaining URLs.
    op(c.fail, ndx_info+'ERROR scraping', page_url+' ('+str(err)+')', time=True)
# -- END THINGS --

timer_end = time.time()

op(c.ok, 'Saved '+str(got_em)+'/'+str(tot_em)+' files.', time=True)
op(c.okb, 'Elapsed', timedelta(seconds=timer_end-timer_start), time=True)
op(c.ok, 'FIN.')

if end_session_when_done is True: end_session()