<a href="https://colab.research.google.com/github/olaviinha/SloppyNoto/blob/master/sloppyNoto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font face="Trebuchet MS" size="6">Sloppy Noto <font color="#999" size="3">v0.0.1</font><font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><a href="https://github.com/olaviinha/SloppyNoto" target="_blank"><font color="#999" size="4">Github</font></a><font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><font size="3" color="#999"><a href="https://inha.se" target="_blank"><font color="#999">O. Inha</font></a></font></font>

Sloppy Noto converts raw data from space probes into raw audio.

- While Sloppy Noto works with any CSV-like data file containing large quantities of numeric data, it was designed primarily to create audio from the raw datasets of various spacecraft of [ESA](https://www.esa.int/) and [NASA](https://nasa.gov).

- While Sloppy Noto can download, unzip and utilize various datasets and parts of datasets from the data archives of ESA and NASA, URLs given as `data_file` may fail, especially if the data is compressed. Should such a failure occur, please download the file to your local computer, extract it to your Google Drive and access it by giving the extracted file path as `data_file`.

- High-RAM runtime is recommended (available in Colab Pro only).

- Note to self: check for TXT and XML support.

<font size="5">Howto</font>
- Run one cell at a time and follow instructions per cell.

---

In [None]:
#@title #Setup
#@markdown This cell needs to be run only once. It will:
#@markdown 1. Connect your Google Drive.<br>
#@markdown 2. Import [inhagcutils](https://inha.asia/c/inhagcutils).

import os
# NOTE(review): every setup step below runs only while force_setup is False, so
# setting it to True skips those steps rather than forcing them — confirm intent.
force_setup = False

# Extra pip packages installed together with import-ipynb.
pip_packages = 'pysoundfile'

# inhagcutils
# Install the pip packages and download the helper notebook, but only when the
# notebook file is not already present in /content.
if not os.path.isfile('/content/inhagcutils.ipynb') and force_setup == False:
  %cd /content/
  !pip -q install import-ipynb {pip_packages}
  !curl -s -O https://raw.githubusercontent.com/olaviinha/inhagcutils/master/inhagcutils.ipynb
# The star import is presumably where helpers that this notebook calls without
# defining them (create_dirs, rnd_str, check_input_type, ...) come from.
import import_ipynb
from inhagcutils import *

# Mount Drive
# Skipped when /content/drive already exists.
if not os.path.isdir('/content/drive') and force_setup == False:
  from google.colab import drive
  drive.mount('/content/drive')

# Drive symlink
# Gives the Drive folder a path without a space in it; created only once.
if not os.path.isdir('/content/mydrive') and force_setup == False:
  os.symlink('/content/drive/My Drive', '/content/mydrive')
  # Not read anywhere else in the visible notebook.
  drive_root_set = True
# Prefix prepended to user-supplied Drive paths in later cells.
drive_root = '/content/mydrive/'

# Scratch directory that downloaded data files are written to.
dir_tmp = '/content/tmp/'
create_dirs([dir_tmp])
# Remembers the data_file handled by the last run of the "Select Data" cell,
# which compares against it to decide whether the input must be resolved again.
last_data_file = ''

class c:
  # ANSI escape sequences used as prefixes when printing styled console text.

  # Reset: switches off whatever colour/style was started before it.
  endc = '\033[0m'

  # Text styles: bold and underline.
  bold = '\033[1m'
  u = '\033[4m'

  # Foreground colours, named after the kind of message they are used for.
  title = '\033[96m'
  ok = '\033[92m'
  okb = '\033[94m'
  warn = '\033[93m'
  fail = '\033[31m'

def op(typex, msg, value=''):
  """Print a styled message, optionally followed by an unstyled value.

  typex is the escape-code prefix (one of the attributes of `c`), msg the text
  printed in that style. When value is anything other than '', it is printed
  after the message, separated by a single space, without styling.
  """
  styled = typex+msg+c.endc
  if value != '':
    print(styled, end=' ')
    print(value)
    return
  print(styled)

In [None]:
#@title #1. Select Data
#@markdown <small>`data_file` may be a URL to a file or file path to a file located in your Google Drive. Whenever you change your `data_file`, remember to reset other settings in this cell as well if you're unsure about them.</small>
data_file = "" #@param {type:"string"}
#@markdown <small>You may run this cell now, after setting `data_file`, to see a preview of your data to determine correct delimiter.</small>
delimiter = "None" #@param ["None", "whitespace", "tab", "semicolon", "comma", "pipe", "double_pipe"]
#use = "columns" #@param ["columns", "rows"]
#@markdown <small>You may run this cell again after setting `delimiter`, to see a better preview to determine which columns to use.</small><br><br>
#@markdown <small>List column numbers or column number ranges, separated by commas. E.g. `3` or `3, 5, 10` or `3-5` or `3-5, 8-10, 20-27`. These columns will be the candidates for sound file creation.</small>
preview_columns = "" #@param {type:"string"}

# Placeholder until a table is loaded; stays '' when nothing could be read.
data = ''
# Number of rows shown in the previews.
prev_rows = 10
# Filled in further down from the `delimiter` choice; '' means "not chosen yet".
separator = ''
# Sample rate (Hz) used to turn a row count into a duration.
global_sr = 44100
# Durations shorter than this many seconds trigger a warning.
secs_warn_limit = 0.8


def parse_columns(spec):
  """Turn a column selection string into a list of column numbers.

  spec is a comma-separated list whose items are either a single number ("3")
  or an inclusive range ("3-5"). Blank items, such as the one produced by a
  trailing comma, are skipped instead of raising.

  Returns the list of selected column numbers in the order given, or '' when
  nothing was selected ('' is the "no selection" marker the rest of the
  notebook tests for).

  Raises ValueError when an item is neither a number nor a number range.
  """
  selected = []
  for token in spec.split(','):
    token = token.strip()
    if token == '':
      continue
    if '-' in token:
      bounds = [int(bound) for bound in token.split('-')]
      selected.extend(range(bounds[0], bounds[1]+1))
    else:
      selected.append(int(token))
  return selected if selected else ''


columns = parse_columns(preview_columns)

if data_file != last_data_file:
  input_type = check_input_type(data_file)
  source_id = rnd_str(6)  
  last_data_file = data_file
  if input_type == 'link':
    if is_zip(data_file):
      zip_ext = path_ext(data_file, True)
      !wget {data_file} -O {dir_tmp}{source_id}.{zip_ext}
      !gunzip {dir_tmp}{source_id}.{zip_ext}
      !mv {dir_tmp}{source_id} {dir_tmp}{source_id}.csv
    else:
      !wget {data_file} -O {dir_tmp}{source_id}.csv
    use_file = dir_tmp+source_id+'.csv'
    input_type = check_input_type(use_file)
  elif input_type == 'unknown':
    use_file = drive_root+data_file
    input_type = check_input_type(use_file)
    if input_type == 'file':
      source_id = slug(basename(use_file))
  else:
    use_file = data_file

import itertools
import warnings

# Form choice -> separator handed to pandas. With engine='python', pandas treats
# a separator longer than one character as a regular expression, so the double
# pipe is escaped to match two literal pipe characters.
separators = {
  'whitespace': r'\s+',
  'tab': '\t',
  'semicolon': ';',
  'comma': ',',
  'pipe': '|',
  'double_pipe': r'\|\|',
}

def read_numeric_table(path, sep, skip, usecols=None):
  """Read a delimited text file into an all-numeric DataFrame.

  path:    file to read.
  sep:     separator passed to pandas.read_csv (may be a regular expression).
  skip:    number of leading lines to skip.
  usecols: column numbers to keep, or None to keep every column.

  Malformed lines are dropped with a warning rather than aborting the read.
  Values that cannot be converted to numbers become 0.
  """
  import pandas as pd
  options = dict(sep=sep, skiprows=skip, skipfooter=1, header=None, index_col=False,
                 skipinitialspace=True, skip_blank_lines=True, engine='python')
  if usecols is not None:
    options['usecols'] = usecols
  # error_bad_lines was superseded by on_bad_lines in pandas 1.3 and later
  # removed; pick whichever spelling the installed version understands.
  major, minor = (int(part) for part in pd.__version__.split('.')[:2])
  if (major, minor) >= (1, 3):
    options['on_bad_lines'] = 'warn'
  else:
    options['error_bad_lines'] = False
  table = pd.read_csv(path, **options)
  table = table.apply(pd.to_numeric, errors='coerce')
  return table.fillna(0)

if input_type != 'file':
  op(c.fail, 'FILE NOT FOUND:', use_file)
else:

  # Count the leading lines that start with '#'. The count starts at 1, so one
  # further line (presumably a header row) is skipped as well.
  skip_rows = 1
  with open(use_file, 'r') as f:
    for line in f:
      if not line.startswith('#'):
        break
      skip_rows += 1

  if delimiter == 'None':
    # No delimiter chosen yet: show raw lines so the user can pick one.
    op(c.ok, 'Input file:', use_file)
    with open(use_file) as f:
      for line in itertools.islice(f, skip_rows, skip_rows+prev_rows):
        print(line.replace('\n', ''))

    op(c.fail, '\nPlease select delimiter and run this cell again before proceeding.')
  else:
    separator = separators.get(delimiter, '')

  if separator != '':

    import pandas as pd
    pd.set_option('display.max_columns', None)
    # Without a column selection every column is loaded for the preview and the
    # user is asked to choose.
    colselect_warn = columns == ''
    data = read_numeric_table(use_file, separator, skip_rows, None if colselect_warn else columns)

    op(c.title, 'Data preview\n')
    op(c.warn, 'Note that the columns in this preview may be divided to multiple rows.\n')
    print(data.head(skip_rows+prev_rows))

    op(c.ok, '\nSeparator:', separator)
    if colselect_warn == True:
      op(c.fail, '\nPlease select which columns to use by typing their numbers into preview_columns field and run this cell again before proceeding.')

warnings.simplefilter(action='ignore', category=FutureWarning)

# Reset the per-source bookkeeping used by the output cell.
info_file_created = False
source_run = 0

# data is still the '' placeholder when the file was not found or no delimiter
# has been chosen yet; a duration can only be estimated once a table is loaded.
if hasattr(data, 'shape'):
  precise_secs = data.shape[0]/global_sr
  if precise_secs < secs_warn_limit:
    op(c.fail, '\nWARN:', 'Your data will produce only '+str(precise_secs)+' seconds of audio. Try with a file with more numeric rows if you want more.')

In [None]:
#@title #2. Visual previews
#@markdown - Creates visual waveform previews of all selected columns.<br> 
#@markdown - DC offsets won't be as wonky in the final sound files as they likely appear in these previews.<br>

#@markdown ### Note before next step:
#@markdown - Use the column number located above each waveform to determine which will be used to create a stereo sound.<br>
#@markdown - Colab (free standard RAM version) will run out of RAM somewhere after 20 minutes of generated audio. At the bottom of this cell's output, below the visual previews, the estimated duration is shown along with time_stretch examples. Use this information in case you want to use `time_stretch` in the next step.<br> I.e. **avoid stretching anything to over 20 minutes.**<br>
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import scipy
# Figure defaults for the previews: wide canvas on a white plot background.
plt.rcParams.update({'figure.figsize': [25, 6], 'axes.facecolor': 'white'})
row_count, column_count = data.shape
spheight = 6 * column_count
#data.plot(figsize=(25,spheight), subplots=True)

# One row of data is one audio sample, so rows / sample rate is the duration.
# Very short durations are kept exact; anything else is cut to whole seconds.
precise_secs = row_count/global_sr
secs = precise_secs if precise_secs < secs_warn_limit else math.floor(precise_secs)

op(c.warn, 'Generating visual previews...\n')
# Draw each selected column as its own figure, labelled with its column number.
for column_number in columns:
  op(c.title, 'Column number: '+str(column_number))
  data[column_number].plot()
  plt.show()

def pretty(secs):
  """Format a duration given in seconds as text, e.g. 90 -> '0:01:30'.

  The result is the string form of a datetime.timedelta, so durations of a day
  or more get a leading 'N day(s), ' part and fractional seconds are kept.
  """
  # Imported locally so the function does not depend on the name `datetime`
  # having been bound to the module by another cell or by a star import.
  from datetime import timedelta
  return str(timedelta(seconds=secs))

op(c.title, 'Estimated duration\n')
# Sub-limit durations are printed as a plain number of seconds, longer ones
# in clock format.
duration_line = (secs, 'seconds') if precise_secs < secs_warn_limit else (pretty(secs), '(h:mm:ss)')
print(*duration_line)

op(c.title, '\nExamples of time-stretch estimations for this duration\n')
# Labels are right-aligned so that the colons line up ('  2x:' ... '100x:').
for factor in (2, 5, 10, 25, 50, 75, 100):
  print(str(factor).rjust(3)+'x:', pretty(secs*factor))
print('\n(avoid time-stretching to over 20 minutes)')



In [None]:
#@title #3. Output
#@markdown You may run this cell multiple times with different selections and settings. No need to run previous cells.

#@markdown 

#@markdown ###Audio settings
#@markdown Select waveforms from above by typing in their column numbers. They will be the left and right channel of the created soundfile. Leave `right_channel` blank if you want the same waveform on both channels (mono sound).<br>
left_channel = "" #@param {type:"string"}
right_channel = "" #@param {type:"string"}
stereo_width = 7 #@param {type:"slider", min:0, max:10, step:1}
time_stretch = 50 #@param {type:"slider", min:0, max:100, step:1}
#@markdown <small>Default sample rate is 44100 Hz. `stretch_type: sample_rate` option will stretch the sound by reducing sample rate (fast but lo-fi). With `sample_rate` option, your maximum `time_stretch` is probably around 14. `stretch_type: linear_fill` will retain the sample rate of 44100 Hz and fill in the blanks (hi-fi but slow).<br>In other words: `linear_fill` will produce higher quality audio for your auditory perception.</small>
stretch_type = "linear_fill" #@param ["sample_rate", "linear_fill"]

#@markdown

#@markdown ###Save files
save_to_drive = False #@param {type:"boolean"}
timestamp_output_files = False #@param {type:"boolean"}
#@markdown <small>Enter a directory path pointing somewhere in your Google Drive. All sound files will be saved in this directory as WAV.</small>
output_dir = "sounds-from-space" #@param {type:"string"}
#@markdown <small>Save accompanying .txt file containing information about your settings. May come in handy one day in the distant future.</small>
save_info_txt = False #@param {type:"boolean"}
#@markdown <small>Optional note to be included in the information txt file</small>
free_note = "" #@param {type:"string"}

# Make output_dir a path under the Drive root.
# NOTE(review): fix_path is not defined in this notebook; presumably it
# normalises the path (e.g. ensures a trailing slash) — confirm.
output_dir = fix_path(drive_root+output_dir)

# Copy of the slider value taken before the mono case further down resets
# stereo_width to 0; the channel mix reads stereo_sep, not stereo_width.
stereo_sep = stereo_width
# Each channel is rescaled into the range -maxv..maxv before filtering and mixing.
maxv = 0.45
# When True, extra waveform plots of the start of the signal are drawn.
detail_view = False
# Identifier that becomes part of the WAV file name (presumably random — rnd_str
# is not defined in this notebook).
run_id = rnd_str(6)

def swf(sig1, sig2=''):
  """Plot one or two signals as waveforms on a black background.

  sig1 is drawn in yellowgreen. sig2, when given, is overlaid in
  semi-transparent salmon; '' (the default) or None means "no second signal".
  The vertical axis is fixed to -1..1.
  """
  # The background colour has to be set before the axes exist: plt.ylim creates
  # them, and axes created earlier keep the colour that was configured then.
  plt.rcParams.update({"axes.facecolor": "black"})
  plt.ylim(-1, 1)
  plt.plot(sig1, 'yellowgreen', linewidth=1, alpha=1)
  # Only an empty string or None counts as "absent". Comparing an array with ''
  # directly is not a usable truth test, hence the isinstance check.
  sig2_missing = sig2 is None or (isinstance(sig2, str) and sig2 == '')
  if not sig2_missing:
    plt.plot(sig2, 'salmon', linewidth=1, alpha=0.55)
  plt.show()

def query_yes_no(question, default="yes"):
  """Ask a yes/no question on stdout and return the answer as a bool.

  question is the text shown to the user. default is the answer assumed when
  the user just presses Enter: "yes", "no", or None to insist on an explicit
  answer. Keeps asking until a recognised answer is given.

  Raises ValueError when default is none of the three accepted values.
  """
  if default is not None and default != "yes" and default != "no":
    raise ValueError("invalid default answer: '%s'" % default)

  # The capital letter in the hint marks the answer used for an empty reply.
  if default is None:
    hint = " [y/n] "
  else:
    hint = " [Y/n] " if default == "yes" else " [y/N] "

  answers = {word: True for word in ("yes", "y", "ye")}
  answers.update({word: False for word in ("no", "n")})

  while True:
    sys.stdout.write(question + hint)
    reply = input().lower()
    if reply == '' and default is not None:
      return answers[default]
    if reply in answers:
      return answers[reply]
    sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
            
# A bare `import scipy` does not guarantee that the signal submodule is loaded.
import scipy.signal

def stretch_linear(samples, factor):
  """Lengthen a signal by linear interpolation between neighbouring samples.

  For every sample except the first and the last, `factor` evenly spaced values
  running from the previous sample to that sample (both ends included) are
  produced, and the runs are concatenated. Returns a float64 array; it is empty
  when fewer than three samples are given.
  """
  values = np.asarray(samples, dtype=np.float64)
  if len(values) < 3:
    return np.array([], dtype=np.float64)
  # One column per segment; transposing before flattening puts each segment's
  # values next to each other in the output.
  return np.linspace(values[:-2], values[1:-1], factor, axis=0).T.ravel()

channels = []
ready = True
error = ''
# A time_stretch of 0 means "no stretching", i.e. a factor of 1 for estimates.
cmp_time_stretch = time_stretch
if time_stretch == 0:
  cmp_time_stretch = 1
estimated_secs = secs*cmp_time_stretch

if str(left_channel).strip() == '':
  # Nothing to render without at least a left channel.
  ready = False
  error = 'channel_missing'
else:
  if estimated_secs > 1200:
    op(c.fail, '\n\nWARN:', 'You are about to time-stretch the soundfile to over 20 minutes (to '+pretty(estimated_secs)+' to be exact).\nIf you are rocking the free standard RAM version of Colab, there is a good chance Colab will run out of RAM and crash.\n')
    ready = query_yes_no('Want to take your chances anyway?')
    error = 'timestretch_length'
  if estimated_secs > 180 and save_to_drive == False:
    ready = False
    error = 'preview_length'

if ready == False:
  if error == 'channel_missing':
    op(c.fail, 'ERROR:', 'Please type a column number into left_channel and run this cell again.')
  if error == 'timestretch_length':
    op(c.fail, 'Try reducing time-stretch and run cell again.')
  if error == 'preview_length':
    op(c.fail, 'ERROR:', 'Your soundfile is too long for Colab to preview, and you have save_to_drive unchecked. Please reduce soundfile length or check save_to_drive.')
else:
  left = int(left_channel)
  if str(right_channel).strip() == '':
    # Mono: the same column feeds both channels. stereo_width is only reported
    # in the info file; the mix itself uses stereo_sep.
    right = left
    stereo_width = 0
  else:
    right = int(right_channel)

  # 10th-order Butterworth high-pass at 15 Hz, applied to both channels
  # (presumably to remove DC offset and sub-audible drift).
  sos = scipy.signal.butter(10, 15, 'hp', fs=global_sr, output='sos')
  for column_number in (left, right):
    chan = data[column_number]
    if time_stretch > 0 and stretch_type == "linear_fill":
      chan = stretch_linear(chan, time_stretch)
    # Rescale the column's value range to -maxv..maxv.
    chan = np.interp(chan, (chan.min(), chan.max()), (np.negative(maxv), maxv))
    channels.append(scipy.signal.sosfilt(sos, chan))

  # The sample_rate stretch keeps the samples and lowers the playback rate.
  xsr = global_sr
  if time_stretch > 0 and stretch_type == "sample_rate":
    xsr = math.floor(global_sr/time_stretch)
    print('New sample rate:', xsr)

  # Mix: 0 sums both channels into mono, 10 keeps them fully separate, anything
  # in between bleeds 1/stereo_sep of the opposite channel into each side.
  if stereo_sep == 0:
    fin_left = channels[0]+channels[1]
    fin_right = fin_left
  elif stereo_sep == 10:
    fin_left = channels[0]
    fin_right = channels[1]
  else:
    fin_left = channels[0]+channels[1]/stereo_sep
    fin_right = channels[0]/stereo_sep+channels[1]

  if detail_view == True:
    dvend = math.floor(xsr/2)
    swf(fin_left[0:dvend])
    swf(fin_right[0:dvend])

  # When either channel starts far from zero, drop the first half second
  # (presumably to cut the filter's start-up transient).
  if fin_left[0] < -0.4 or fin_left[0] > 0.4 or fin_right[0] < -0.4 or fin_right[0] > 0.4:
    clip_point = math.floor(xsr/2)
    fin_left = fin_left[clip_point:]
    fin_right = fin_right[clip_point:]

  if detail_view == True:
    dvend = math.floor(xsr/2)
    swf(fin_left[0:dvend])
    swf(fin_right[0:dvend])
    swf(fin_left)
    swf(fin_right)

  # Shape (2, samples): row 0 is the left channel, row 1 the right.
  audio = np.array([fin_left, fin_right], np.float64)

  def appendTxt(file, content):
    """Append content plus a newline to the text file at path `file`."""
    with open(file, 'a+') as txt:
      txt.writelines(content+'\n')

  swf(fin_left, fin_right)

  if time_stretch > 14 and stretch_type == 'sample_rate':
    op(c.fail, 'ERROR:', 'You may not set time_stretch higher than 14 if you use \'sample_rate\' as stretch_type. Reduce time_stretch or change stretch_type to \'linear_fill\'')
  else:
    # Duration in seconds = samples per channel / sample rate.
    if audio.shape[-1]/xsr < 180:
      audio_player(audio, sr=xsr)
    else:
      op(c.warn, 'WARN:', 'Soundfile is too long to be previewed in Colab. It will still be saved to your Drive if you have save_to_drive checked.')

    source_run += 1
    if save_to_drive == True:
      import soundfile
      import datetime
      # Writing fails if the target directory does not exist yet.
      os.makedirs(output_dir, exist_ok=True)
      if timestamp_output_files == True:
        wav_timestamp = datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S')+'_'
      else:
        wav_timestamp = ''
      wav_file = output_dir+'noto_'+wav_timestamp+source_id+'_'+run_id+'.wav'
      # soundfile expects (samples, channels), hence the transpose.
      soundfile.write(wav_file, audio.T, xsr)
      if save_info_txt == True:
        if info_file_created == False:
          # First variant for this data source: start a fresh info file with a
          # header describing the source.
          if timestamp_output_files == True:
            txt_timestamp = datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S')+'_'
          else:
            txt_timestamp = ''
          txt_file = output_dir+'noto_'+txt_timestamp+source_id+'.txt'
          params = ['started:            '+datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')+'\n\n',
                    free_note+'\n\n',
                    'data_file:          '+data_file+'\n',
                    'columns:            '+str(preview_columns)+'\n',
                    'delimiter:          '+delimiter+' ('+separator+')\n\n',
                    '---\n\n']
          with open(txt_file, 'w') as txt:
            txt.writelines(params)
          last_free_note = free_note
          info_file_created = True

        # Every run appends one entry describing the variant just written.
        params = ['VARIANT #'+str(source_run)+': '+path_leaf(wav_file)+'\n',
                  'created:            '+datetime.datetime.today().strftime('%H:%M:%S')+'\n',
                  'left_channel:       column '+str(left_channel)+'\n',
                  'right_channel:      column '+str(right_channel)+'\n',
                  'stereo_width:       '+str(stereo_width)+'\n',
                  'time_stretch:       '+str(time_stretch)+'x\n',
                  'stretch_type:       '+stretch_type+'\n\n']
        with open(txt_file, 'a+') as txt:
          txt.writelines(params)
        # A note that changed since the last entry is recorded below the entry.
        if free_note != last_free_note:
          appendTxt(txt_file, free_note+'\n')
          last_free_note = free_note


