# 2) Analyze source collection and target file

This notebook contains the code to analyze the collection of sounds compiled in the previous notebook, which will later be used as the source collection in our audio mosaicing code. It also contains the code to analyze the target audio file, which will later be reconstructed using sound chunks from the source collection.

The audio analysis carried out in this notebook uses the Python bindings of the Essentia library, which was introduced in the first session of AMPLAB. Please make sure you have gone through the [Essentia Python tutorial](https://essentia.upf.edu/documentation/essentia_python_tutorial.html) to get familiar with using Essentia in Python. It is also useful to keep a browser tab open with Essentia's [Algorithms Reference](https://essentia.upf.edu/documentation/algorithms_reference.html) documentation page.

In [1]:
!pip install essentia
!pip install mir_eval



In [2]:
# Mount drive and cd to notebook folder (Colab-only setup)
import os
import sys

from google.colab import drive

# Single definition of the notebook folder; used both as the working directory
# and as an extra import location.
COLAB_WORKDIR = "/content/drive/MyDrive/Colab Notebooks/freesound-final"

drive.mount('/content/drive')

# Switch to the notebook folder so the relative paths used later in the notebook
# resolve inside it, and print it as confirmation.
os.chdir(COLAB_WORKDIR)
print(os.getcwd())

# Make modules stored next to the notebook importable (idempotent on re-run)
if COLAB_WORKDIR not in sys.path:
  sys.path.append(COLAB_WORKDIR)
print(sys.path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/freesound-final
['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/Colab Notebooks/freesound-final']


In [3]:
import numpy as np
import pandas as pd
import essentia
import essentia.standard as estd
import matplotlib.pyplot as plt
from IPython.display import display, Audio
from mir_eval.sonify import pitch_contour

In [4]:
# Define the sample rate of the input audio
fs = 44100

def analyze_sound(audio_path, audio_id=None):
  """Split a sound into note-based chunks and extract features from each chunk.

  The audio is loaded as mono at `fs` Hz, its predominant pitch contour is
  estimated and segmented into notes, and every span between two consecutive
  note onsets that lasts more than 0.1 s is analyzed.

  Args:
    audio_path: Path of the audio file to analyze.
    audio_id: Optional identifier stored with every chunk under
      'freesound_id' (and embedded in 'id').

  Returns:
    A list with one dict per analyzed chunk, containing the keys
    'freesound_id', 'id', 'path', 'start_sample', 'end_sample', 'loudness',
    'mfcc_0' ... 'mfcc_N' and 'mean_pitch'. The list is empty when fewer than
    two onsets are found or no chunk is long enough.
  """
  # Load the audio as mono at `fs` so that the time <-> sample conversions
  # below use the same rate as the decoded signal
  audio = estd.MonoLoader(filename=audio_path, sampleRate=fs)()

  # Use PitchMelodia from Essentia to analyse the pitch contour of the predominant
  # melody and extract the pitch onset times
  hop_size = 64
  pitch_extractor = estd.PredominantPitchMelodia(
      frameSize=8820,
      hopSize=hop_size,
      sampleRate=fs
  )
  pitch_values, _pitch_confidence = pitch_extractor(audio)

  # Segment the pitch contour into notes: onset time, duration and pitch per note
  onset_times, _durations, notes = estd.PitchContourSegmentation(
      hopSize=hop_size,
      minDuration=.6,
      pitchDistanceThreshold=30,
      rmsThreshold=-4,
      sampleRate=fs
  )(pitch_values, audio)

  onset_samples = [int(round(t * fs)) for t in onset_times]

  # Instantiate the feature extractors once and reuse them for every chunk
  loudness_algo = estd.Loudness()
  w_algo = estd.Windowing(type = 'hann')
  spectrum_algo = estd.Spectrum()
  mfcc_algo = estd.MFCC(sampleRate=fs)

  # Chunks span consecutive onsets, so the note starting at the last onset
  # (which has no following onset) is not analyzed.
  chunk_bounds = zip(onset_times[:-1], onset_times[1:],
                     onset_samples[:-1], onset_samples[1:])

  analysis_output = []

  for ix, (fstart_time, fend_time, fstart, fend) in enumerate(chunk_bounds):
      # Only keep chunks longer than 0.1 s
      if not (fend_time - fstart_time > .1):
          continue

      # Get corresponding audio chunk (slicing clamps to the end of the audio)
      frame = audio[fstart:fend]

      if len(frame) % 2 != 0:
          frame = frame[:-1] # Make frame size even

      # Nothing meaningful to analyze (also avoids a division by zero below)
      if len(frame) < 2:
          continue

      # Initialize dictionary to store analysis results with some basic metadata
      # NOTE(review): 'id' is identical for every chunk of the same sound, so it
      # does not uniquely identify a chunk - confirm whether that is intended.
      frame_output = {
          'freesound_id': audio_id,
          'id': f'None_{audio_id}',
          'path': audio_path,
          'start_sample': fstart,
          'end_sample': fend,
      }

      # Extract loudness
      loudness = loudness_algo(frame)
      frame_output['loudness'] = loudness / len(frame)  # Normalize by length of frame

      # Extract MFCC coefficients from the spectrum of the whole (windowed) chunk
      spec = spectrum_algo(w_algo(frame))
      _, mfcc_coeffs = mfcc_algo(spec)
      frame_output.update({'mfcc_{0}'.format(j): mfcc_coeffs[j] for j in range(0, len(mfcc_coeffs))})

      # Use the pitch of the note that starts at this chunk's onset; `ix` is
      # the position of that onset in `onset_times`, hence also in `notes`
      frame_output['mean_pitch'] = notes[ix]

      # Add frame analysis results to output
      analysis_output.append(frame_output)

  return analysis_output

## Analyze source collection

In [None]:
DATAFRAME_FILENAME = 'dataframe.csv'  # DataFrame file of the sound source collection to analyze
DATAFRAME_SOURCE_FILENAME = 'dataframe_source.csv'  # DataFrame file where to store the results of our analysis

# Load the DataFrame of the sound source collection created in previous notebook and analyze all sound files in it.
# The path is given directly to pandas so that the file is opened and closed by read_csv itself.
df = pd.read_csv(DATAFRAME_FILENAME, index_col=0)
analyses = []
n_sounds = len(df)
for position, (sound_id, sound_path) in enumerate(zip(df['freesound_id'], df['path']), start=1):
    print('Analyzing sound with id {0} [{1}/{2}]'.format(sound_id, position, n_sounds))
    # Split the sound in chunks delimited by the note onsets of its predominant
    # melody; analyze_sound returns one dict of features per chunk
    analyses.extend(analyze_sound(sound_path, audio_id=sound_id))

# Store analysis results in a new Pandas DataFrame and save it
df_source = pd.DataFrame(analyses)
df_source.to_csv(DATAFRAME_SOURCE_FILENAME)
print('Saved source DataFrame with {0} entries! {1}'.format(len(df_source), DATAFRAME_SOURCE_FILENAME))

display(df_source)  # Show DataFrame contents
df_source.describe()  # Show some statistics of numerical fields in the DataFrame

Analyzing sound with id 56199 [1/100]
Analyzing sound with id 56089 [2/100]
Analyzing sound with id 48684 [3/100]
Analyzing sound with id 337897 [4/100]
Analyzing sound with id 81807 [5/100]
Analyzing sound with id 508530 [6/100]
Analyzing sound with id 56025 [7/100]
Analyzing sound with id 55966 [8/100]
Analyzing sound with id 55889 [9/100]
Analyzing sound with id 55941 [10/100]
Analyzing sound with id 557469 [11/100]
Analyzing sound with id 92005 [12/100]
Analyzing sound with id 107319 [13/100]
Analyzing sound with id 197859 [14/100]
Analyzing sound with id 361565 [15/100]
Analyzing sound with id 333744 [16/100]
Analyzing sound with id 61928 [17/100]
Analyzing sound with id 332598 [18/100]
Analyzing sound with id 494670 [19/100]
Analyzing sound with id 494643 [20/100]
Analyzing sound with id 494667 [21/100]
Analyzing sound with id 502306 [22/100]
Analyzing sound with id 355814 [23/100]
Analyzing sound with id 494552 [24/100]
Analyzing sound with id 494669 [25/100]
Analyzing sound wit

## Analyze the target sound file

In [None]:
TARGET_SOUND_PATH = 'targets/short_V1bFr2SWP1I.wav'  # Filename of the target audio we'll reconstruct
DATAFRAME_TARGET_FILE_FILENAME = 'dataframe_target.csv'  # DataFrame file where to store the analysis results of the target audio

# Analyze the target audio file and store results in a new DataFrame
print('Analyzing target sound {0}'.format(TARGET_SOUND_PATH))

target_analysis = analyze_sound(TARGET_SOUND_PATH)
df_target = pd.DataFrame(target_analysis)
df_target.to_csv(DATAFRAME_TARGET_FILE_FILENAME)
print('Saved target dataframe with {0} entries! {1}'.format(len(df_target), DATAFRAME_TARGET_FILE_FILENAME))

# The rest of this cell repeats the pitch analysis on the target only to listen
# to and plot the intermediate signals. The extractor settings mirror the ones
# used in analyze_sound (frameSize=8820, hopSize=64) so that what is heard and
# plotted here corresponds to the chunks stored in df_target.
audio_path = TARGET_SOUND_PATH
hop_size = 64

# Load the target audio
audio = estd.MonoLoader(filename=audio_path, sampleRate=fs)()
display(Audio(audio, rate=fs))

# Use PitchMelodia from Essentia to analyse the pitch contour of the predominant
# melody and extract the pitch onset times
pitch_extractor = estd.PredominantPitchMelodia(frameSize=8820, hopSize=hop_size)
pitch_values, pitch_confidence = pitch_extractor(audio)

# Pitch is estimated on frames. Compute frame time positions.
pitch_times = np.linspace(0.0, len(audio) / fs, len(pitch_values))
pitch_samples = [int(round(t * fs)) for t in pitch_times]

# Generate a sine wave signal following the estimated pitch
print('Estimated pitch:')
synthesized_melody = pitch_contour(pitch_times, pitch_values, fs).astype(np.float32)[:len(audio)]
display(Audio(synthesized_melody, rate=fs))

# Segment the pitch contour into notes using the PitchContourSegmentation algorithm
# (only the onset times are needed here)
onset_times, _durations, _notes = estd.PitchContourSegmentation(
    hopSize=hop_size,
    minDuration=.6,
    pitchDistanceThreshold=30,
    rmsThreshold=-4
)(pitch_values, audio)

print('Original audio with onset time markers:')
marker = estd.AudioOnsetsMarker(onsets=onset_times, type='beep')
marked_audio = marker(audio)
display(Audio(marked_audio, rate=fs))

# The two signals may differ in length by a few samples; trim both to the
# common length so they can be summed
print('Mix of all signals')
mix_length = min(len(marked_audio), len(synthesized_melody))
display(Audio(marked_audio[:mix_length] * 0.5 + synthesized_melody[:mix_length] * 0.5, rate=fs))

# Plot the estimated pitch contour and confidence over time
f, axarr = plt.subplots(2, sharex=True)
axarr[0].plot(pitch_samples, pitch_values)
axarr[0].set_title('estimated pitch [Hz]')
axarr[1].plot(pitch_samples, pitch_confidence)
axarr[1].set_title('pitch confidence')
axarr[1].set_xlabel('sample')
f.set_figheight(5)
f.set_figwidth(15)

# Plot target audio file waveform and show ticks at the start samples of the pitches
plt.figure(figsize=(15,5))
plt.plot(audio)
# df_target has no columns at all when no chunk was found in the target
if 'start_sample' in df_target:
    plt.vlines(df_target['start_sample'].values, -1, 1, color='red')
plt.axis([0, len(audio), -1, 1])
plt.xlabel('sample')
plt.title('Target audio file')
plt.show()

display(df_target)
df_target.describe()