This notebook makes use of the analysis-resynthesis approach to predict the sound of new audio based on an existing corpus.

The example material is ~1 hour of my released music.

Goals for this notebook were:

1. Generate novel harmonic and textural material for composition from my existing material.
2. Examine if trends in the spectral analysis of the audio reflect conscious compositional intentions.

  (this was not explored as thoroughly, due to time constraints and long computation times)
3. Provide an easy open source framework for others to produce new sounds from existing bodies of work.

  (potential future goal - this project is inefficient in its current cloud state)

  Source Material:
  https://open.spotify.com/artist/5etGw4ubkJ9urWZ9OYBm7U?si=v8I70nj8Tjuglc9Y7QXktg
  Example output from notebook:
  https://drive.google.com/drive/folders/1jIcM5dzG6EVX038Tmd7WyRiE9l0or5ZY?usp=sharing

In [None]:
import librosa
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from IPython.display import Audio
import os
import math
import sklearn
from sklearn.cluster import KMeans
#from sklearn.decomposition import PC
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.svm import SVR #svr for non-linear, regression tasks
from sklearn.model_selection import train_test_split
from scipy.stats import norm
from scipy.signal import convolve
from scipy.io.wavfile import write
import scipy





Mounted at /content/drive


Use corpusPath to set audio folder location. Must be folder of wav files. Non-wav files will be skipped during read in. Set the desired length of predicted clips below

In [None]:
# Folder that holds the corpus audio files.
corpusPath = '/content/drive/Shareddrives/content/201-Final-Corpus-Lucian'

# List the folder once, so corpusList and corpusSize always describe the same
# directory snapshot (the folder used to be listed twice).
corpusList = os.listdir(corpusPath)
corpusSize = len(corpusList)

In [None]:
desiredClipLength = 5 #in seconds

THIS SECTION READS IN AUDIO AND FORMATS THE DATA FOR MANIPULATION

In [None]:
def updateSampRate(track):
  """Return the sample rate reported by librosa for one corpus file.

  track: index into corpusList of the file to inspect.
  """
  trackPath = corpusPath + '/' + corpusList[track]
  return librosa.get_samplerate(trackPath)

In [None]:
# Sample rate used by the rest of the notebook, read from the file at index 1 of corpusList.
# NOTE(review): index 1 is simply the second entry os.listdir returned — confirm it is
# an audio file that is representative of the whole corpus.
SAMPLERATE = updateSampRate(1)

In [None]:
def setSliceLength(seconds):
  """Convert a duration in seconds into a number of samples at SAMPLERATE."""
  samplesPerSlice = SAMPLERATE * seconds
  return samplesPerSlice

In [None]:
# Number of samples in each clip the corpus is cut into: 10 seconds' worth at SAMPLERATE.
# NOTE(review): storeTracks calls librosa.load without sr, which resamples to librosa's
# default rate; if that differs from SAMPLERATE each slice lasts longer than 10 s — confirm.
sliceLength = setSliceLength(10)
sliceLength

441000

In [None]:
def storeTracks(corpusList, corpusPath):
  """Load every wav file of the corpus and cut the audio into fixed-length clips.

  corpusList: file names found in the corpus folder.
  corpusPath: path of the corpus folder.

  Returns a 2D array of shape (numClips, sliceLength). All tracks are joined
  end to end before slicing, so a clip may span the boundary between two
  tracks, and the samples left after the last full clip are dropped.
  Also prints the shape and type of the result.
  """
  loadedTracks = []
  for fileName in corpusList:
    # Test the extension only: a substring test also matched names such as
    # 'x.wav.asd' and any file inside a folder whose path contains '.wav'.
    if not fileName.lower().endswith('.wav'):
      continue
    # librosa.load is called without sr, so the audio comes back at librosa's
    # default rate; the returned rate is not needed here.
    data, _ = librosa.load(corpusPath + "/" + fileName)
    loadedTracks.append(data)

  # Join whole tracks at once instead of appending one sample at a time.
  if loadedTracks:
    samplesArray = np.concatenate(loadedTracks)
  else:
    samplesArray = np.zeros(0, dtype=np.float32)

  numClips = len(samplesArray) // sliceLength
  #information is shaved off here for reshaping, but could instead be padded to keep all info
  flatClipsArray = np.reshape(samplesArray[:numClips * sliceLength], (numClips, sliceLength))
  print (flatClipsArray.shape, type(flatClipsArray))
  return flatClipsArray







In [None]:
clipsArray = storeTracks(corpusList, corpusPath)

(194, 441000) <class 'numpy.ndarray'>


THIS SECTION IS MAKING A FEATURE VECTOR OF ALL DATA

Function below makes a feature vector for each clip.

In [None]:
def makeClipFeatureVector(inputClip):
  """Build the feature matrix of one audio clip.

  Rows, in order: spectral centroid, spectral flatness, spectral rolloff,
  zero crossing rate, RMS, then one row per MFCC coefficient.

  Returns a 2D array with one row per feature.
  """
  # NOTE(review): only spectral_centroid is given sr=SAMPLERATE; the other
  # features use librosa's default sr — confirm this is intended.
  featureRows = [
    librosa.feature.spectral_centroid(y=inputClip, sr=SAMPLERATE),
    librosa.feature.spectral_flatness(y=inputClip),
    librosa.feature.spectral_rolloff(y=inputClip),
    librosa.feature.zero_crossing_rate(inputClip),
    librosa.feature.rms(y=inputClip),
  ]

  # give each MFCC coefficient its own (1, frames) row, like the features above
  coefficients = librosa.feature.mfcc(y=inputClip)
  featureRows.extend(row[np.newaxis, :] for row in coefficients)

  stacked = np.asarray(featureRows)  # (features, 1, frames)
  return stacked.reshape(stacked.shape[0], stacked.shape[2])


Function below uses makeClipFeatureVector function to create feature vectors for every clip, store them in a list, and then converts that to an array.

In [None]:
def makeClipListVectors(allClips):
  """Compute the feature matrix of every clip in allClips.

  allClips: iterable of 1D audio clips (for example the rows of the clips array).

  Returns an array holding one feature matrix per clip, stacked along axis 0.
  """
  # Read the clips from the argument itself: the global clipsArray was indexed
  # before, so any other input was silently ignored.
  return np.asarray([makeClipFeatureVector(clip) for clip in allClips])


In [None]:
allClipsFeatureVectors = makeClipListVectors(clipsArray)

In [None]:
allClipsFeatureVectors.shape

(194, 25, 862)

Creates clusters of all clips, stores in a list. Clustering is based on above feature vector

In [None]:
#reshape all feature vectors to be 2d for clustering
reshapedX = np.reshape((allClipsFeatureVectors), (allClipsFeatureVectors.shape[0],allClipsFeatureVectors.shape[1]*allClipsFeatureVectors.shape[2]))

In [None]:
kmeans = KMeans(n_clusters=30, random_state=42) #n_clusters = number of clusters to form (the desired clip groups)
#TODO: use silhouette analysis to find the ideal number of clusters

In [None]:
MainCluster = kmeans.fit_predict(reshapedX)

In [None]:
def makeClusterList(inputCluster):
  """Group clip indices by cluster label.

  inputCluster: array of cluster labels, one per clip.

  Returns a list with one entry per cluster of the global kmeans model; each
  entry is the tuple np.where produces for that label.
  """
  return [np.where(inputCluster == label) for label in range(kmeans.n_clusters)]

In [None]:
clusterList = makeClusterList(MainCluster)

Playback audio of a specific cluster

In [None]:
def arrayFromCluster(cluster):
  """Join every clip of a cluster into one continuous 1D audio array.

  cluster: clip indices into clipsArray, given either as an entry of
    clusterList (the tuple returned by np.where) or as a plain sequence or
    array of indices.

  Returns the selected clips laid end to end; also prints the result's shape.
  """
  # Flatten both accepted forms into one 1D index array. Only the np.where
  # tuple worked before; a plain index sequence raised an IndexError.
  parts = [np.atleast_1d(part) for part in cluster]
  clipIndices = np.concatenate(parts) if parts else np.zeros(0, dtype=int)

  reshapedStack = clipsArray[clipIndices].reshape(-1)
  print(reshapedStack.shape)
  return reshapedStack

def playClusterClips(array):
  """Return an IPython Audio player for array, played at half of SAMPLERATE."""
  # NOTE(review): the halved rate presumably compensates for storeTracks loading
  # audio at librosa.load's default sample rate instead of the native one — confirm.
  return Audio(data=array, rate=SAMPLERATE/2)


ANALYSIS / RESYNTHESIS

First, we take in a cluster of clips and compute their FFTs. We create a new array with all the stored FFT information, organized by clip (this is for training purposes, to avoid one continuous time series).



In [None]:
def storeClusterFFT(cluster):
  """Compute the STFT magnitudes of every clip in a cluster.

  cluster: clip indices into clipsArray, given either as an entry of
    clusterList (the tuple returned by np.where) or as a plain sequence or
    array of indices.

  Returns an array with one magnitude spectrogram per clip, stacked along
  axis 0; each spectrogram is the absolute value of librosa.stft of the clip.
  """
  # Flatten both accepted forms into one 1D index array. Only the np.where
  # tuple worked before; a plain index sequence raised an IndexError.
  parts = [np.atleast_1d(part) for part in cluster]
  clipIndices = np.concatenate(parts) if parts else np.zeros(0, dtype=int)

  # spectrograms are kept as returned by librosa.stft (not transposed)
  return np.asarray([np.abs(librosa.stft(clipsArray[clipNum])) for clipNum in clipIndices])

In [None]:
newClusterFFTTest = storeClusterFFT(clusterList[5])


In [None]:
newClusterFFTTest6 = storeClusterFFT(clusterList[8])

Next, we train a set of models for the cluster. This takes several minutes, as the settings are tuned precisely to capture interesting spectral evolution, rather than a more vague spectral profile.

We are using Support Vector Regression to model non-linear relationships. There is one model per frequency bin, trained on that bin's time series of magnitudes from every clip, resulting in as many models as there are bins. I took a lookback / windowing approach to account for time. The alternative was a sort of neural net approach that I found ineffective.

In [None]:
def trainBinModels(FFTArray, lookback=50, C=20.0):
  """Train one SVR per frequency bin to predict a magnitude from the ones before it.

  FFTArray: STFT magnitudes of shape (clips, bins, time steps).
  lookback: number of consecutive past time steps used as input for each
    predicted value.
  C: passed to SVR; controls how flexible SVR is when fitting the data.

  Returns a list holding one fitted SVR per frequency bin.
  Raises ValueError if the clips have no more than lookback time steps, since
  no training example can be built in that case.
  """
  #lookback=50 covers ~1.15s of history, which is good for smooth spectral evolution
  #A lookback=100 covers ~2.3s, making it better for longer-term spectral shaping
  FFTArray = np.asarray(FFTArray)
  nClips, nBins, nTimeSteps = FFTArray.shape
  if nTimeSteps <= lookback:
    raise ValueError(
      "each clip needs more than lookback=%d time steps, got %d" % (lookback, nTimeSteps)
    )

  # Every run of lookback + 1 consecutive steps, for all clips and bins at once.
  # This is a view on FFTArray of shape (clips, bins, windows, lookback + 1):
  # the first lookback values of a window are the input, the last is the target.
  windows = np.lib.stride_tricks.sliding_window_view(FFTArray, lookback + 1, axis=2)

  modelsList = []
  for binIndex in tqdm(range(nBins)):  # Train one model per frequency bin
    # windows of every clip for this bin, clip after clip
    binWindows = windows[:, binIndex].reshape(-1, lookback + 1)
    X_train = binWindows[:, :lookback]
    y_train = binWindows[:, lookback]

    #epsilon deals with microvariation. the lower the more precise
    model = SVR(kernel='rbf', C=C, epsilon=0.001)
    model.fit(X_train, y_train)
    modelsList.append(model)
  return modelsList



We use this to calculate how long our predicted clips will be (seconds -> timesteps)

In [None]:
def calcTimeSteps(goal, hopLength=512):
  """Convert a clip length in seconds into a number of STFT time steps.

  goal: desired clip length in seconds.
  hopLength: number of samples between consecutive STFT frames. The default
    of 512 is the value that was hard-coded here before; change it only if
    the STFT is computed with a different hop length.

  Returns the number of time steps, rounded down.
  """
  # audio is treated as running at half of SAMPLERATE, as in the playback
  # and resynthesis functions
  return int((goal * (SAMPLERATE/2)) / hopLength)


In [None]:
nFutureTimeSteps = calcTimeSteps(desiredClipLength) #currently set to 5 seconds
print(nFutureTimeSteps)

215


In [None]:
trainedBinModels = trainBinModels(newClusterFFTTest6)

  3%|▎         | 31/1025 [08:22<4:28:17, 16.19s/it]


KeyboardInterrupt: 

This function takes our initial cluster array, the trained models, and our desired clip length and produces arrays of predicted ffts. These are then used for resynthesis.

In [None]:
def predictFromBinModels(FFTArray, models, nFutureSteps=50, lookback=50):
  """Extend every bin of every clip by recursive one-step-ahead prediction.

  FFTArray: STFT magnitudes of shape (clips, bins, time steps).
  models: one fitted regressor per frequency bin; each must accept a
    (1, lookback) array in predict(), so lookback has to match the value
    used for training.
  nFutureSteps: number of time steps to predict for each bin.
  lookback: number of most recent values fed to the model at each step.

  Returns an array of shape (clips, bins, nFutureSteps) holding only the
  predicted values (the input history is not included).
  """
  FFTArray = np.asarray(FFTArray)
  nClips, nBins, nTimeSteps = FFTArray.shape
  # Only the predicted steps are returned, so only those are allocated.
  predictionsArray = np.zeros((nClips, nBins, nFutureSteps))

  #loop through clips
  for i in tqdm(range(nClips), position=0, leave=True):
    for binIndex in range(nBins):
      # Use the models that were passed in: the global trainedBinModels was
      # read here before, so the models argument had no effect.
      currentModel = models[binIndex]

      # seed the window with the last real values (np.array copies them, so
      # writing predictions into the window never touches FFTArray)
      lastSeq = np.array(FFTArray[i, binIndex, -lookback:]).reshape(1, -1)

      for n in range(nFutureSteps):
        predictedNextValue = currentModel.predict(lastSeq)[0]
        predictionsArray[i, binIndex, n] = predictedNextValue
        # slide the window one step and feed the prediction back in
        lastSeq = np.roll(lastSeq, -1, axis=1)
        lastSeq[0, -1] = predictedNextValue
  return predictionsArray

In [None]:
testPrediction = predictFromBinModels(newClusterFFTTest6, trainedBinModels, nFutureTimeSteps)

This section takes the n predicted FFT arrays (based on number of clips in the cluster), and convolves them using a Gaussian convolution kernel. This is done rather than using another machine learning model, since this process is already very slow. For the first few examples, I used this convolution approach.
After the first few clusters, I realized that I preferred to have separate clips, as they already contained a significant amount of spectral information - so I stopped using convolution.

In [None]:
#create a gaussian kernel for convolution

def gaussian_kernel(size, std):
    """Return a normalised 1D Gaussian kernel.

    size: number of taps (at least 1).
    std: standard deviation of the Gaussian, in taps (greater than 0).

    The taps are spaced symmetrically around zero and sum to 1.
    Raises ValueError if size < 1 or std <= 0.
    """
    if size < 1:
        raise ValueError("size must be at least 1")
    if std <= 0:
        raise ValueError("std must be greater than 0")

    # -size//2 parses as (-size)//2, which rounds away from zero and skewed the
    # grid for odd sizes (size=3 gave [-2, -0.5, 1]); negate after dividing.
    halfWidth = size // 2
    x = np.linspace(-halfWidth, halfWidth, size)
    kernel = norm.pdf(x, scale=std)
    return kernel / kernel.sum()  # Normalize

# 3point Gaussian kernel
gKernel = gaussian_kernel(3, 0.5).reshape(3, 1, 1) # need to update this so it is n clips

In [None]:
def convolveClipsPrediction(prediction, kernel):
  """Blend the predicted spectrograms of a cluster by convolving across clips.

  prediction: array whose first axis indexes clips, e.g. (clips, bins, time steps).
  kernel: weights applied along the clip axis, given either as a 1D sequence
    or already shaped (n, 1, 1). None means an equal-weight average of all
    clips.

  Returns the first output of a 'valid' convolution along the clip axis. With
  as many weights as clips, that is the single weighted blend of all clips.
  Raises ValueError if prediction contains no clips.
  """
  prediction = np.asarray(prediction)
  nClips = prediction.shape[0]
  if nClips == 0:
    raise ValueError("prediction contains no clips")

  if kernel is None:
    kernel = np.full(nClips, 1.0 / nClips)
  # The kernel argument used to be overwritten by a hard-coded 6-clip box
  # filter, so it had no effect and any cluster with fewer than 6 clips failed.
  # Shape the weights so they vary along the clip axis only.
  kernel = np.asarray(kernel, dtype=float).reshape((-1,) + (1,) * (prediction.ndim - 1))

  return convolve(prediction, kernel, mode="valid")[0]

In [None]:
#convolved = convolveClipsPrediction(testPrediction, gKernel)

This function optionally makes predictions based on the entire corpus. I am not using it but it would be useful on a smaller corpus. On my hour long corpus, this would take hours if not days.

In [None]:
# Collects the predicted spectrogram of every clip of every processed cluster.
mainPredictionList = []
def createPredictionsList(clusterList, nClips):
  """Train models and predict for every cluster, collecting each predicted clip.

  clusterList: list of clusters, each in the form storeClusterFFT accepts.
  nClips: unused; kept so existing calls keep working.

  Appends every predicted clip to the module-level mainPredictionList and
  returns that list.
  """
  # clusterList is a plain Python list and has no .shape, so iterate it directly.
  for cluster in clusterList:
    clusterFFT = storeClusterFFT(cluster)
    if len(clusterFFT) == 0:
      continue  # nothing to train on for an empty cluster
    currentModel = trainBinModels(clusterFFT)
    prediction = predictFromBinModels(clusterFFT, currentModel, nFutureTimeSteps)
    # one entry per clip of this cluster
    mainPredictionList.extend(prediction)
  return mainPredictionList


The code below takes a predicted FFT array and resynthesizes it via ISTFT. It also writes the audio to a new wav file.

In [None]:
#used to index into single clip of a predicted cluster
def resynthSinglePrediction(prediction):
  """Resynthesise one predicted spectrogram, save it as new.wav and return a player.

  prediction: spectrogram passed directly to librosa.istft.

  Returns an IPython Audio player for the resynthesised signal.
  """
  outputRate = SAMPLERATE/2
  audio = librosa.istft(prediction)
  write('new.wav', int(outputRate), audio)
  return Audio(data=audio, rate=outputRate)

In [None]:
#used to read the whole predicted cluster as a longer audio clip
def resynthPredictionLong(prediction):
  """Resynthesise all predicted clips of a cluster as one long audio clip.

  prediction: array of shape (clips, bins, time steps).

  The spectrograms of the clips are joined along the time axis, inverted with
  librosa.istft, written to new.wav and returned as an IPython Audio player.
  """
  # Lay the clips one after another in time:
  # (clips, bins, steps) -> (bins, clips * steps).
  # The loop that was here was not valid Python (missing colon) and read from
  # the very list it was filling.
  predict = np.concatenate(list(prediction), axis=1)
  data = librosa.istft(predict)
  write('new.wav', int(SAMPLERATE/2), data)
  return Audio(data=data, rate=SAMPLERATE/2)

In [None]:
resynthPredictionLong(testPrediction)

This function returns audio of the entire prediction list if it is needed.

In [None]:

def resynthAllPredictions(predictionList):
  """Resynthesise every prediction in predictionList.

  predictionList: sequence of predicted spectrograms, one per clip.

  Returns the list of Audio players, one per prediction, in input order
  (the players were collected before but never returned).
  """
  # NOTE: resynthSinglePrediction writes every result to the same new.wav, so
  # only the last prediction remains on disk after this loop.
  return [resynthSinglePrediction(prediction) for prediction in predictionList]
