<a href="https://colab.research.google.com/github/perezrmaria/AMPLab2023-Streamlit/blob/main/Task_2__Audio_Content_based_playlists_Mar%C3%ADa_P%C3%A9rez.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2: Audio Content based playlists
In this task we will create a system for generating music playlists based on the results of audio content analysis. We will use the MusAV dataset as a music audio collection, extract music descriptors using Essentia, and create a simple user interface to generate playlists based on these descriptors.


In [None]:
# Colab setup cell: make the Essentia packages available in the runtime.
# The base package is installed only when it cannot be found in the environment.
import importlib.util
if importlib.util.find_spec('essentia') is None:
    !pip install essentia

# NOTE(review): the three commands below run unconditionally. `essentia.tensorflow`
# is presumably what provides the TensorflowPredict* algorithms imported in this
# cell; the two `essentia` commands repeat the guarded install above (the saved
# log shows only `essentia` and `essentia.tensorflow` being downloaded).
# Consider pinning versions and dropping the duplicates.
!pip install essentia.tensorflow
!pip install essentia
!pip install --upgrade essentia

import essentia as ess
from essentia.standard import MonoLoader
from essentia.standard import TensorflowPredictMusiCNN
from essentia.standard import TensorflowPredict2D
from essentia.standard import TensorflowPredictEffnetDiscogs


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting essentia
  Downloading essentia-2.1b6.dev858-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: essentia
Successfully installed essentia-2.1b6.dev858
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting essentia.tensorflow
  Downloading essentia_tensorflow-2.1b6.dev858-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (291.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.4/291.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: essentia.tensorflow
Successfully installed essentia.tensorflow-2.1b6.dev858
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#Basic imports
import os
import matplotlib.pyplot as plt
import numpy as np

# Imports 
import essentia.standard as ess
import pandas as pd
import json
import pickle
import csv
import itertools
from itertools import islice

In [None]:
# Mount Google Drive: the dataset and the model files used below are read from /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# tqdm provides the progress bars wrapped around the long-running loops of this notebook.
!pip install tqdm
from tqdm import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Music collection

MusAV contains a variety of music, with 2,092 30-second track previews covering 1,404 genres.

Download the MusAV dataset from Google Drive. 

The `audio_chunks` subfolder contains the audio tracks we will use as our music collection. 

## Organizing the whole dataset in a dictionary

To ensure reproducibility, I use `os.walk()`, which recursively iterates through all subdirectories of a folder and, on each iteration, returns three values: the path of the current folder, a list with the names of its subfolders, and a list with the names of its files.

In [None]:
# Build a dictionary that maps every top-level folder of the dataset (key) to a
# list with the paths of all the files found anywhere below that folder.
ruta_principal = '/content/drive/MyDrive/MusAV/audio_chunks'
mp3_files = {}

# sorted() makes the traversal independent of the order in which the filesystem
# happens to list the entries, so repeated runs produce identical lists.
for carpeta in tqdm(sorted(os.listdir(ruta_principal))):
    ruta_carpeta = os.path.join(ruta_principal, carpeta)
    if not os.path.isdir(ruta_carpeta):
        continue
    print('Carpeta encontrada:', carpeta)
    for root, dirs, files in os.walk(ruta_carpeta):
        dirs.sort()  # in top-down mode os.walk visits subfolders in the order left in `dirs`
        for file0 in sorted(files):
            # Create the folder's list on first use, then append each path exactly once.
            mp3_files.setdefault(carpeta, []).append(os.path.join(root, file0))



  0%|          | 0/7 [00:00<?, ?it/s]

Carpeta encontrada: audio.001


 14%|█▍        | 1/7 [00:27<02:45, 27.65s/it]

Carpeta encontrada: audio.000


 29%|██▊       | 2/7 [00:50<02:02, 24.54s/it]

Carpeta encontrada: audio.006


 43%|████▎     | 3/7 [01:11<01:31, 22.97s/it]

Carpeta encontrada: audio.003


 57%|█████▋    | 4/7 [01:32<01:07, 22.41s/it]

Carpeta encontrada: audio.005


 71%|███████▏  | 5/7 [01:54<00:44, 22.21s/it]

Carpeta encontrada: audio.004


 86%|████████▌ | 6/7 [02:15<00:21, 21.91s/it]

Carpeta encontrada: audio.002


100%|██████████| 7/7 [02:37<00:00, 22.43s/it]


In [None]:
# Inspect the folder -> file paths mapping (prints every collected path).
print(mp3_files)

## Features extraction

In [None]:
# `df` is only declared here; the per-track results are collected in `dic`.
df=pd.DataFrame(columns=['Audio file', 'Tempo', 'Music_style','Voice','Instrumental','Danceability', 'Arousal', 'Valence'])

# Maps the path of each audio file to a dictionary with its extracted descriptors.
dic = {}

# Define the models
# Genre model and the list of class names stored in its metadata JSON
model_genre = TensorflowPredictEffnetDiscogs(graphFilename="/content/drive/MyDrive/ASPLab/discogs-effnet-bs64-1.pb")
genres_json = '/content/drive/MyDrive/ASPLab/discogs-effnet-bs64-1.json'
with open(genres_json, 'r') as f:
  genres_dict = json.load(f)
genres_list = genres_dict['classes']

# Voice model
model_voice = TensorflowPredictMusiCNN(graphFilename='/content/drive/MyDrive/ASPLab/voice_instrumental-musicnn-mtt-2.pb')

# Arousal and valence: an embeddings model followed by the arousal/valence model
embeddings_model = ess.TensorflowPredictMusiCNN(graphFilename = '/content/drive/MyDrive/ASPLab/msd-musicnn-1.pb', output = 'model/dense/BiasAdd')
modelAV = ess.TensorflowPredict2D(graphFilename = '/content/drive/MyDrive/ASPLab/emomusic-musicnn-msd-2.pb', output = 'model/Identity')

# Process every file of each dataset folder
for folder_files in tqdm(mp3_files.values()):
  for file0 in tqdm(folder_files):
    # Tempo and danceability are computed on audio loaded at 44.1 kHz
    audio = ess.MonoLoader(filename=file0, sampleRate=44100)()

    # Tempo: the first value returned by the rhythm extractor
    extractor = ess.RhythmExtractor2013()(audio)
    bpm = extractor[0]

    # Danceability
    danceability, dfa = ess.Danceability()(audio)

    # The TensorFlow models below are fed audio loaded at 16 kHz
    audio = ess.MonoLoader(filename=file0, sampleRate=16000)()

    # Genre: class with the highest activation after averaging over the first axis
    activations = model_genre(audio)
    genre_int = np.argmax(np.mean(activations, axis=0))
    genre = genres_list[genre_int].replace(",", "_")

    # Voice/instrumental: compare the two averaged activations
    label = model_voice(audio)
    mean_label = np.mean(label, axis=0)
    instrumental = mean_label[0]
    voice = mean_label[1]
    novoice = 1 if instrumental > voice else 0  # novoice=1 (instrumental) novoice=0 (voice)

    # Arousal & Valence
    # NOTE(review): index 0 is stored as arousal and index 1 as valence; confirm
    # this order against the `classes` field of the model's metadata JSON.
    embeddings = embeddings_model(audio)
    activations = modelAV(embeddings)
    mean_arousal_valence = np.mean(activations, axis=0)
    arousal = mean_arousal_valence[0]
    valence = mean_arousal_valence[1]

    # Every descriptor except the genre label is stored as a string
    values = {'Tempo': str(bpm), 'Music style': genre, 'Instrumental': str(novoice), 'Danceability': str(danceability), 'Arousal': str(arousal), 'Valence': str(valence)}
    dic[file0] = values

  # Checkpoint once per folder: the file always holds the complete `dic`, so
  # rewriting it after every single track only repeats the same work.
  with open('/content/drive/MyDrive/ASPLab/prueba5filas.json', 'w') as f:
    json.dump(dic, f)



  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/301 [00:00<?, ?it/s][A
  0%|          | 1/301 [00:03<16:29,  3.30s/it][A
  1%|          | 2/301 [00:06<16:06,  3.23s/it][A
  1%|          | 3/301 [00:13<25:40,  5.17s/it][A
  1%|▏         | 4/301 [00:20<27:31,  5.56s/it][A
  2%|▏         | 5/301 [00:25<27:04,  5.49s/it][A
  2%|▏         | 6/301 [00:28<23:04,  4.69s/it][A
  2%|▏         | 7/301 [00:31<20:31,  4.19s/it][A
  3%|▎         | 8/301 [00:35<19:05,  3.91s/it][A
  3%|▎         | 9/301 [00:39<19:14,  3.95s/it][A
  3%|▎         | 10/301 [00:42<17:52,  3.69s/it][A
  4%|▎         | 11/301 [00:45<17:05,  3.53s/it][A
  4%|▍         | 12/301 [00:48<16:38,  3.46s/it][A
  4%|▍         | 13/301 [00:52<17:42,  3.69s/it][A
  5%|▍         | 14/301 [00:56<16:54,  3.53s/it][A
  5%|▍         | 15/301 [00:59<16:21,  3.43s/it][A
  5%|▌         | 16/301 [01:02<16:08,  3.40s/it][A
  6%|▌         | 17/301 [01:06<17:17,  3.65s/it][A
  6%|▌         | 18/301 [01:10<16:40,  3.54s

In [None]:
# Number of tracks for which descriptors were stored
len(dic)

2100

In [None]:
# Collect the path of every file below the audio.000 folder; inside each visited
# directory the file names are taken in sorted order.
folder_audio_chunk0 = '/content/drive/MyDrive/MusAV/audio_chunks/audio.000'
files0 = []

for root, dirs, files in os.walk(folder_audio_chunk0):
    files0.extend(os.path.join(root, nombre) for nombre in sorted(files))
print(files0)

## Formatting the lines in order to use them afterwards with Streamlit
From `.json` to `.jsonl`, and from `.jsonl` to `.jsonl.pickle`.

In [None]:
# Re-serialise the descriptors JSON as a .jsonl file: every line of the input is
# parsed as JSON and written back followed by a newline.
# NOTE(review): the input file is produced by a single json.dump call in the
# feature-extraction cell, so this presumably yields ONE output line holding the
# whole {path: descriptors} mapping rather than one record per track — confirm
# that this is the layout the Streamlit app expects.
with open('/content/drive/MyDrive/ASPLab/prueba5filas.json', 'r') as infile, open('/content/drive/MyDrive/ASPLab/data.jsonl', 'w') as outfile:
    for line in infile:
        data = json.loads(line)
        json.dump(data, outfile)
        outfile.write('\n')

In [None]:
# Parse the .jsonl file: one JSON value per line, collected in a list
with open('/content/drive/MyDrive/ASPLab/data.jsonl', 'r') as jsonl_file:
    data = []
    for raw_line in jsonl_file:
        data.append(json.loads(raw_line))

# Store the parsed list in a .jsonl.pickle file
with open('/content/drive/MyDrive/ASPLab/data.jsonl.pickle', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)

## From json to pickle

In [None]:
import json
import pickle

# Read the JSON file and decode its content
with open('/content/drive/MyDrive/ASPLab/prueba5filas.json', 'r') as json_file:
    datos = json.loads(json_file.read())

# Store the decoded object in a pickle file
with open('/content/drive/MyDrive/ASPLab/data.json.pickle', 'wb') as pickle_file:
    pickle.dump(datos, pickle_file)


## Change format for StreamLit

In [None]:
# Flatten the per-folder lists into a single list of unique paths.
# sorted() gives the result a stable order: a set has no defined iteration
# order, so the list (and the file written from it) could differ between runs.
all_files = sorted({ruta for lista in mp3_files.values() for ruta in lista})

In [None]:
# Keep only the last three components of every path
# (e.g. audio.000/2Y/<track>.mp3) and print each shortened path.
all_files_ruta = ["/".join(ruta.split("/")[-3:]) for ruta in all_files]
for ruta_relativa in all_files_ruta:
  print(ruta_relativa)

In [None]:
# Write the shortened paths to the file list, one per line
with open("/content/drive/MyDrive/ASPLab/files_audio1", "w") as archivo:
    archivo.writelines(elemento + "\n" for elemento in all_files_ruta)

# Feature extraction for one song
(Trials before doing the whole dataset)

## Tempo

In [None]:
# Trial on a single track: the first path collected from the audio.000 folder.
files=files0[0]
print(files)

# Load the track and run the rhythm extractor; the first element of its output is kept as the BPM.
audio = ess.MonoLoader(filename=files)()
rhythmExtractor = ess.RhythmExtractor2013()
extractor = rhythmExtractor(audio)
bpm = extractor[0]

/content/drive/MyDrive/MusAV/audio_chunks/audio.000/2Y/2YrzSi5dVnH5wDS06nZJyZ.mp3


In [None]:
# Show the descriptor table.
# NOTE(review): the saved output lists columns ('Voice/instrumental',
# 'Arousal and valence') that differ from the `df` declared in the
# feature-extraction cell, so it presumably comes from an earlier kernel
# state — re-run from a fresh kernel to confirm.
display(df)

Unnamed: 0,Audio file,Tempo,Music_style,Voice/instrumental,Danceability,Arousal and valence
0,/content/drive/MyDrive/MusAV/audio_chunks/audi...,95.803223,,,,
1,/content/drive/MyDrive/MusAV/audio_chunks/audi...,161.835098,,,,
2,/content/drive/MyDrive/MusAV/audio_chunks/audi...,96.017174,,,,
3,/content/drive/MyDrive/MusAV/audio_chunks/audi...,137.198792,,,,
4,/content/drive/MyDrive/MusAV/audio_chunks/audi...,77.570518,,,,
...,...,...,...,...,...,...
295,/content/drive/MyDrive/MusAV/audio_chunks/audi...,142.231995,,,,
296,/content/drive/MyDrive/MusAV/audio_chunks/audi...,118.589317,,,,
297,/content/drive/MyDrive/MusAV/audio_chunks/audi...,184.570251,,,,
298,/content/drive/MyDrive/MusAV/audio_chunks/audi...,128.062714,,,,


## Music style

In [None]:
# Audio player for listening to the trial track (first path in files0).
from IPython.display import Audio
files=files0[0]
Audio(files)

In [None]:
def musicstyle(audio, modelStyle,styles):
    """Return the label of the style with the highest average activation.

    Args:
        audio: input passed unchanged to ``modelStyle``.
        modelStyle: callable that maps ``audio`` to an array of activations.
        styles: sequence of style labels indexed by the position of the winner.

    Returns:
        The winning label with every "," replaced by "_".
    """
    # Average over the first axis, then locate the largest averaged activation.
    averaged = np.mean(modelStyle(audio), axis=0)
    winner = np.argmax(averaged)
    label = styles[winner]
    return label.replace(",", "_")

In [None]:
# Trial: predict the music style of a single track with the genre model.

files=files0[0]
print(files)

# Load the track at 16 kHz, the sample rate at which the full-dataset extraction
# loop feeds this model (see the Essentia models documentation for its input format).
audio = MonoLoader(filename=files, sampleRate=16000)()
model_genre = TensorflowPredictEffnetDiscogs(graphFilename="/content/drive/MyDrive/ASPLab/discogs-effnet-bs64-1.pb")
genres_json = '/content/drive/MyDrive/ASPLab/discogs-effnet-bs64-1.json'

# Class names come from the model's metadata JSON
with open(genres_json, 'r') as f:
  genres_dict = json.load(f)
genres_list = genres_dict['classes']

# Average the activations over the first axis and keep the class with the largest mean
activations = model_genre(audio)
genre_int = np.argmax(np.mean(activations, axis=0))
genres_list[genre_int]


/content/drive/MyDrive/MusAV/audio_chunks/audio.000/2Y/2YrzSi5dVnH5wDS06nZJyZ.mp3


'Electronic---Vaporwave'

In [None]:
# Shape of the activations returned by the last model call
activations.shape

(29, 400)

In [None]:
# Class names loaded from the genre model's metadata JSON
print(genres_list)

## Voice/instrumental

In [None]:
# Trial: voice / instrumental classification of a single track.
# The classifier is fed 16 kHz audio, as in the full-dataset extraction loop.
audio = ess.MonoLoader(filename=files, sampleRate=16000)()
model = TensorflowPredictMusiCNN(graphFilename='/content/drive/MyDrive/ASPLab/voice_instrumental-musicnn-mtt-2.pb')

# Average the activations over the first axis into a single row
label = model(audio)
mean_label = np.mean(label, axis=0, keepdims=True)
print(mean_label)
print(mean_label.shape)

# Decide by comparing the two averaged activations (index 0: instrumental,
# index 1: voice — the same rule and order the full-dataset loop applies).
# `label.all()` is a single boolean, so it cannot be compared with a
# probability threshold to make this decision.
instrumental, voice = mean_label[0]
if instrumental > voice:
    print('The audio file is instrumental music.')
else:
    print('The audio file is music with vocals.')

This is [[0.64534926 0.3957026 ]]
(1, 2)
The audio file is instrumental music.


In [None]:
# Trial: voice / instrumental classification of a second track.
# (The player for this track is shown by the separate `Audio(files1)` cell; an
# Audio(...) expression in the middle of a cell displays nothing.)
files1=files0[3]
print(files1)

# The classifier is fed 16 kHz audio, as in the full-dataset extraction loop.
audio = ess.MonoLoader(filename=files1, sampleRate=16000)()
model = TensorflowPredictMusiCNN(graphFilename='/content/drive/MyDrive/ASPLab/voice_instrumental-musicnn-mtt-2.pb')

# Average the activations over the first axis into a single row
label = model(audio)
mean_label = np.mean(label, axis=0, keepdims=True)
print(mean_label)
print(mean_label.shape)

# Decide by comparing the two averaged activations (index 0: instrumental,
# index 1: voice — the same rule and order the full-dataset loop applies).
# `label.all()` is a single boolean, so it cannot be compared with a
# probability threshold to make this decision.
instrumental, voice = mean_label[0]
if instrumental > voice:
    print('The audio file is instrumental music.')
else:
    print('The audio file is music with vocals.')

/content/drive/MyDrive/MusAV/audio_chunks/audio.000/7G/7GSJVYWU0kP9QWFTxxNY6d.mp3
[[0.8474238  0.21642978]]
(1, 2)
The audio file is instrumental music.


In [None]:
# Audio player for the second trial track
Audio(files1)

## Danceability

In [None]:
#import essentia.standard as es
audio = ess.MonoLoader(filename=files,sampleRate=44100)()

# Compute the danceability

danceability, dfa = ess.Danceability()(audio)#danceability = danceability_extractor(beats)
danceability
# Print the danceability
print('Danceability:', danceability)

Danceability: 1.342718482017517


## Arousal and Valence

In [None]:
# Trial: arousal and valence of a single track.

files=files0[0]
print(files)

# The sample rate is set explicitly to 16 kHz, the rate at which the
# full-dataset extraction loop feeds these two models.
audio = ess.MonoLoader(filename=files, sampleRate=16000)()
embeddings_model = ess.TensorflowPredictMusiCNN(graphFilename = '/content/drive/MyDrive/ASPLab/msd-musicnn-1.pb', output = 'model/dense/BiasAdd')

modelAV = ess.TensorflowPredict2D(graphFilename = '/content/drive/MyDrive/ASPLab/emomusic-musicnn-msd-2.pb', output = 'model/Identity')

# Embeddings first, then the arousal/valence model on top of them
embeddings = embeddings_model(audio)
activations = modelAV(embeddings)

# Average over the first axis.
# NOTE(review): index 0 is reported as arousal and index 1 as valence; confirm
# this order against the `classes` field of the model's metadata JSON.
mean_arousal_valence = np.mean(activations, axis=0)
arousal = mean_arousal_valence[0]
valence = mean_arousal_valence[1]
print('Arousal: ',arousal)
print('Valence: ',valence)


/content/drive/MyDrive/MusAV/audio_chunks/audio.000/2Y/2YrzSi5dVnH5wDS06nZJyZ.mp3
Arousal:  2.982451
Valence:  5.033845
