                                            Extracting audio features and saving to mongoDB

In [60]:
import os
import librosa
import numpy as np
import pymongo

# Function to extract audio features
def extract_features(file_path):
    try:
        y, sr = librosa.load(file_path)

        # Extracting features
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        mfcc_avg = np.mean(mfcc)  # Calculate the mean of MFCC
        zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

        return float(mfcc_avg), float(zero_crossing_rate), float(spectral_centroid)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None, None, None

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["audio_features1"]
collection = db["tracks1"]
collection.delete_many({})

# Path to the folder containing MP3 files
folder_path = "sample_3/000/"

# Batch processing parameters
batch_size = 100
batch_data = []

# Iterate over MP3 files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".mp3"):
        # Extract features
        track_id = os.path.splitext(file_name)[0]
        file_path = os.path.join(folder_path, file_name)
        mfcc_avg, zero_crossing_rate, spectral_centroid = extract_features(file_path)

        if mfcc_avg is not None:
            # Prepare data to insert into MongoDB
            track_data = {
                "track_id": track_id,
                "mfcc": mfcc_avg,  # Use single value instead of list
                "zero_crossing_rate": zero_crossing_rate,
                "spectral_centroid": spectral_centroid
            }
            batch_data.append(track_data)

            # Batch insert data into MongoDB
            if len(batch_data) == batch_size:
                collection.insert_many(batch_data)
                print(f"Inserted {len(batch_data)} documents into MongoDB.")
                batch_data = []

# Insert remaining batch data into MongoDB
if batch_data:
    collection.insert_many(batch_data)
    print(f"Inserted {len(batch_data)} documents into MongoDB.")

print("Data insertion into MongoDB completed.")


Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.


  y, sr = librosa.load(file_path)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing file sample_3/000/13529.mp3: 
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.




Error processing file sample_3/000/92479.mp3: 
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.


[src/libmpg123/layer3.c:INT123_do_layer3():1847] error: bit deficit after dequant


Inserted 100 documents into MongoDB.


[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!


Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 100 documents into MongoDB.
Inserted 14 documents into MongoDB.
Data insertion into MongoDB completed.


                                        Using Metadata to get Genre Id for each track 

In [63]:
import pandas as pd
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["audio_features1"]
collection = db["tracks1"]

# Read the CSV file
csv_file = "fma_metadata/tracks.csv"
df = pd.read_csv(csv_file, skiprows=2)

# Iterate over track IDs from MongoDB
for doc in collection.find({}, {"track_id": 1}):
    track_id = doc["track_id"]
    print("MongoDB Track ID:", track_id)
    
    # Find track ID in the CSV file
    track_row = df[df.iloc[:, 0] == track_id]  # Assuming track ID is in the first column
    if not track_row.empty:
        # Print the entire row for troubleshooting
        print("CSV Row:")
        print(track_row)
        # Get the first genre ID from the list of genres
        genre_id = track_row.iloc[0, 41].split(',')[0]  # Assuming genre column is 42nd
        print(f"Track ID: {track_id}, Genre ID: {genre_id}")
    else:
        print("Track ID not found in CSV file.")


MongoDB Track ID: 480
CSV Row:
     track_id  Unnamed: 1           Unnamed: 2           Unnamed: 3  \
266       480           0  2008-11-26 02:05:59  2002-01-01 00:00:00   

    Unnamed: 4  Unnamed: 5  Unnamed: 6  \
266        NaN           0         135   

                                            Unnamed: 7  Unnamed: 8 Unnamed: 9  \
266  <p>Live and uneasy on stages in Portland, Seat...        1775        NaN   

     ... Unnamed: 43 Unnamed: 44  Unnamed: 45  \
266  ...         NaN         678           en   

                                           Unnamed: 46 Unnamed: 47  \
266  Attribution-NonCommercial-ShareAlike 3.0 Inter...         149   

    Unnamed: 48 Unnamed: 49 Unnamed: 50  Unnamed: 51  \
266         NaN           4         NaN           []   

                   Unnamed: 52  
266  Appear to Be (Kelly Haus)  

[1 rows x 53 columns]
Track ID: 480, Genre ID: [25
MongoDB Track ID: 4800
CSV Row:
      track_id  Unnamed: 1           Unnamed: 2           Unnamed: 3  \
284

In [64]:
import pandas as pd
import pymongo
import re

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["audio_features1"]
collection = db["tracks1"]

# Read the CSV file
csv_file = "fma_metadata/tracks.csv"
df = pd.read_csv(csv_file, skiprows=2)

# Iterate over track IDs from MongoDB
for doc in collection.find({}, {"track_id": 1}):
    track_id = str(doc["track_id"])  # Convert MongoDB track ID to string
    
    # Find track ID in the CSV file
    track_row = df[df.iloc[:, 0].astype(str) == track_id]  # Convert CSV track IDs to string for comparison
    if not track_row.empty:
        # Extract the genre ID (first element only)
        genre_column = track_row.iloc[0, 41]  # Assuming genre column is 42nd
        genre_id_match = re.search(r'\[(\d+)\]', genre_column)  # Extract genre ID using regex
        if genre_id_match:
            genre_id = genre_id_match.group(1)
            print(f"Track ID: {track_id}, Genre ID: {genre_id}")
            
            # Update genre ID in MongoDB
            update_result = collection.update_one(
                {"track_id": track_id},
                {"$set": {"genre_id": genre_id}}
            )
            if update_result.modified_count > 0:
                print(f"Updated Genre ID for Track ID: {track_id} in MongoDB.")
            else:
                print(f"No changes required for Track ID: {track_id}.")
        else:
            print(f"Track ID: {track_id}, Genre ID: Not found in CSV file.")
    else:
        print(f"Track ID: {track_id}, Not found in CSV file.")


Track ID: 480, Genre ID: Not found in CSV file.
Track ID: 4800, Genre ID: 12
No changes required for Track ID: 4800.
Track ID: 4801, Genre ID: 12
No changes required for Track ID: 4801.
Track ID: 4802, Genre ID: 12
No changes required for Track ID: 4802.
Track ID: 4803, Genre ID: 12
No changes required for Track ID: 4803.
Track ID: 4804, Genre ID: 12
No changes required for Track ID: 4804.
Track ID: 481, Genre ID: Not found in CSV file.
Track ID: 482, Genre ID: Not found in CSV file.
Track ID: 4827, Genre ID: Not found in CSV file.
Track ID: 4828, Genre ID: 12
No changes required for Track ID: 4828.
Track ID: 4829, Genre ID: 12
No changes required for Track ID: 4829.
Track ID: 483, Genre ID: Not found in CSV file.
Track ID: 4830, Genre ID: 12
No changes required for Track ID: 4830.
Track ID: 4831, Genre ID: Not found in CSV file.
Track ID: 4832, Genre ID: Not found in CSV file.
Track ID: 4833, Genre ID: Not found in CSV file.
Track ID: 4834, Genre ID: Not found in CSV file.
Track ID: 4

Kmeans Clustering and saving model in joblib

Conversion from string to integer completed.


In [36]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
import pymongo
from joblib import dump

def connect_to_mongodb(uri, db_name, collection_name):
    """ Connect to MongoDB and return the collection object. """
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    return db[collection_name]

def extract_features_from_mongodb(collection):
    """ Extract MFCC and genre ID as features and track IDs from MongoDB. """
    features = []
    track_ids = []
    for doc in collection.find({}):
        try:
            # Extract MFCC and genre ID
            mfcc = float(doc["mfcc"])  # Ensure this is the correct format for your data
            track_id = int(doc["track_id"])
            
            # Append to the list
            features.append([mfcc])
            track_ids.append(track_id)
        except (KeyError, ValueError, TypeError) as e:
            print(f"Skipping track {doc.get('track_id', 'Unknown')} due to error: {e}")
    return np.array(features), track_ids

def train_kmeans(features, n_clusters=50):
    """ Train and return a KMeans model using the provided features, return scaler, scaled features, and cluster labels. """
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)  # Normalize features
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans_model.fit(features_scaled)
    cluster_labels = kmeans_model.labels_
    return kmeans_model, scaler, features_scaled, cluster_labels

def find_similar_tracks(kmeans_model, scaler, track_ids, query_track_id, features):
    """ Find and return track IDs in the same cluster as the query track ID. """
    query_index = track_ids.index(query_track_id)
    query_features = features[query_index].reshape(1, -1)
    query_features_scaled = scaler.transform(query_features)
    query_label = kmeans_model.predict(query_features_scaled)

    # Find all tracks in the same cluster
    similar_indices = np.where(kmeans_model.labels_ == query_label)[0]
    similar_track_ids = [track_ids[i] for i in similar_indices]
    return similar_track_ids

# Main execution
if __name__ == "__main__":
    uri = "mongodb://localhost:27017/"
    db_name = "audio_features1"
    collection_name = "tracks1"
    collection = connect_to_mongodb(uri, db_name, collection_name)
    features, track_ids = extract_features_from_mongodb(collection)
    kmeans_model, scaler, features_scaled, cluster_labels = train_kmeans(features, n_clusters=50)
    
    # Save the model and scaler
    dump({'model': kmeans_model, 'scaler': scaler, 'labels': cluster_labels}, 'kmeans_model_data1.joblib')

    # Example query
    query_track_id = 2
    similar_track_ids = find_similar_tracks(kmeans_model, scaler, track_ids, query_track_id, features)
    print(f"Similar track IDs for query track ID {query_track_id}: {similar_track_ids}")


Skipping track 480 due to error: 'genre_id'
Skipping track 481 due to error: 'genre_id'
Skipping track 482 due to error: 'genre_id'
Skipping track 4827 due to error: 'genre_id'
Skipping track 483 due to error: 'genre_id'
Skipping track 4831 due to error: 'genre_id'
Skipping track 4832 due to error: 'genre_id'
Skipping track 4833 due to error: 'genre_id'
Skipping track 4834 due to error: 'genre_id'
Skipping track 185 due to error: 'genre_id'
Skipping track 248 due to error: 'genre_id'
Skipping track 250 due to error: 'genre_id'
Skipping track 251 due to error: 'genre_id'
Skipping track 253 due to error: 'genre_id'
Skipping track 254 due to error: 'genre_id'
Skipping track 26 due to error: 'genre_id'
Skipping track 4091 due to error: 'genre_id'
Skipping track 4092 due to error: 'genre_id'
Skipping track 4093 due to error: 'genre_id'
Skipping track 4094 due to error: 'genre_id'
Skipping track 34000 due to error: 'genre_id'
Skipping track 4096 due to error: 'genre_id'
Skipping track 4097 d

In [27]:
from joblib import dump

# Save the model and scaler
dump({'model': kmeans_model, 'labels': cluster_labels}, 'kmeans_model1.joblib')


['kmeans_model1.joblib']