In [7]:
import os
import librosa
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [16]:
def load_audio_files(directory):
    audio_files = {}
    valid_songs = []  # Initialize an empty list for valid song names
    for filename in os.listdir(directory):
        if filename.endswith('.mp3') or filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            audio, sr = librosa.load(file_path, sr=None)
            audio_files[filename] = (audio, sr)
            
            # Extract song name in the format A-songName
            song_name = filename.split('-')[1].split('.')[0]  # Get everything after '-' and before '.mp3'
            valid_songs.append(song_name)  # Add to valid songs list
            
    return audio_files, valid_songs  # Return both audio files and valid songs

In [17]:
# Load the audio files
audio_directory = '/home/pes1ug22am100/Documents/Phonk'
audio_tracks, valid_songs = load_audio_files(audio_directory)

In [18]:
def extract_features(audio):
    mfccs = librosa.feature.mfcc(y=audio, sr=22050, n_mfcc=13)  # Use Mel-frequency cepstral coefficients
    return mfccs.mean(axis=1)

In [19]:
# Get the features of all tracks
features = {name.split('-')[1].split('.')[0]: extract_features(audio[0]) for name, audio in audio_tracks.items()}

In [20]:
def find_similar_tracks(selected_track, features):
    selected_features = features[selected_track]
    similarities = {}
    
    for track, feature_vector in features.items():
        if track != selected_track:
            similarity_score = cosine_similarity([selected_features], [feature_vector])[0][0]
            similarities[track] = similarity_score
            
    # Sort by similarity score
    similar_tracks = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return similar_tracks

In [21]:
# okay, now I know user input may not always be fully correct, so
# I'll use Levenshtein distance and set a threshold

def levenshtein_distance(s, t):
    m = len(s)
    n = len(t)

    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(m + 1):
        dp[i][0] = i  # Deletion cost
    for j in range(n + 1):
        dp[0][j] = j  # Insertion cost

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if s[i - 1] == t[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,      # Deletion
                           dp[i][j - 1] + 1,      # Insertion
                           dp[i - 1][j - 1] + cost)  # Substitution

    return dp[m][n]

In [14]:
'''# THis is how it'd work
string1 = "bob"
string2 = "bol"
distance = levenshtein_distance(string1, string2)
print(f"Levenshtein Distance between '{string1}' and '{string2}' is {distance}")'''

Levenshtein Distance between 'bob' and 'bol' is 1


In [22]:
def get_user_choice(valid_songs):
    while True:
        user_input = input("Enter the name of your song: ")
        
        # Check if input is in valid songs
        if user_input in valid_songs:
            return user_input
        
        # Find close matches within the threshold
        close_matches = [song for song in valid_songs if levenshtein_distance(user_input, song) <= 3]
        
        if close_matches:
            print("Did you mean one of these?")
            for match in close_matches:
                print(f"- {match}")
            confirm = input("Type the exact name or 'n' for no: ")
            if confirm in close_matches:
                return confirm
        
        print("Song not found. Please try again.")

In [23]:
chosen_song = get_user_choice(valid_songs)
similar_tracks = find_similar_tracks(chosen_song, features)

print("Tracks similar to", chosen_song)
for track, score in similar_tracks:
    print(f"{track}: Similarity Score: {score:.4f}")

Tracks similar to xtali
xslide: Similarity Score: 0.9872
anatomy: Similarity Score: 0.9284
