In [2]:
!pip list > requirements.txt

In [2]:
talent_list = [
    {
        "name": "アンジュ・カトリーナ",
        "path": "../data_collection/training/chat-ange.mp3",
        "color": "#C83C35",
    },
    {
        "name": "リゼ・ヘルエスタ",
        "path": "../data_collection/training/chat-lize.mp3",
        "color": "#42FFFF",
    },
    {
        "name": "戌亥とこ",
        "path": "../data_collection/training/chat-toko.mp3",
        "color": "#92F3A4",
    }
]

In [3]:
# Kaggle code (https://www.kaggle.com/code/anmour/svm-using-mfcc-features)

import numpy as np
import pandas as pd

import os
import librosa

import scipy
from scipy.stats import skew
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC

# Generate mfcc features with mean and standard deviation
def get_mfcc(segment, SAMPLE_RATE):
    data = segment
    # try:

    ft1 = librosa.feature.mfcc(y = data, sr = SAMPLE_RATE, n_mfcc=30)
    ft2 = librosa.feature.zero_crossing_rate(y = data)[0]
    ft3 = librosa.feature.spectral_rolloff(y = data, sr = SAMPLE_RATE)[0]
    ft4 = librosa.feature.spectral_centroid(y = data, sr = SAMPLE_RATE)[0]
    ft5 = librosa.feature.spectral_contrast(y = data, sr = SAMPLE_RATE)[0]
    ft6 = librosa.feature.spectral_bandwidth(y = data, sr = SAMPLE_RATE)[0]
    ft1_trunc = np.hstack((np.mean(ft1, axis=1), np.std(ft1, axis=1), skew(ft1, axis = 1), np.max(ft1, axis = 1), np.median(ft1, axis = 1), np.min(ft1, axis = 1)))
    ft2_trunc = np.hstack((np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.median(ft2), np.min(ft2)))
    ft3_trunc = np.hstack((np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.median(ft3), np.min(ft3)))
    ft4_trunc = np.hstack((np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.median(ft4), np.min(ft4)))
    ft5_trunc = np.hstack((np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.median(ft5), np.min(ft5)))
    ft6_trunc = np.hstack((np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.median(ft6), np.max(ft6)))
    return pd.Series(np.hstack((ft1_trunc, ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc)))
    
    # except:
    #     print('bad file')
    #     return pd.Series([0]*210)

In [4]:
import librosa
import numpy as np
import os

sample_rate = 22050
segment_duration = 2  # seconds
mfccs = []
labels = []

for talent in talent_list:
    path = talent["path"]
    if os.path.exists(path): 
        audio, _ = librosa.load(path, sr=sample_rate, mono=True)
        total_duration = len(audio) / sample_rate
        num_segments = int(total_duration)

        for i in range(num_segments):
            start_sample = int(i * sample_rate)
            end_sample = int((i + segment_duration) * sample_rate)
            segment = audio[start_sample:end_sample]

            # Compute the MFCC for the segment
            # mfcc = np.mean(librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=13)[1:], axis=1)

            mfcc = get_mfcc(segment, 22050)
            if np.isnan(mfcc).any():
                # if mfcc array contain nan value, skip
                continue
            
            mfccs.append(mfcc)
            labels.append(talent["name"])

  ft1_trunc = np.hstack((np.mean(ft1, axis=1), np.std(ft1, axis=1), skew(ft1, axis = 1), np.max(ft1, axis = 1), np.median(ft1, axis = 1), np.min(ft1, axis = 1)))


In [5]:
from sklearn.datasets import load_digits
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
X = mfccs
# Apply scaling for PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# X_test_scaled = scaler.transform(X_test)

# Apply PCA for dimension reduction
pca = PCA(n_components=65).fit(X_scaled)
X_pca = pca.transform(X_scaled)
# X_test_pca = pca.transform(X_test_scaled)

print(sum(pca.explained_variance_ratio_)) 

import pickle
pickle.dump(pca, open("pca.pkl","wb"))

0.9024427044037658


In [7]:
y = labels

# Fit an SVM model
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size = 0.2, random_state = 42, shuffle = True)

clf = SVC(kernel = 'rbf', probability=True)

clf.fit(X_train, y_train)

print(accuracy_score(clf.predict(X_val), y_val))

1.0


In [8]:
# Define the paramter grid for C from 0.001 to 10, gamma from 0.001 to 10
C_grid = [0.001, 0.01, 0.1, 1, 10]
gamma_grid = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'C': C_grid, 'gamma' : gamma_grid}

grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv = 3, scoring = "accuracy")
grid.fit(X_train, y_train)

# Find the best model
print(grid.best_score_)

print(grid.best_params_)

print(grid.best_estimator_)

0.9981447124304267
{'C': 1, 'gamma': 0.01}
SVC(C=1, gamma=0.01)


In [9]:
# 0.9981447124304267
# {'C': 1, 'gamma': 0.01}
# SVC(C=1, gamma=0.01)

# Optimal model
clf = SVC(kernel = 'rbf', C = 1, gamma = 0.01, probability=True)

clf.fit(X_train, y_train)

print(accuracy_score(clf.predict(X_val), y_val))

import pickle
pickle.dump(clf, open("model-kaggle.pkl","wb"))

1.0


In [None]:
# Fit the entire training sets
clf.fit(X_pca, y)

clf.predict_proba(X_test_pca)