In [73]:
import os
import pandas as pd
import numpy as np
from scipy.fftpack import dct
import wave
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
import librosa

# MFCC

In [74]:
def split_samples_into_frames(samples: np.ndarray,
                              frame_size: int,
                              frame_overlap: int = 0) -> np.ndarray:
    """
    Split the samples array into frames of given size with given overlap.
    """
    frames = [
        samples[i: i + frame_size]
        for i in range(
            0, len(samples) - frame_size, frame_size - frame_overlap
        )
    ]
    return np.array(frames, dtype=np.int32)

In [75]:
class Signal:
    def __init__(self, file, normalise: bool = True):
        """
        :param file: a wavfile (path or bytes)
        :param normalise: whether or not to normalise the audio
        """
        with wave.open(file, mode="rb") as wavfile_raw:
            # basic audio properties:
            self.n_channels = wavfile_raw.getnchannels()
            self.n_samples_all = wavfile_raw.getnframes()
            self.sample_rate = wavfile_raw.getframerate()

            samples_raw = wavfile_raw.readframes(-1)
            samples_all_channels = np.frombuffer(samples_raw, dtype=np.int16)
            # change the type to a larger int (necessary to compute squared values later)
            samples_all_channels = np.array(samples_all_channels, dtype=np.int32)

            # split channels into separate arrays:
            channels = [
                samples_all_channels[i:: self.n_channels]
                for i in range(self.n_channels)
            ]

            # merge them as rows in the final samples array:
            self.samples_all = np.array(channels)
            # convert to mono by averaging samples from each channel
            self.samples_all = np.mean(self.samples_all, axis=0)

            if normalise:
                self.__normalise_samples()

            # set the default boundaries of audio to analyse
            self.__start_id = 0
            self.__end_id = len(self.samples_all) - 1
            # set the default frame length and overlap
            self.frame_length_ms = 20
            self.frame_overlap_ms = 0

            self.frames = split_samples_into_frames(
                self.samples, frame_size=self.frame_size, frame_overlap=self.frame_overlap_size)

    @property
    def boundaries(self) -> tuple[int, int]:
        return self.__start_id, self.__end_id

    @property
    def samples(self) -> np.ndarray:
        return self.samples_all[self.__start_id:self.__end_id]

    @property
    def n_samples(self) -> int:
        return self.__end_id - self.__start_id

    @property
    def n_samples_in_frames(self) -> int:
        return sum([len(f) for f in self.frames])

    @property
    def all_audio_length_sec(self) -> float:
        """
        > The function returns the length of the audio in seconds (it ignores boundaries)
        :return: The length of the audio in seconds.
        """
        return self.n_samples_all / self.sample_rate

    @property
    def audio_length_sec(self) -> float:
        """
        > The function returns the length of the audio in seconds (it considers boundaries)
        :return: The length of the audio in seconds.
        """
        return self.n_samples / self.sample_rate

    def __normalise_samples(self):
        """
        Normalize the samples to a target level in decibels.
        """
        # samples are averaged so the width at this point is 1
        sample_width = 1

        max_amplitude = np.max(self.samples_all)
        target_level_db = -3
        target_amplitude = 10 ** (target_level_db / 20) * (
                2 ** (sample_width * 8 - 1) - 1
        )

        gain = target_amplitude / max_amplitude
        samples_normalised = np.floor(self.samples_all * gain)

        self.samples_all = samples_normalised

    @property
    def frame_size(self) -> int:
        """Number of samples in a single frame"""
        return self.frame_length_ms * self.sample_rate // 1000

    @property
    def frame_overlap_size(self) -> int:
        """Number of overlapping samples between the frames"""
        return self.frame_overlap_ms * self.sample_rate // 1000

    def update_settings(self, start_sample_id: int, end_sample_id: int,
                        frame_length_ms: int, frame_overlap_ms: int):
        """
        Sets the starting and the ending point of audio to analyse,
        as well as frame size and frame overlap in ms.
        """
        self.__start_id = start_sample_id
        self.__end_id = end_sample_id
        self.frame_length_ms = frame_length_ms
        self.frame_overlap_ms = frame_overlap_ms
        self.frames = split_samples_into_frames(
            self.samples, self.frame_size, self.frame_overlap_size)

    @property
    def n_frames(self) -> int:
        return len(self.frames)

In [76]:
def extract_mfcc_features(signal: Signal):
    emphasized_sig = np.append(signal.samples[0], signal.samples[1:] - 0.97 * signal.samples[:-1])
    frames = split_samples_into_frames(emphasized_sig, signal.frame_size, signal.frame_overlap_size)
    frames = _apply_window(frames)
    pow_frames = _power_spectrum(frames)
    filter_banks = _apply_filter_banks(pow_frames, signal.sample_rate)
    mfcc = _compute_mfcc_coefficients(filter_banks)
    return mfcc

def _apply_window(frames):
    try:
        res =  np.multiply(frames, np.hamming(frames.shape[1]), out=frames, casting='unsafe')
        return res
    except IndexError:
        print(frames)
        print(frames.shape)

def _power_spectrum(frames):
    mag_frames = np.absolute(np.fft.rfft(frames, n=512))
    pow_frames = ((1.0 / 512) * ((mag_frames) ** 2))
    return pow_frames

def _apply_filter_banks(pow_frames, sample_rate, n_filters=26):
    _, n_bins = pow_frames.shape

    mel_filterbank = _mel_filterbank(n_filters, n_bins, sample_rate)
    filtered_frames = np.dot(pow_frames, mel_filterbank.T)
    filtered_frames = np.where(
        filtered_frames == 0, np.finfo(float).eps, filtered_frames)
    return filtered_frames

def _mel_filterbank(n_filters, n_bins, sample_rate, low_freq=0, high_freq=None):
    if high_freq is None:
        high_freq = sample_rate / 2

    low_mel = _hz_to_mel(low_freq)
    high_mel = _hz_to_mel(high_freq)

    mel_points = np.linspace(low_mel, high_mel, n_filters + 2)
    hz_points = _mel_to_hz(mel_points)

    bins = np.floor((n_bins + 1) * hz_points / sample_rate).astype(int)

    filterbank = np.zeros((n_filters, n_bins))
    for m in range(1, n_filters + 1):
        left = bins[m - 1]
        center = bins[m]
        right = bins[m + 1]

        # Calculate left slope
        filterbank[m - 1, left:center] = (
            np.arange(left, center) - bins[m - 1]) / (center - bins[m - 1])

        # Calculate right slope
        filterbank[m - 1, center:right] = (
            bins[m + 1] - np.arange(center, right)) / (bins[m + 1] - center)

    return filterbank

def _hz_to_mel(hz):
    return 2595 * np.log10(1 + hz / 700)

def _mel_to_hz(mel):
    return 700 * (10**(mel / 2595) - 1)

def _compute_mfcc_coefficients(filter_banks):
    n_filters = filter_banks.shape[1]
    n_coefficients = n_filters - 1
    log_filter_banks = np.log(filter_banks)
    mfcc_coefficients  = dct(log_filter_banks, type=2, axis=1, norm='ortho')[
        :, 1: (n_coefficients + 1)]

    return mfcc_coefficients

# Speaker Classification

## Data preparation

In [77]:
frame_df = pd.DataFrame(columns=range(21))

for dir in os.listdir("Recordings"):
    speaker = dir
    for file in os.listdir("Recordings\\"+dir):
        if file.lower() ==  "znormalizowane":
            for file2 in os.listdir("Recordings\\"+dir+"\\znormalizowane\\"):
                audio_file = "Recordings\\"+dir+"\\znormalizowane\\"+file2
        elif file.endswith(".wav"):
            audio_file = "Recordings\\"+dir+"\\"+file
        else:
            continue
        signal, sr = librosa.load(audio_file)
        # signal = Signal(audio_file)
        # if signal.frames.shape[0] == 0:
        #     continue
        # mfcc = extract_mfcc_features(signal)
        #mfcc = librosa.feature.mfcc(y=sig.samples, sr=sig.sample_rate)
        mfcc = librosa.feature.mfcc(y=signal, sr=sr)
        df = pd.DataFrame(mfcc.T)
        df[20] = speaker
        frame_df = pd.concat([frame_df, df], ignore_index=True)



In [78]:
frame_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-550.843018,87.663612,38.879517,16.669125,7.608075,2.738084,4.040033,5.386412,3.359730,3.878994,...,7.208017,5.980130,-1.560332,0.435156,6.448923,1.867172,-1.906009,-1.760335,-0.777371,1_01
1,-521.453613,91.340729,33.655018,20.113050,16.564362,2.561702,4.990252,6.766742,-0.196718,0.017875,...,3.796235,3.845587,-1.329711,0.870627,1.664823,-1.015735,-2.241067,-4.296949,0.530449,1_01
2,-515.062744,92.638000,27.401657,19.803846,23.813400,11.532108,7.960026,6.507334,0.778383,0.858353,...,-0.522144,1.428493,-0.480874,0.218879,1.231898,-2.594046,-3.611378,-0.844954,2.104780,1_01
3,-513.523438,93.288452,25.848698,16.255955,22.437170,14.665035,13.518734,10.161581,4.062089,-0.089828,...,-0.465730,-2.377368,-0.902380,-0.545592,2.121382,-2.466409,-6.506040,0.220574,0.229768,1_01
4,-513.031677,94.261200,24.953739,15.384866,20.349669,13.122183,11.530576,5.568279,1.308941,2.943142,...,-0.854599,-2.218988,-4.091892,1.071039,1.109339,-0.539627,-2.787288,-1.094227,-0.956349,1_01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32918,-213.488602,59.281006,-37.084427,23.356981,20.140059,32.686050,-23.213978,1.718658,5.103720,5.157559,...,-15.194660,7.225684,-5.612788,7.369439,8.704142,9.406404,9.818296,-2.853420,3.216241,5_13
32919,-289.115662,95.761505,-10.941489,1.324265,20.161116,15.424707,6.675196,10.289164,4.750428,-3.131346,...,-6.566254,-4.278781,-3.714661,6.213298,1.476704,5.558108,11.468115,0.131748,0.840783,5_13
32920,-322.896820,98.196449,7.245686,-3.507871,12.683830,7.122865,11.802002,9.117743,1.596224,-7.333784,...,-4.452688,-3.516242,0.323526,-0.128980,3.365886,4.060381,5.149271,0.876746,3.804520,5_13
32921,-327.249512,96.804916,13.019209,-11.203919,8.550411,12.147888,12.957513,6.845652,-1.803101,-3.713849,...,-5.592212,-5.710196,1.043502,7.422772,6.414387,-1.124937,1.039994,2.556897,4.172557,5_13


In [79]:
unique_numbers = frame_df[20].unique()
total_numbers = len(unique_numbers)

# Map the numbers to a range from 0 to the total number of unique numbers
frame_df['y'] = frame_df[20].rank(method='dense', ascending=True).astype(int) -1
frame_df = frame_df.drop([20], axis=1)

In [80]:
frame_df['y'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37])

In [81]:
frame_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,y
0,-550.843018,87.663612,38.879517,16.669125,7.608075,2.738084,4.040033,5.386412,3.35973,3.878994,...,7.208017,5.98013,-1.560332,0.435156,6.448923,1.867172,-1.906009,-1.760335,-0.777371,0
1,-521.453613,91.340729,33.655018,20.11305,16.564362,2.561702,4.990252,6.766742,-0.196718,0.017875,...,3.796235,3.845587,-1.329711,0.870627,1.664823,-1.015735,-2.241067,-4.296949,0.530449,0
2,-515.062744,92.638,27.401657,19.803846,23.8134,11.532108,7.960026,6.507334,0.778383,0.858353,...,-0.522144,1.428493,-0.480874,0.218879,1.231898,-2.594046,-3.611378,-0.844954,2.10478,0
3,-513.523438,93.288452,25.848698,16.255955,22.43717,14.665035,13.518734,10.161581,4.062089,-0.089828,...,-0.46573,-2.377368,-0.90238,-0.545592,2.121382,-2.466409,-6.50604,0.220574,0.229768,0
4,-513.031677,94.2612,24.953739,15.384866,20.349669,13.122183,11.530576,5.568279,1.308941,2.943142,...,-0.854599,-2.218988,-4.091892,1.071039,1.109339,-0.539627,-2.787288,-1.094227,-0.956349,0


In [82]:
# Split the dataframe into random train and test subsets
X_train, X_test, y_train, y_test = train_test_split(frame_df.drop(['y'], axis=1), frame_df["y"], test_size=0.2, random_state=42, stratify=frame_df["y"])

## Model

In [83]:
clf = svm.SVC()

# Train the classifier on the training data
clf.fit(X_train, y_train)

In [84]:
# Make predictions on the testing data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7491268033409263


In [85]:
from xgboost import XGBClassifier
# Create an instance of the XGBClassifier
xgb_clf = XGBClassifier()

# Fit the classifier to the training data
xgb_clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_clf.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.876993166287016


# Word Classification

In [None]:

num_df = pd.DataFrame(columns=range(26))

for dir in os.listdir("MNIST"):
    for file in os.listdir("MNIST\\"+dir):
        if file.endswith(".wav"):
            audio_file = "MNIST\\"+dir+"\\"+file
        else:
            continue
        signal, sr = librosa.load(audio_file)
        # signal = Signal(audio_file)
        # if signal.frames.shape[0] == 0:
        #     continue
        # mfcc = extract_mfcc_features(signal)
        #mfcc = librosa.feature.mfcc(y=sig.samples, sr=sig.sample_rate)
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=25)
        df = pd.DataFrame(mfcc.T)
        df[25] = file.split("_")[0]
        frame_df = pd.concat([frame_df, df], ignore_index=True)