In [6]:
import pandas as pd
import numpy as np
import librosa
import pydub
from tqdm import tqdm
from sklearn.model_selection import train_test_split
def read_df(file):
    df = pd.read_csv(file, sep='\t')
    df_us = df[df['accent']=='us'].sample(5000)
    df_ind = df[df['accent']=='indian'].sample(5000)
    df_uk = df[df['accent']=='england'].sample(5000)
    df_aus = df[df['accent']=='australia'].sample(5000)
    df_ire = df[df['accent']=='ireland'].sample(5000)
    df_scot = df[df['accent']=='scotland'].sample(5000)



    df = df_us.append(df_ind)
    df = df.append(df_uk)
    df = df.append(df_aus)
    df = df.append(df_ire)
    df = df.append(df_scot)
    
    df.drop(['client_id', 'sentence', 'up_votes', 'down_votes', 'age', 'gender'],
        axis=1, inplace=True)
    return df



def mp3towav(df,col):
    for filename in tqdm(df[col]):
        pydub.AudioSegment.from_mp3("D:/UzairDataSet/en/clips/{}".format(filename)).export("D:/pracrice/{}".format(filename), format="wav")

        
def wavtomfcc( file_path):
        wave, sr = librosa.load(file_path, mono=True)
        mfcc = librosa.feature.mfcc(wave, sr=sr, n_mfcc=13)
        return mfcc

def create_mfcc(df,col):
        list_of_mfccs = []
        
        us = df[df['accent']=='us']
        for wav in tqdm(us[col]):
            file_name = 'D:/All Classes/{}.wav'.format(wav)
            mfcc = wavtomfcc(file_name)
            list_of_mfccs.append(mfcc)

        uk = df[df['accent']=='england']
        for wav in tqdm(uk[col]):
            file_name = 'D:/All Classes/{}.wav'.format(wav)
            mfcc = wavtomfcc(file_name)
            list_of_mfccs.append(mfcc)

        aus = df[df['accent']=='australia']
        for wav in tqdm(aus[col]):
            file_name = 'D:/All Classes/{}.wav'.format(wav)
            mfcc = wavtomfcc(file_name)
            list_of_mfccs.append(mfcc)

        ire = df[df['accent']=='ireland']
        for wav in tqdm(ire[col]):
            file_name = 'D:/All Classes/{}.wav'.format(wav)
            mfcc = wavtomfcc(file_name)
            list_of_mfccs.append(mfcc)

        scot = df[df['accent']=='scotland']
        for wav in tqdm(scot[col]):
            file_name = 'D:/All Classes/{}.wav'.format(wav)
            mfcc = wavtomfcc(file_name)
            list_of_mfccs.append(mfcc)

        
        ind = df[df['accent']=='indian']
        for wav in tqdm(ind[col]):
            file_name = 'D:/All Classes/{}.wav'.format(wav)
            mfcc = wavtomfcc(file_name)
            list_of_mfccs.append(mfcc)
        return list_of_mfccs

def resize_mfcc(list_of_mfccs):
        target_size = 64
        resized_mfcc = [librosa.util.fix_length(mfcc, target_size, axis=1) for mfcc in list_of_mfccs]
        resized_mfcc = [np.vstack((np.zeros((3, target_size)), mfcc)) for mfcc in resized_mfcc]
        return resized_mfcc


def label_samples(df):
        y_labels = np.array(df['accent'])
        y = []
        for label in y_labels:
            if label == 'us':
                y.append(0)
            elif label == 'indian':
                y.append(1)
            elif label == 'england':
                y.append(2)
            elif label == 'australia':
                y.append(3)
            elif label == 'ireland':
                y.append(4)
            elif label == 'scotland':
                y.append(5)
        
        
        return y
      


def split_data(X,y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle = True, test_size=0.25)
        X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, stratify=y_test, shuffle = True, test_size=0.3)
        X_train = np.array(X_train).reshape(-1, 16, 64)
        X_test = np.array(X_test).reshape(-1, 16, 64)
        X_val = np.array(X_val).reshape(-1, 16, 64)
        y_train = np.array(y_train).reshape(-1, 1)
        y_test = np.array(y_test).reshape(-1,1)
        y_val = np.array(y_val).reshape(-1,1)
        return X_train, X_test,X_val,y_train, y_test,y_val

def standardize_mfcc(X_train, X_test,X_val):
        train_mean = X_train.mean()
        train_std = X_train.std()
        X_train_std = (X_train-train_mean)/train_std
        X_test_std = (X_test-train_mean)/train_std
        X_val_std = (X_val-train_mean)/train_std
        return X_train_std,X_test_std,X_val_std

    

def save_mfccs(X_train_std,X_test_std,X_val_std,y_train, y_test,y_val):
        np.save('X_train_moz_6.npy', X_train_std)
        np.save('X_test_moz_6.npy', X_test_std)
        np.save('X_val_moz_6.npy', X_val_std)
        np.save('y_train_moz_6.npy', y_train)
        np.save('y_test_moz_6.npy', y_test)
        np.save('y_val_moz_6.npy', y_val)

# 354, 293, 61
if __name__ == '__main__':
    df = read_df('D:/UzairDataSet/en/validated.tsv')
    #print(df['accent'].value_counts())
    mp3towav(df,'path')
    list_of_mfccs=[]
    list_of_mfccs=create_mfcc(df,'path')
    X=resize_mfcc(list_of_mfccs)
    y=label_samples(df)
    print(y)
    X_train, X_test,X_val,y_train, y_test,y_val=split_data(X,y)
    X_train_std,X_test_std,X_val_std=standardize_mfcc(X_train, X_test,X_val)
    save_mfccs(X_train_std,X_test_std,X_val_std,y_train, y_test,y_val)

ireland      5000
england      5000
scotland     5000
indian       5000
us           5000
australia    5000
Name: accent, dtype: int64


100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [21:42<00:00,  3.84it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [29:41<00:00,  2.81it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [31:22<00:00,  2.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [33:26<00:00,  2.49it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [30:31<00:00,  2.73it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [31:22<00:00,  2.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [42:45<00:00,  1.95it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [36:48<00:00,  2.26it/s]
100%|███████████████████████████████████

In [63]:
from pydub import AudioSegment
from pydub.silence import split_on_silence

sound = AudioSegment.from_wav("Downloads/Recording (6).wav")
chunks = split_on_silence(sound, 
    # must be silent for at least half a second
    min_silence_len=500,silence_thresh=-40,
    keep_silence=300
)
abchunks=0
for i in range(0,len(chunks)):
    abchunks+=chunks[i]

abchunks.export("Downloads/chunk123.wav", format="wav")    
    
    
    

#for i, chunk in enumerate(chunks):
#    chunk.export("Downloads/chunk{0}.wav".format(i), format="wav")

<_io.BufferedRandom name='Downloads/chunk123.wav'>