In [None]:
# os system processing library
import os
import io
import subprocess
import shutil
import random
import json
from glob import glob
from shutil import move, rmtree, copyfile

# mathematic operation
import time
import math
import numpy as np
import pandas as pd

# audio related
from scipy.io import wavfile
import librosa
import librosa.display
# from google.cloud import speech

# display library
import matplotlib.pyplot as plt
import tqdm
from tqdm.notebook import tqdm
# machine learning library
# from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras as keras
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import train_test_split

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

# confusion matrix
%matplotlib inline
from sklearn.metrics import confusion_matrix
import itertools

# developed library
import importlib
import Timecode
importlib.reload(Timecode)
from Timecode import Timecode

In [4]:
class FeatureExtraction:
    """Compute mel-spectrogram, MFCC (+ deltas) and RMS features for one audio clip.

    Typical use: load audio with ``loadFile`` or ``load_y_sr``, then call
    ``melspectrogram``, ``extractmfcc`` and ``extractrmse`` in that order.
    Results are stored on the instance (``S``, ``log_S``, ``mfcc``,
    ``delta_mfcc``, ``delta2_mfcc``, ``M``, ``rmse``).
    """

    def __init__(self, n_mels=128):
        """Create an empty extractor.

        Args:
            n_mels: number of mel bands requested from librosa.
        """
        self.n_mels = n_mels
        self.y = None
        self.sr = 11025
        self.S = None
        self.log_S = None
        self.mfcc = None
        self.delta_mfcc = None
        self.delta2_mfcc = None
        self.M = None
        self.rmse = None
        self.foldername = None
        self.filename = None

    def loadFile(self, foldernname):
        """Load an audio file with ``librosa.load`` and remember where it came from.

        Args:
            foldernname: path of the audio file (parameter name kept for
                backward compatibility).
        """
        # Historical misspelled attribute is kept so existing readers keep working.
        self.foldernname = foldernname
        self.foldername = foldernname
        # Base name without extension; used in plot titles and saved image names.
        self.filename = os.path.splitext(os.path.basename(foldernname))[0]
        self.y, self.sr = librosa.load(foldernname)

    def load_y_sr(self, y, sr):
        """Use already-decoded samples ``y`` with sample rate ``sr``."""
        self.y = y
        self.sr = sr

    def melspectrogram(self):
        """Compute the mel spectrogram ``S`` and its dB-scaled version ``log_S``."""
        # `y` must be passed by keyword: recent librosa releases reject it positionally.
        self.S = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=self.n_mels)
        # NOTE(review): melspectrogram defaults to a power spectrogram, for which
        # librosa.power_to_db is the usual conversion. Left unchanged because
        # switching would alter every feature value a trained model depends on.
        self.log_S = librosa.amplitude_to_db(self.S)

    def plotmelspectrogram(self, save=True):
        """Plot ``log_S``; when ``save`` is true write ``./mel/<filename>-mel.png``."""
        fig = plt.figure(figsize=(12, 4))
        librosa.display.specshow(self.log_S, sr=self.sr, x_axis='time', y_axis='mel')
        plt.title(f'mel Power Spectrogram ({self.filename})')
        plt.colorbar(format='%+02.0f dB')
        plt.tight_layout()
        if save:
            # Only touch the filesystem when an image is actually written.
            os.makedirs('mel', exist_ok=True)
            fig.savefig(f'./mel/{self.filename}-mel.png', dpi=fig.dpi)
            print(f'Saved to ./mel/{self.filename}-mel.png')
            plt.close('all')

    def extractmfcc(self, n_mfcc=13):
        """Compute MFCCs from ``log_S`` plus first/second deltas, stacked into ``M``.

        Args:
            n_mfcc: number of cepstral coefficients.
        """
        self.mfcc = librosa.feature.mfcc(S=self.log_S, n_mfcc=n_mfcc)
        self.delta_mfcc = librosa.feature.delta(self.mfcc, mode='nearest')
        self.delta2_mfcc = librosa.feature.delta(self.mfcc, order=2, mode='nearest')
        self.M = np.vstack([self.mfcc, self.delta_mfcc, self.delta2_mfcc])

    def plotmfcc(self, save=False):
        """Plot MFCC, delta and delta-delta; optionally save ``./mfcc/<filename>-mfcc.png``."""
        fig = plt.figure(figsize=(12, 6))
        # NOTE(review): all three panels carry the mel-spectrogram title although
        # they show MFCC data - looks like a copy/paste leftover; confirm before renaming.
        plt.subplot(3, 1, 1)
        librosa.display.specshow(self.mfcc)
        plt.title(f'mel Power Spectrogram ({self.filename})')
        plt.ylabel('MFCC')
        plt.colorbar()

        plt.subplot(3, 1, 2)
        librosa.display.specshow(self.delta_mfcc)
        plt.title(f'mel Power Spectrogram ({self.filename})')
        # Raw strings: "\D" is not a valid escape sequence in a normal literal.
        plt.ylabel(r'MFCC-$\Delta$')
        plt.colorbar()

        plt.subplot(3, 1, 3)
        librosa.display.specshow(self.delta2_mfcc, sr=self.sr, x_axis='time')
        plt.title(f'mel Power Spectrogram ({self.filename})')
        plt.ylabel(r'MFCC-$\Delta^2$')
        plt.colorbar()

        plt.tight_layout()
        if save:
            os.makedirs('mfcc', exist_ok=True)
            fig.savefig(f'./mfcc/{self.filename}-mfcc.png', dpi=fig.dpi)
            print(f'Saved to ./mfcc/{self.filename}-mfcc.png')
            plt.close('all')

    def extractrmse(self):
        """Compute the frame-wise RMS energy of ``y`` into ``rmse``."""
        self.rmse = librosa.feature.rms(y=self.y)

In [127]:
class Dataset:
    """Build, save and load the 80-feature / 2-class dataset used for training.

    Each sample is the mean and variance of every MFCC, delta-MFCC and
    delta2-MFCC coefficient row plus the mean and variance of the RMS energy.
    With FeatureExtraction's default of 13 coefficients that is
    13 * 3 * 2 + 2 = 80 values. Labels are one-hot: ``success`` -> [0, 1],
    ``stuttered`` -> [1, 0].
    """

    # Folder name -> one-hot label row.
    _LABELS = {'success': [0, 1], 'stuttered': [1, 0]}

    def __init__(self,):
        self.X = np.empty(shape=(0,80))
        self.Y = np.empty(shape=(0,2))
        self.DATASET = None
        # Filled by load_dataset(); defined here so convert_to_csv() can test it.
        self.DATA = None
        self.PATH_ARRAY = []
        self.failed_file = []
        self.unexpected_label = []
        self.processed_counter = 0
        print("Object created!")

    def create_dataset(self,dataset_path,output_path):
        """Extract features for every labelled file under ``dataset_path`` and save them.

        Args:
            dataset_path: folder whose sub-folders are named after the labels.
            output_path: text file written with ``np.savetxt``.
        """
        self.DATASET_PATH = dataset_path
        self.OUTPUT_PATH = output_path
        self.__process_dataset()
        self.__write_to_file()

    @staticmethod
    def _feature_vector(features):
        """Flatten an extractor's results into the mean/variance feature list."""
        vector = []
        for block in (features.mfcc, features.delta_mfcc, features.delta2_mfcc):
            for coefficient in block:
                vector.append(np.mean(coefficient))
                vector.append(np.var(coefficient))
        vector.append(np.mean(features.rmse))
        vector.append(np.var(features.rmse))
        return vector

    @staticmethod
    def _extract_from_file(path):
        """Run the full extraction pipeline on an audio file; may raise ValueError."""
        features = FeatureExtraction()
        features.loadFile(path)
        features.melspectrogram()
        features.extractmfcc()
        features.extractrmse()
        return features

    def get_feature_by_audio(self,y,sr):
        """Return the feature vector for in-memory samples ``y`` at rate ``sr``.

        Raises:
            ValueError: propagated from the extraction step; the failure is also
                recorded in ``failed_file``.
        """
        try:
            features = FeatureExtraction()
            features.load_y_sr(y,sr)
            features.melspectrogram()
            features.extractmfcc()
            features.extractrmse()
        except ValueError:
            # There is no path for in-memory audio, so record a marker instead.
            self.failed_file.append('<in-memory audio>')
            raise
        return self._feature_vector(features)

    def get_feature_by_file(self,audio):
        """Return the feature vector for the audio file at path ``audio``.

        Raises:
            ValueError: propagated from the extraction step; the path is also
                recorded in ``failed_file``.
        """
        print("Extacting feature:", audio)
        try:
            features = self._extract_from_file(audio)
        except ValueError:
            self.failed_file.append(audio)
            raise
        return self._feature_vector(features)

    def __process_dataset(self):
        """Walk ``DATASET_PATH`` and append one X/Y row per successfully processed file."""
        starttime = time.time()
        root = os.path.normpath(self.DATASET_PATH)
        x_rows = []
        y_rows = []
        for dirpath, dirnames, filenames in os.walk(self.DATASET_PATH):
            # Files directly in the root have no label folder; skip them.
            # Compared by value: identity (`is`) only matched by accident.
            if os.path.normpath(dirpath) == root:
                continue
            label = os.path.basename(os.path.normpath(dirpath))
            print("Processing:", label)
            one_hot = self._LABELS.get(label)
            for file in filenames:
                file_path = os.path.join(dirpath,file)
                if one_hot is None:
                    # No X row is added either, so X and Y stay the same length.
                    self.unexpected_label.append(file_path)
                    print("Fail ", self.processed_counter, file_path,' label=',label)
                    continue
                try:
                    features = self._extract_from_file(file_path)
                except ValueError:
                    self.failed_file.append(file_path)
                    continue
                x_rows.append(self._feature_vector(features))
                y_rows.append(one_hot)
                self.processed_counter += 1
                print("Done ", self.processed_counter, file_path,' label=',label)

        # Stack once at the end instead of re-allocating X and Y for every file.
        if x_rows:
            self.X = np.vstack([self.X] + [[row] for row in x_rows])
            self.Y = np.vstack([self.Y] + [[row] for row in y_rows])

        for fail in self.unexpected_label:
            print("unexpected_label ", fail, " !")

        for fail in self.failed_file:
            print("fail ", fail, " !")

        print('Time taken = {} seconds'.format(time.time() - starttime))
        self.DATASET = np.hstack((self.X,self.Y))

    def load_dataset(self,dataset_path):
        """Load a saved dataset (``.csv`` or ``.gz``) into ``X`` and ``Y``.

        Returns:
            None when the file is missing or loaded, False for an unsupported
            extension (return values kept from the original implementation).
        """
        self.DATASET_PATH = dataset_path

        if os.path.exists(self.DATASET_PATH):
            print("Dataset exist!")
        else:
            print('Not found ',self.DATASET_PATH)
            return

        self.FILE_NAME, self.FILE_TYPE = os.path.splitext(self.DATASET_PATH)

        print("Loading ", self.DATASET_PATH)
        if self.FILE_TYPE == '.csv':
            print('Detect as .csv file')
            self.DATA = np.genfromtxt(self.DATASET_PATH, delimiter=',')
        elif self.FILE_TYPE == '.gz':
            print('Detect as .gz file')
            self.DATA = np.loadtxt(self.DATASET_PATH)
        else:
            print("Only support .gz and .csv file")
            return False

        # First 80 columns are features, the rest is the one-hot label.
        self.X = self.DATA[:, 0:80]
        self.Y = self.DATA[:, 80:]

    def convert_to_csv(self,output_file):
        """Write the loaded (or freshly created) dataset to ``output_file`` as CSV."""
        # Fall back to the in-memory dataset when nothing was loaded from disk.
        data = self.DATA if self.DATA is not None else self.DATASET
        if data is None:
            raise ValueError('No dataset available: call load_dataset() or create_dataset() first')
        if os.path.exists(output_file):
            os.remove(output_file)
        np.savetxt(output_file, data, delimiter=',')
        print('Converted to',output_file)

    def __write_to_file(self):
        """Save ``DATASET`` to ``OUTPUT_PATH``, replacing any existing file."""
        if os.path.exists(self.OUTPUT_PATH):
            os.remove(self.OUTPUT_PATH)

        np.savetxt(self.OUTPUT_PATH, self.DATASET)
        print('Saved to',self.OUTPUT_PATH)

    def get_x(self):
        """Return the feature matrix."""
        return self.X

    def get_y(self):
        """Return the one-hot label matrix."""
        return self.Y

In [6]:
class Timestamp:
    """Plain record describing one time span of the recording.

    Attributes are set verbatim from the constructor arguments:
    ``start`` / ``end`` of the span, the ``word`` text attached to it,
    an ``isInclude`` flag, an optional ``feature`` and an optional ``label``.
    """

    def __init__(self, start=0.0, end=0.0, word='word', isInclude=False, feature=None, label=None):
        self.start, self.end = start, end
        self.word = word
        self.isInclude = isInclude
        self.feature, self.label = feature, label

In [184]:
'''auto-editor.py'''
#python auto-editor.py lib.mp4 --frame_margin 8 --silent_threshold 0.03
# external python libraries
from scipy.io import wavfile
import soundfile as sf

import numpy as np
from tqdm import tqdm

# internal python libraries
import sys
import time
import math
import os
import subprocess
from shutil import move, rmtree, copyfile

import json

# tested with VOSK 0.3.15
import vosk
import librosa
import numpy
import pandas

# developed library
from Timecode import Timecode

class AutoEdit:
    def __init__(self, file=None, ba='160000', ac='1', ar='16000',output_format='.wav', fps = 30.0, 
                 st = 0.04, fm = 4, lt = 2.00, verbose = False,
                 log=False, mono = True, 
                 model='Model/mymodel_78_18.h5'):
        #parameter for ffmpeg to convert the file
        self.MODEL_PATH = model
        self.INPUT_FILE = file
        self.FILENAME = file.split('.')[0]
        self.AUDIO_OUTPUT_FORMAT = output_format
        self.AUDIO_OUTPUT = f'{self.FILENAME}{self.AUDIO_OUTPUT_FORMAT}'
        
        self.BITRATE_AUDIO = ba
        self.AUDIO_CHANEL = ac
        self.AUDIO_RATE = ar
        self.FRAME_RATE = fps
        
        self.FRAME_MARGIN = fm
        self.SILENT_THRESHOLD = st
        self.LOUDNESS_THRESHOLD = lt
        
        self.VERBOSE = verbose
        
        self.audioData = None
        self.sampleRate = None
        
        self.audioSampleCount = None
        self.maxAudioVolume = None
        self.samplesPerFrame = None
        self.audioFrameCount = None
        self.hasLoudAudio = None
        
        self.chunks = None
        self.shouldIncludeFrame = None
        self.timecodeList = None
        self.chunks_path = 'chunks.txt'
        self.log = log
        self.isMono = mono
            
    def extract_audio(self):
        if self.INPUT_FILE == None:
            print("No input file!")
            
        cmd = ['ffmpeg', '-y' ,'-i',self.INPUT_FILE,'-acodec','pcm_s16le' ,'-b:a', self.BITRATE_AUDIO, '-ac', self.AUDIO_CHANEL, 
               '-ar', self.AUDIO_RATE, '-vn', f'{self.AUDIO_OUTPUT}']
        #ffmpeg -i "%%a" -acodec pcm_s16le -ac 1 -ar 16000 -af lowpass=3000,highpass=200 "converted\%%~na.wav
        if(not self.VERBOSE):
            cmd.extend(['-nostats', '-loglevel', '0'])
        subprocess.call(cmd)
        
    def get_max_volume(self,s):
        maxv = float(np.max(s))
        minv = float(np.min(s))
        return max(maxv, -minv)

    def load_audio(self):
        # self.sampleRate,self.audioData = wavfile.read(f'{self.AUDIO_OUTPUT}')
        self.audioData,self.sampleRate = librosa.load(f'{self.AUDIO_OUTPUT}',
        mono = self.isMono,sr=self.sampleRate)

        self.audioSampleCount = self.audioData.shape[0]
        self.maxAudioVolume = self.get_max_volume(self.audioData)
        self.samplesPerFrame = self.sampleRate / self.FRAME_RATE
        self.audioFrameCount = int(math.ceil(self.audioSampleCount / self.samplesPerFrame))
    
    def get_shape(self):
        return self.audioData.shape
    
    def calc_has_loud_audio(self):
        self.hasLoudAudio = np.zeros((self.audioFrameCount))
        
        for i in range(self.audioFrameCount):
            start = int(i * self.samplesPerFrame)
            end = min( int( (i+1) * self.samplesPerFrame ), self.audioSampleCount)
            audiochunks = self.audioData[start:end]
            maxchunksVolume = self.get_max_volume(audiochunks) / self.maxAudioVolume
            
            if(maxchunksVolume >= self.LOUDNESS_THRESHOLD):
                self.hasLoudAudio[i] = 2
            elif(maxchunksVolume >= self.SILENT_THRESHOLD):
                self.hasLoudAudio[i] = 1
    
    def calc_should_include_frame(self):
        self.shouldIncludeFrame = np.zeros((self.audioFrameCount))
        self.chunks = [[0,0,0]]
        
        for i in range(self.audioFrameCount):
            start = int(max(0, i-self.FRAME_MARGIN))
            end = int(min(self.audioFrameCount, i+1+self.FRAME_MARGIN))
            self.shouldIncludeFrame[i] = min(1,np.max(self.hasLoudAudio[start:end]))

            if(i >= 1 and self.shouldIncludeFrame[i] != self.shouldIncludeFrame[i-1]):
                self.chunks.append([self.chunks[-1][1], i, self.shouldIncludeFrame[i-1]])
        self.chunks.append([self.chunks[-1][1], self.audioFrameCount, self.shouldIncludeFrame[i-1]])
        self.chunks = self.chunks[1:]
        
    def calc_timecode(self):
        self.timecodeList = []
        
        for chunk in self.chunks:
            startTime = Timecode(fps=self.FRAME_RATE)
            endTime = Timecode(fps=self.FRAME_RATE)
            
            startTime.set_by_frames(chunk[0])
            endTime.set_by_frames(chunk[1])
            isInclude = chunk[2]
            self.timecodeList.append([startTime,endTime,isInclude])
            
    def execute(self):
        command = 'run.bat'
        if os.path.exists('run.bat'):
            if self.log:
                 command += ' > log.txt'    
            output = subprocess.call(command,shell=True)
        if self.VERBOSE:
            print("Complex frilter command success") if output == 0 else print("Complex filter command failed!")
      
            
            
    def write_to_bat(self,command):
        if os.path.exists('run.bat'):
            os.remove(f'run.bat')
        file1 = open("run.bat","w")
        file1.write(command)
        file1.close()
        filename = 'run.bat'
        if self.log:
            filename += ' > log.txt'
        return filename
    
    def produce_concat_file(self):
        if os.path.exists(self.chunks_path):
            os.remove(self.chunks_path)
            
        with open(self.chunks_path, 'w') as f:
            for index in range(len(self.timecodeList)):
                isInclude = float(self.timecodeList[index][2])
                if isInclude < 1:
                    continue;
                # startTime = self.timecodeList[index][0].get_timecode_ffmpeg()
                # endTime = self.timecodeList[index][1].get_timecode_ffmpeg()
                startTime = self.timecodeList[index][0].get_seconds()
                endTime = self.timecodeList[index][1].get_seconds()
                f.write(f'file {self.INPUT_FILE}\ninpoint {startTime}\noutpoint {endTime}\n')
    
    def concat_way(self):
        concat = ['ffmpeg','-y','-f','concat','-safe','0','-i', f'{self.chunks_path}',
                 '-async','1','-framerate', f'{self.FRAME_RATE}','-b:a', f'{self.BITRATE_AUDIO}',
                 '-c:v', 'copy', '-ar', f'{self.AUDIO_RATE}', '-ac', f'{self.AUDIO_CHANEL}',
                 '-c:a','aac','-movflags','+faststart',f'{self.FILENAME}_CONCATED.mp4']
        subprocess.call(concat)
        
    def select_filter(self):
        
        between = []
        counter = 0
        for i in self.timecodeList:
            if i[2] > 0:
#                 print(f'{self.INPUT_FILE},{i[0].get_seconds()},{i[1].get_seconds()}')
                between.append(f'between(t,{i[0].get_seconds()},{i[1].get_seconds()})') 
        
        betweens = '+'.join(between)
        slt = '\"select=\'' + betweens + '\'' + ',setpts=N/FRAME_RATE/TB\"'
        aslt = '\"aselect=\'' + betweens + '\'' + ',asetpts=N/SR/TB\"'
        
        sltFilter = ['ffmpeg','-y','-i',f'{self.INPUT_FILE}', '-vf', 
                     f'{slt}','-af', f'{aslt}',
                     f'{self.FILENAME}_FILTERED.mp4']
        
        total_string = ' '.join(sltFilter)
#         if self.log:
#             total_string += " > log.txt 2>&1";
        bat_path = self.write_to_bat(total_string)
        output = subprocess.call(bat_path,shell=True)
        if self.VERBOSE:
            print("Select filter command success") if output == 0 else print("Select filter command failed!")
            
    def remove_silence(self):
        trim = []
        duration_list = []
        number_of_segment = 0
        prev = 0
        current = 0

        # with out xfade
        for i in self.timecodeList:
            if i[2] > 0:
                duration_list.append(i[0].get_seconds()-i[1].get_seconds())
                trim.append(
                    f'[0:v]trim=start={i[0].get_seconds()}:end={i[1].get_seconds()},setpts=PTS-STARTPTS[v{number_of_segment}]')
                trim.append(
                    f'[0:a]atrim=start={i[0].get_seconds()}:end={i[1].get_seconds()},asetpts=PTS-STARTPTS[a{number_of_segment}]')
                number_of_segment += 1

                
        filter = ';'.join(trim)
        filter = filter + ";"

        # Normal cut feature
        for i in range(number_of_segment):
            filter += f' [v{i}] [a{i}]'

        # Start to generate ending of command
        filter += f'concat=n={number_of_segment}:v=1:a=1 [out]'
        filter = '"' + filter + '"'
        filter = f'ffmpeg -y -i {self.INPUT_FILE} -filter_complex ' + filter
        filter = filter + f' -map "[out]" {self.FILENAME}_SILENCE.mp4'
            
        bat_path = self.write_to_bat(filter)     
    

    def fliter_complex(self):
        trim = []
        duration_list = []
        number_of_segment = 0
        prev = 0
        current = 0

        # with out xfade
        for i in self.timecodeList:
            if i[2] > 0:
                duration_list.append(i[0].get_seconds()-i[1].get_seconds())
                trim.append(
                    f'[0:v]trim=start={i[0].get_seconds()}:end={i[1].get_seconds()},setpts=PTS-STARTPTS[v{number_of_segment}]')
                trim.append(
                    f'[0:a]atrim=start={i[0].get_seconds()}:end={i[1].get_seconds()},asetpts=PTS-STARTPTS[a{number_of_segment}]')
                number_of_segment += 1

        # # with xfade
        # for i in self.timecodeList:
        #     if i[2] > 0:
        #         duration_list.append(i[1].get_seconds()-i[0].get_seconds())
        #         trim.append(
        #             f'[0:v]trim=start={i[0].get_seconds()}:end={i[1].get_seconds()},setpts=PTS-STARTPTS[v{number_of_segment}]')
        #         number_of_segment += 1
        # number_of_segment = 0
        # for i in self.timecodeList:
        #     if i[2] > 0:
        #         trim.append(
        #             f'[0:a]atrim=start={i[0].get_seconds()}:end={i[1].get_seconds()},asetpts=PTS-STARTPTS[a{number_of_segment}]')
        #         number_of_segment += 1
                
        filter = ';'.join(trim)
        filter = filter + ";"

        # Normal cut feature
        for i in range(number_of_segment):
            filter += f' [v{i}] [a{i}]'

        # # Generate for xfade effect
        # prevOffset = 0
        # count = 1
        # for i in range(number_of_segment):
        #     offset = duration_list[i] + prevOffset - 0.5
        #     if(i < 2):
        #         filter += f'[v{i}]'
        #     if(i == 2):
        #         filter += f'xfade=transition=fade:duration=0.5:offset={offset}'
        #         filter += f'[x{count}];'
        #         filter += f'[x{count}]'
        #         filter += f'[v{i}]'
        #         filter += f'xfade=transition=fade:duration=0.5:offset={offset}'
        #         count += 1
        #         filter += f'[x{count}];'
        #     if(i > 2):
        #         filter += f'[x{count}]'
        #         filter += f'[v{i}]'
        #         filter += f'xfade=transition=fade:duration=0.5:offset={offset}'
        #         count += 1
        #         if(i == number_of_segment - 1):
        #             filter += f',format=yuv420p'
        #         filter += f'[x{count}];'
        #     prevOffset = offset

        # for i in range(number_of_segment):
        #     filter += f' [a{i}]'

        # Start to generate ending of command
        filter += f'concat=n={number_of_segment}:v=1:a=1 [out]'
        filter = '"' + filter + '"'
        filter = f'ffmpeg -y -i {self.INPUT_FILE} -filter_complex ' + filter
        filter = filter + f' -map "[out]" {self.FILENAME}_COMPLEX.mp4'


#         if self.log:
#             filter += ' > "log.txt" 2>&1'
            
        bat_path = self.write_to_bat(filter)
#         output = subprocess.call(bat_path,shell=True)
        output = 1
        if self.VERBOSE:
            print("Complex frilter command success") if output == 0 else print("Complex filter command failed!")
    
    
    def post_process(self):
        if os.path.exists(f'{self.chunks_path}'):
            os.remove(f'{self.chunks_path}')
            if self.VERBOSE:
                print(f"Removed {self.chunks_path}")
                
        if os.path.exists(f'{self.AUDIO_OUTPUT}'):
            os.remove(f'{self.AUDIO_OUTPUT}')
            if self.VERBOSE:
                print(f"Removed {self.AUDIO_OUTPUT}")
       
        
    def export_complex(self):
        self.pbar = tqdm(total=7)
        print("Start processing...")
        self.extract_audio()
        self.update_mypbar()
        self.load_audio()
        self.update_mypbar()
        self.calc_has_loud_audio()
        self.update_mypbar()
        self.calc_should_include_frame()
        self.update_mypbar()
        self.calc_timecode()
        self.update_mypbar()
        
        print(f'Exporting {self.FILENAME}_COMPLEX.mp4 ...')
        self.fliter_complex()
        self.update_mypbar()
        print(f'Exported {self.FILENAME}_COMPLEX.mp4 successfully!')
        
        self.post_process()
        self.update_mypbar()
        self.pbar.close()
        
    def export_fast(self):
        try:
            self.extract_audio()
            self.load_audio()
            self.calc_has_loud_audio()
            self.calc_should_include_frame()
            self.calc_timecode()
            self.produce_concat_file()
            self.concat_way()
            self.post_process()
            if(self.VERBOSE):
                print(f'Exported {self.FILENAME}_CONCATED.mp4 successfully!')
        except:
            print('Failed to export fast!')
            
    def update_mypbar(self):
        self.pbar.update(1)
        time.sleep(0.01)
        self.pbar.refresh()
            
    def export_good(self):
        self.pbar = tqdm(total=7)
        try:
            print("Start processing...")
            self.extract_audio()
            self.update_mypbar()

            self.load_audio()
            self.update_mypbar()

            self.calc_has_loud_audio()
            self.update_mypbar()
            self.calc_should_include_frame()
            self.update_mypbar()
            self.calc_timecode()
            self.update_mypbar()
            
            print(f'Exporting {self.FILENAME}_FILTERED.mp4 ...')
            self.select_filter()
            self.update_mypbar()
            print(f'Exported {self.FILENAME}_FILTERED.mp4 successfully!')
  
            self.post_process()
            self.update_mypbar()
            self.pbar.close()
        except:
            print(f'Failed to export {self.FILENAME}_FILTERED.mp4 !')
            
    def extract_words(self,res):
        jres = json.loads(res)
        if not 'result' in jres:
            return []
        words = jres['result']
        return words

    def transcribe_words(self,recognizer, bytes):
        results = []

        chunk_size = 4000
        for chunk_no in range(math.ceil(len(bytes)/chunk_size)):
            start = chunk_no*chunk_size
            end = min(len(bytes), (chunk_no+1)*chunk_size)
            data = bytes[start:end]

            if recognizer.AcceptWaveform(data):
                words = self.extract_words(recognizer.Result())
                results += words
        results += self.extract_words(recognizer.FinalResult())

        return results                

    def vosk_process(self):
        print('Loading vosk...')
        vosk.SetLogLevel(-1)
        int16 = np.int16(self.audioData * 32768).tobytes()
        model_path='vosk-model-en-us-aspire-0.2'
        vosk_model = vosk.Model(model_path)
        recognizer = vosk.KaldiRecognizer(vosk_model, 16000)
        print('Transcribing...')
        res = self.transcribe_words(recognizer, int16)
        df = pandas.DataFrame.from_records(res)
        df = df.sort_values('start')
        print('Completed transcribe')
        self.df = df
        
        
    def feature_process(self):
        # Process by using vosk
        self.audioData
        df = self.df
        model = tf.keras.models.load_model(self.MODEL_PATH)
        feature_file = f'{self.FILENAME}_feature.csv'
        
        sampleRate = cut.sampleRate
        fail_list = []
        time_margin = int( (  (1/self.FRAME_RATE) *self.FRAME_MARGIN ) )
        index_margin = int( (  (1/self.FRAME_RATE) *self.FRAME_MARGIN ) *self.sampleRate )

        if(os.path.exists(feature_file)):
            os.remove(feature_file)

        if(not os.path.exists(feature_file)):
            print("Extracting feature...")
            features = np.empty(shape=(0,80))
            ds = Dataset()
            for i in tqdm(df.index[:]): 
        #         start_index = int(df['start'][i] * sampleRate)
        #         end_index = int(df['end'][i] * sampleRate)
                start_index = max(0, int(  df['start'][i] * self.sampleRate))
                end_index = min( int( (df['end'][i]) * self.sampleRate), self.audioSampleCount)
        #         try:
                fea = ds.get_feature_by_audio(self.audioData[start_index:end_index],11025)
                features = np.vstack((features,[fea]))
        #         except:
        #             ts = Timestamp(df['start'][i],df['end'][i],word=df['word'][i])
        #             fail_list.append(ts)
        #             print('Failed index:',i)
            print(f'Saved features to {feature_file}')
            np.savetxt(feature_file, features, delimiter=',')


        print(f'Load feature from {feature_file}')
        features = np.loadtxt(feature_file,delimiter=',')

        print('Predicting...')
        predictions = model.predict(x=features, batch_size=84,verbose=0)
        print("Finish predict!")

        self.predictions = predictions
        include_list = []
        for i in tqdm(df.index[:]):
            isInclude = True
            predict = np.round(predictions[i])
            word = df['word'][i]
            if(word == "i'm" or word == 'um' or word =='m' or word=='ah'or word=='huh'or word=='hm'):
                if(predict == 1):
                    isInclude = False
            if(isInclude):
                start = df['start'][i]
                end = df['end'][i]
                ts = Timestamp(start,end,word=word,label=predict)
                include_list.append(ts)
                
        self.include_list = include_list        
        render_list = []
        counter = 0
        start = include_list[0].start
        end = include_list[0].end
        word = ""
        for i,ts in tqdm(enumerate(include_list)):
            current_start = ts.start
            current_end = ts.end
            prev_start = include_list[i-1].start
            prev_end = include_list[i-1].end
            if(i >= 1 and current_start != prev_end):
                segment = Timestamp(start,prev_end, word=word)
                word = ''
                start = current_start
                render_list.append(segment)
                counter = counter + 1
            word = word + ts.word + " "
#             print(segment.start, " ", segment.end," ", segment.word)        

        self.render_list = render_list
        
#         df = self.df
#         model = tf.keras.models.load_model(f'{self.MODEL_PATH}')
#         feature_file = f'{self.FILENAME}_feature.csv'
# #         df = self.df
# #         sampleRate = self.sampleRate
#         fail_list = []
#         time_margin = int( (  (1/self.FRAME_RATE) *self.FRAME_MARGIN ) )
#         index_margin = int( (  (1/self.FRAME_RATE) *self.FRAME_MARGIN ) *self.sampleRate )
        
#         if(os.path.exists(feature_file)):
#             os.remove(feature_file)

#         if(not os.path.exists(feature_file)):
# #             print('No feature cache')
#             print("Extracting feature...")
#             features = np.empty(shape=(0,80))
#             ds = Dataset()
#             for i in tqdm(df.index[:]): 
#                 start_index = max(0, int(  df['start'][i] * self.sampleRate))
#                 end_index = min( int( (df['end'][0]) * self.sampleRate), self.audioSampleCount)
#                 fea = ds.get_feature_by_audio(self.audioData[start_index:end_index],11025)
#                 features = np.vstack((features,[fea]))
#             print(f'Saved features to {feature_file}')
#             np.savetxt(feature_file, features, delimiter=',')


#         print(f'Load feature from {feature_file}')
#         features = np.loadtxt(feature_file,delimiter=',')

#         print('Predicting...')
#         predictions = model.predict(x=features, batch_size=84,verbose=0)
#         print("Finish predict!")
#         self.predictions = predictions

#         include_list = []
#         for i in tqdm(df.index[:]):
#             isInclude = True
#             predict = np.round(predictions[i])
#             word = df['word'][i]
#             if(word == "i'm" or word == 'um' or word =='m'):
#                 if(predict == 1):
#                     isInclude = False
#             if(isInclude):
#                 start = df['start'][i]
#                 end = df['end'][i]
#                 ts = Timestamp(start,end,word=word,label=predict)
#                 include_list.append(ts)
                
#         self.include_list = include_list
#         render_list = []
#         counter = 0
#         start = include_list[0].start
#         end = include_list[0].end
#         word = ""
#         for i,ts in tqdm(enumerate(include_list)):
#             current_start = ts.start
#             current_end = ts.end
#             prev_start = include_list[i-1].start
#             prev_end = include_list[i-1].end
#             if(i >= 1 and current_start != prev_end):
#                 segment = Timestamp(start,prev_end, word=word)
#                 word = ''
#                 start = current_start
#                 render_list.append(segment)
#                 counter = counter + 1
#             word = word + ts.word + " "
#         self.predictions = predictions
#         self.include_list = include_list
#         self.render_list = render_list
            


    def generate_complex_filter(self, render_list):
        """Build the ffmpeg hard-cut command for ``render_list`` and pass it to ``write_to_bat``.

        Each segment is trimmed out of the input's video and audio streams and
        the pieces are concatenated in order (no xfade between them).

        Args:
            render_list: iterable of objects exposing ``start`` and ``end``;
                the values are interpolated verbatim into ffmpeg's
                ``trim``/``atrim`` filters.

        The finished command string is handed to ``self.write_to_bat``; nothing
        is returned.
        """
        trim = []
        for index, ts in enumerate(render_list):
            # One video and one audio trim per segment, each with its
            # timestamps reset so the pieces can be concatenated.
            trim.append(
                f'[0:v]trim=start={ts.start}:end={ts.end},setpts=PTS-STARTPTS[v{index}]')
            trim.append(
                f'[0:a]atrim=start={ts.start}:end={ts.end},asetpts=PTS-STARTPTS[a{index}]')
        number_of_segment = len(trim) // 2

        filter_graph = ';'.join(trim) + ';'

        # Normal cut feature: list every [vN] [aN] pair as concat inputs.
        filter_graph += ''.join(f' [v{i}] [a{i}]' for i in range(number_of_segment))
        filter_graph += f'concat=n={number_of_segment}:v=1:a=1 [out]'

        # Paths come from this instance, not from a notebook-level global.
        # NOTE(review): both paths are interpolated unquoted, so a path that
        # contains spaces would split into several ffmpeg arguments.
        command = (
            f'ffmpeg -y -i {self.INPUT_FILE} -filter_complex "{filter_graph}"'
            f' -map "[out]" {self.FILENAME}_COMPLEX.mp4'
        )
        self.write_to_bat(command)
        
        
#     print(segment.start, " ", segment.end," ", segment.word)
        
        
# All below is overlap technique
        
        
#         segment_length = 300
#         segment_hop = 100
#         samples_per_segment = int(segment_length * self.sampleRate / 1000)
#         samples_to_skip_per_hop = int(segment_hop * self.sampleRate / 1000)
#         print('samples_per_segment:',samples_per_segment,' samples_to_skip_per_hop',samples_to_skip_per_hop)
#         ts_list_index = []
        
#         ts_list = []
#         for ts in self.timecodeList:
#             if ts[2] > 0:
#                 ts_list.append(ts)
#         self.ts_list = ts_list


# """ remove this first to use the first overlap technique   
# #         Implementing overlap technique

# #         convert the time in seconds to index (sample rate) for audio data
#         counter = 0
#         for timecode in ts_list:
#             if timecode[2] > 0:
#                 start_time = float(timecode[0].get_seconds())
#                 end_time = float(timecode[1].get_seconds())
#                 start_index = int(start_time * self.sampleRate)
#                 end_index = int(end_time * self.sampleRate)
#                 # start_index = librosa.time_to_samples(start_time)
#                 # end_index = librosa.time_to_samples(end_time)
#                 ts = Timestamp(start_index,end_index,timecode[2])
# #                 sf.write(f'temp/{counter}.wav',self.audioData[start_index:end_index],self.sampleRate)
#                 counter = counter + 1
#                 ts_list_index.append(ts)
#         # for tx in ts_list_index:
#         #     print('start:',tx.start,'end:',tx.end)
#         # print("====")
        
#         self.ts_list_index = ts_list_index
        
# #         generate overlapping file
#         hop_list = []
#         for ts in ts_list_index:
#             for start_index in range(ts.start,ts.end,samples_to_skip_per_hop):
#                 end = start_index+samples_per_segment
# #                 print('start:',start_index,'end:',end)
#                 hop = Timestamp(start_index,end,ts.label)
#                 hop_list.append(hop)

#         # for hop in hop_list:
#         #     print('start:',hop.start,'end:',hop.end,'isInclude',hop.isInclude)

# #         Detect cached feature file if not generate the feature and store it as csv
#         feature_file = f'{self.FILENAME}_feature.csv'
#         if(not os.path.exists(feature_file)):            
#             ds = Dataset()
#             print("Extracting feature...")
#             features = np.empty(shape=(0,80))
#             for hop in hop_list:
# #                 audioData = librosa.core.resample(self.audioData[hop.start:hop.end],
# #                                                   self.sampleRate,target_sr=11025)
#                 fea = ds.get_feature_by_audio(self.audioData[hop.start:hop.end],11025)
#                 hop.feature = np.array(([fea]))
#                 features = np.vstack((features,[fea]))
#             print(f'Saved features to {feature_file}')
#             np.savetxt(feature_file, features, delimiter=',')   
# #             else load the features from the cached csv file
#         else:
#             print(f'Load feature from {feature_file}')
#             features = np.loadtxt(feature_file,delimiter=',')
# #         promote hop_list for testing purpose
#         self.hop_list = hop_list
#         print("Finish extract feature!")
# #         load the model
#         model = tf.keras.models.load_model(self.MODEL_PATH)
#         print('Predicting...')
#         self.predictions = model.predict(x=features, batch_size=20,verbose=0)
#         print("Finish predict!")
#         for i,hop in enumerate(self.hop_list):
#             hop.label = np.round(self.predictions[i])


# #             if previous and current label is stuttered so eliminate the current hop

# #         prevLabel = 0
# #         self.isInclude = []
# #         for hop in self.hop_list:
# #             if(hop.label == 0):
# #                 self.isInclude.append(hop)
# #             elif(hop.label == 1 and prevLabel == 0):
# #                 self.isInclude.append(hop)
# #             prevLabel = hop.label
            
# #         self.isSegment = []
# #         index = 0
# #         end = 0
# #         start = 0
                
                
            
# # implement own technique                
            
# #    Assign label for the hop in hop list
# # This part improve by the batch preductions at above
# # Significantly increase the performance

# #         for hop in hop_list:
# #             lbl = model.predict(hop.feature)
# #             hop.label = np.round(lbl[0][0])
# #         labels = []
# #         print('Finish predict!')


# # assign the predicted label to labels list
# # extract label from the hop list to generate another list called labels for
# # below process

# #         for i,hop in enumerate(hop_list):
# #             print(hop.label)
# #             labels.append(hop.label)

# # Divide all segment by 3 to calculate the should include frame

# #         number_of_segment = int(len(hop_list)/3)
# #         rest_number_of_segment = int(len(hop_list)%3)
# #         print("Hop-len:",number_of_segment)
# #         print("Rest-len:",rest_number_of_segment)
# #         segment = []
# #         segment_count = 0
# #         for i in range(number_of_segment):
# #             hop = hop_list[i]
# #             ts = Timestamp()
# #             print(hop.start," ",hop.end)

            
# #          print all needed segment start time and endime

#         # print(f"start_time: {start_time} end_time: {end_time} duration:{end_time - start_time} start_index: {start_index} end_index: {end_index}")

#         # for i,timecode in enumerate(ts_list_index):
#         #     start_index = timecode.start
#         #     end_index = timecode.end
#         #     for j in range(start_index,end_index,150):
#         #         print(j)
#         #     if i > 1:
#         #         break


# #         export each segment to individual wav file

#         # # Work successful to export each segment
#         # if os.path.exists('temp/segment'):
#         #     shutil.rmtree('temp/segment')
#         # if not os.path.exists('temp'):
#         #     os.mkdir('temp')
#         # if not os.path.exists('temp/segment'):
#         #     os.mkdir('temp/segment')
#         # for i, timecode in enumerate(ts_list_index):
#         #     librosa.output.write_wav(f'temp/segment/{i}.wav',
#                     y=self.audioData[timecode.start:timecode.end],sr=self.sampleRate)
    
# """ remove this first to use the first overlap technique

In [191]:
# Create the AutoEdit instance for SBHD.mp4 and run the audio + vosk steps.
# The commented-out `model=` lines are alternative model files; the trailing
# `#NNN` numbers are the author's own notes (their meaning is not shown here).
cut = AutoEdit(file='SBHD.mp4',ac='1',
               verbose=True,fm=4,st=0.2,fps=24.00,
               log=True,mono=True,
               model='Model/mymodel/mymodel_78_18.h5' #159
#                model='Model/newmodel/20201227-1123-MLP-RMSprop-Default-80-123.h5' #100
#                model='test1/model-ep202-loss0.145-acc0.889-val_loss0.176-val_acc0.877.h5' #135
#                model='test2/model-ep007-loss21.212-acc0.550-val_loss0.443-val_acc0.912.h5' #153
#                model='test2/20201228-0913-MLP1.2k-ADAM-BC-relu-sigmoid-75-96.h5' #120
#                model = '20201228-0824-MLP1.2k-ADAM-MSE-relu-sigmoid-73-32.h5' #117
#                model = '20201228-0152-MLP-ADAM-MS-tanh-sigmoid-75-17.h5'#120
#                model ='20201228-0152-MLP-ADAM-MSE-77-16.h5' #96      
#                model = 'test1/20201228-0824-MLP-ADAM-MSE-relu-sigmoid-77-23.h5' #120
              )
# Steps run in this order; the calc_* steps below are disabled in favour of
# vosk_process (presumably the vosk transcription step, given the
# "Loading vosk... / Transcribing..." output recorded for this cell).
cut.extract_audio()
cut.load_audio()
# cut.calc_has_loud_audio()
# cut.calc_should_include_frame()
# cut.calc_timecode()
cut.vosk_process()
# cut.process_feature_vosk()
# cut.export_good()

Loading vosk...
Transcribing...
Completed transcribe


In [186]:
# Point the existing `cut` instance at a different model file.
cut.MODEL_PATH = 'Model/newmodel/20201227-1123-MLP-RMSprop-Default-80-123.h5'

In [192]:
# Run the feature/prediction step on `cut`. The recorded output below shows it
# extracting features, saving and reloading SBHD_feature.csv, then predicting.
cut.feature_process()

  1%|█▏                                                                                | 8/542 [00:00<00:07, 74.97it/s]

Extracting feature...
Object created!


100%|███████████████████████████████████████████████████████████████████████████████| 542/542 [00:05<00:00, 103.32it/s]


Saved features to SBHD_feature.csv
Load feature from SBHD_feature.csv
Predicting...


100%|█████████████████████████████████████████████████████████████████████████████| 542/542 [00:00<00:00, 18115.49it/s]
515it [00:00, 172089.43it/s]

Finish predict!





In [193]:
# Build the ffmpeg concat command from the merged segments (the method hands it
# to write_to_bat), then call execute() — presumably this runs the written
# command; confirm against AutoEdit.execute.
cut.generate_complex_filter(cut.render_list)
cut.execute()

Complex frilter command success


In [181]:
# Dump every kept word: start, end, word and label, separated by three spaces.
for kept in cut.include_list:
    fields = (kept.start, kept.end, kept.word, kept.label)
    print(*fields, sep="   ")

1.47   1.86   oh   [0.]
3.06   3.42   oh   [0.]
3.48   3.75   shit   [0.]
7.8   8.07   ah   [1.]
8.1   9.06   hua   [0.]
9.06   9.27   lots   [0.]
9.27   9.42   of   [1.]
9.42   9.9   guys   [1.]
12.48   12.66   i'm   [0.]
12.66   13.17   judy   [0.]
13.41   13.889988   and   [0.]
14.28   14.43   i   [0.]
14.43   14.76   was   [1.]
14.94   15.27   wanting   [0.]
15.27   15.39   to   [0.]
15.39   15.57   make   [0.]
15.57   15.780038   this   [0.]
15.780038   16.83   shoe   [0.]
16.83   17.009954   our   [0.]
17.009954   17.19   little   [0.]
17.19   17.61   real   [0.]
21.0   21.81   about   [0.]
21.87   23.37   whites   [0.]
23.49   24.48   us   [0.]
25.23   25.56   a   [1.]
25.59   26.55   sapphire   [0.]
26.55   26.88   ring   [0.]
26.91   27.15   is   [0.]
27.15   27.69   like   [0.]
30.63   30.93   why   [0.]
30.96   31.32   is   [0.]
31.35   31.74   henry   [1.]
31.74   32.28   like   [0.]
35.67   35.85   it   [0.]
42.12   42.72   ah   [1.]
44.19   44.43   it's   [0.]
44.43   45.

In [167]:
# List each word in cut.df with its timing and the rounded prediction.
word_table = cut.df
for row_label in tqdm(word_table.index[:]):
    rounded = np.round(cut.predictions[row_label])
    print(
        word_table['start'][row_label],
        word_table['end'][row_label],
        word_table['word'][row_label],
        " ",
        rounded,
    )

 27%|█████████████████████                                                         | 143/531 [00:00<00:00, 1419.61it/s]

0.81 1.05 i'm   [1.]
1.05 1.29 hailed   [0.]
1.29 1.47 as   [1.]
1.47 1.65 it's   [0.]
1.68 2.191538 major   [0.]
2.191538 2.61 eighty   [0.]
3.06 3.42 i'm   [1.]
3.42 3.51 i   [1.]
3.51 3.63 just   [1.]
3.63 3.81 wanted   [0.]
3.81 3.87 to   [1.]
3.87 4.05 make   [1.]
4.05 4.32 this   [0.]
4.41 4.62 show   [0.]
6.36 6.63 a   [0.]
6.66 6.96 short   [0.]
6.96 7.11 little   [1.]
7.11 7.62 video   [0.]
7.65 8.31 i'm   [1.]
8.34 8.46 i   [1.]
8.46 8.580124 know   [0.]
8.580124 8.67 what   [0.]
8.67 8.729928 to   [1.]
8.729928 8.91 say   [0.]
8.91 9.09 that   [0.]
9.09 9.36 about   [0.]
9.45 9.9 every   [0.]
9.9 10.176273 single   [0.]
10.176273 10.469998 video   [0.]
10.470002 10.71 that   [0.]
10.77 11.25 ideal   [0.]
12.39 12.72 m   [1.]
13.86 14.25 but   [1.]
14.34 14.46 i   [1.]
14.46 14.73 had   [0.]
14.73 15.36 someone   [1.]
16.38 16.8 message   [0.]
16.8 16.95 me   [0.]
16.95 17.07 on   [0.]
17.07 17.13 to   [1.]
17.13 17.31 you   [0.]
17.31 17.79 too   [0.]
18.18 18.72 i'm   [1.]


 56%|████████████████████████████████████████████▎                                  | 298/531 [00:00<00:00, 836.68it/s]

in   [1.]
107.160549 107.37 so   [0.]
107.37 107.73 excited   [0.]
107.73 107.91 for   [0.]
107.91 108.0 it   [0.]
108.0 108.09 at   [0.]
108.09 108.42 all   [1.]
109.77 109.98 but   [0.]
109.98 110.4 um   [1.]
111.33 111.78 and   [0.]
111.84 112.2 then   [1.]
112.2 112.29 the   [0.]
112.29 112.47 other   [1.]
112.47 112.89 thing   [0.]
113.31 113.94 i'm   [1.]
114.12 114.33 that   [0.]
114.33 114.48 they   [0.]
114.48 114.81 said   [0.]
114.81 114.93 they   [0.]
114.93 115.2 talked   [0.]
115.2 115.83 about   [0.]
116.97 117.54 chemistry   [0.]
118.23 118.41 my   [0.]
118.41 119.16 blocks   [0.]
120.24 120.48 big   [0.]
120.48 120.78 field   [0.]
120.78 120.9 is   [0.]
120.900117 121.11 talk   [0.]
121.11 121.41 about   [0.]
122.43 122.73 walks   [0.]
122.76 123.27 way   [0.]
125.01 125.13 i   [1.]
125.13 125.58 have   [0.]
125.58 125.76 done   [0.]
125.76 125.85 in   [1.]
125.85 125.94 the   [1.]
125.94 126.48 past   [1.]
126.81 126.93 you   [0.]
126.93 127.05 can   [0.]
127.05 127.3

 75%|███████████████████████████████████████████████████████████▏                   | 398/531 [00:00<00:00, 735.55it/s]

180.12 a   [1.]
180.12 180.45 feeling   [1.]
180.78 181.05 taking   [0.]
181.05 181.17 my   [0.]
181.17 181.56 tongue   [0.]
181.56 182.01 l   [1.]
182.76 183.45 a   [1.]
184.74 184.83 i   [1.]
184.83 184.95 don't   [1.]
184.95 185.16 really   [0.]
185.16 185.34 know   [0.]
185.34 185.91 why   [0.]
186.84 186.93 i   [1.]
186.93 187.02 don't   [1.]
187.02 187.17 really   [0.]
187.17 187.29 know   [0.]
187.29 187.62 where   [1.]
187.8 188.16 where   [0.]
188.16 188.25 that   [1.]
188.25 188.52 came   [0.]
188.52 188.91 from   [1.]
190.47 190.768594 cal   [0.]
190.768594 191.34 works   [0.]
191.37 192.06 so   [0.]
192.93 193.11 kind   [0.]
193.11 193.17 of   [1.]
193.17 193.44 helps   [0.]
193.44 193.8 me   [0.]
193.83 194.37 i'm   [1.]
194.4 194.61 with   [0.]
194.61 195.3 blocking   [0.]
198.54 199.05 and   [0.]
200.52 201.0 i'm   [1.]
201.72 202.23 i'm   [0.]
204.06 204.33 i   [1.]
204.33 204.54 know   [0.]
204.54 204.69 this   [0.]
204.69 204.81 is   [1.]
204.81 204.93 a   [1.]
204.96

100%|███████████████████████████████████████████████████████████████████████████████| 531/531 [00:00<00:00, 785.27it/s]

237.78 238.02 it   [0.]
239.19 240.06 comment   [0.]
240.81 241.17 comment   [0.]
241.17 241.32 what   [0.]
241.32 241.44 you   [0.]
241.74 242.01 want   [0.]
242.01 242.1 to   [0.]
242.1 242.31 see   [0.]
242.31 242.64 next   [0.]
242.64 242.82 door   [0.]
242.82 242.91 if   [0.]
242.91 243.0 you   [0.]
243.0 243.18 have   [0.]
243.18 243.33 any   [1.]
243.33 244.02 questions   [0.]
244.68 245.04 about   [0.]
245.49 245.97 entering   [0.]
246.63 247.17 or   [0.]
247.2 247.5 about   [0.]
247.5 247.71 my   [0.]
247.71 248.76 experiences   [0.]
248.76 248.97 with   [1.]
248.97 249.12 it   [1.]
249.15 249.63 i'm   [1.]
250.38 251.01 orders   [0.]
251.49 252.15 ideas   [0.]
252.15 252.3 for   [0.]
252.3 252.54 pretty   [0.]
252.54 252.9 much   [0.]
253.05 253.23 any   [0.]
253.23 253.65 video   [0.]
253.68 253.95 i'm   [0.]
253.95 254.16 just   [0.]
254.19 254.55 laid   [0.]
254.58 254.82 down   [0.]
254.85 254.97 in   [0.]
254.97 255.06 the   [1.]
255.06 255.6 comments   [0.]
255.9 256.5 




In [None]:
# Rebuild the ffmpeg concat command from the current cut.render_list.
cut.generate_complex_filter(cut.render_list)

In [None]:
# Show each merged segment held on the AutoEdit instance: start, end, words.
for piece in cut.render_list:
    print(*(piece.start, piece.end, piece.word), sep="   ")

In [None]:
# Show each merged segment from the notebook-level render_list: start, end, words.
for piece in render_list:
    print(*(piece.start, piece.end, piece.word), sep="   ")

In [12]:
# Display the word table held in cut.df (the recorded output shows the columns
# conf, end, start and word).
cut.df
# cut.df.to_csv('now.csv', index=False)

Unnamed: 0,conf,end,start,word
0,0.767439,1.230000,1.050000,hey
1,0.767439,1.470000,1.230000,guys
2,1.000000,1.650000,1.470000,it's
3,1.000000,2.191272,1.680000,major
4,0.901827,2.580000,2.191272,eighty
...,...,...,...,...
530,0.993181,274.230000,273.990000,in
531,0.997430,274.439971,274.320000,that
532,0.999491,274.890000,274.439971,actually
533,0.998176,274.920000,274.890000,i


Everything below is processed using vosk

In [131]:
# Process by using vosk

# For every word in cut.df: slice its audio, extract a feature row, run the
# Keras model on all rows at once, then keep each word unless it is one of
# FILLER_WORDS and its rounded prediction is 1.
FILLER_WORDS = ("i'm", 'um', 'm')

model = tf.keras.models.load_model('Model/newmodel/20201227-1123-MLP-RMSprop-Default-80-123.h5')
feature_file = f'{cut.FILENAME}_feature.csv'
df = cut.df
sampleRate = cut.sampleRate

# Any existing cache file is removed first, so the features are always
# re-extracted rather than reused from an earlier run.
if os.path.exists(feature_file):
    os.remove(feature_file)

print("Extracting feature...")
ds = Dataset()
feature_rows = []
for i in tqdm(df.index[:]):
    # Convert the word's start/end (seconds) to sample indices, clamped to the
    # bounds of the loaded audio.
    start_index = max(0, int(df['start'][i] * sampleRate))
    end_index = min(int(df['end'][i] * sampleRate), cut.audioSampleCount)
    # NOTE(review): 11025 is passed as the rate while the slice is taken at
    # cut.sampleRate — confirm get_feature_by_audio expects this value.
    fea = ds.get_feature_by_audio(cut.audioData[start_index:end_index], 11025)
    feature_rows.append([fea])
# Stack once after the loop (instead of re-copying the array per word); the
# empty (0, 80) seed keeps the 80-column shape check and the float dtype.
features = np.vstack([np.empty(shape=(0, 80))] + feature_rows)
print(f'Saved features to {feature_file}')
np.savetxt(feature_file, features, delimiter=',')

print(f'Load feature from {feature_file}')
# ndmin=2 keeps a single-word transcript as a (1, 80) batch, not a 1-D vector.
features = np.loadtxt(feature_file, delimiter=',', ndmin=2)

print('Predicting...')
predictions = model.predict(x=features, batch_size=84, verbose=0)
print("Finish predict!")

include_list = []
# predictions rows follow the iteration order used above, so they are looked
# up by position; the DataFrame columns are still looked up by index label.
for position, i in enumerate(tqdm(df.index[:])):
    predict = np.round(predictions[position])
    word = df['word'][i]
    if word in FILLER_WORDS and predict == 1:
        continue
    ts = Timestamp(df['start'][i], df['end'][i], word=word, label=predict)
    include_list.append(ts)


  1%|▉                                                                                 | 6/535 [00:00<00:09, 56.23it/s]

Extracting feature...
Object created!


100%|████████████████████████████████████████████████████████████████████████████████| 535/535 [00:05<00:00, 89.63it/s]


Saved features to sbcon_feature.csv
Load feature from sbcon_feature.csv
Predicting...


100%|█████████████████████████████████████████████████████████████████████████████| 535/535 [00:00<00:00, 10518.64it/s]

Finish predict!





In [None]:
# List each word in the notebook-level df with its timing and rounded prediction.
for row_label in tqdm(df.index[:]):
    rounded = np.round(predictions[row_label])
    print(
        df['start'][row_label],
        df['end'][row_label],
        df['word'][row_label],
        " ",
        rounded,
    )

In [None]:
# Merge back-to-back kept words into contiguous segments to render.
# A new segment starts whenever a word does not begin exactly where the
# previous kept word ended.
render_list = []
if include_list:
    start = include_list[0].start
    word = ""
    for i, ts in enumerate(include_list):
        if i >= 1 and ts.start != include_list[i - 1].end:
            # Gap before this word: close the running segment at the previous
            # word's end and start a new one here.
            render_list.append(Timestamp(start, include_list[i - 1].end, word=word))
            word = ""
            start = ts.start
        word = word + ts.word + " "
    # Close the final run of words; otherwise the last segment would never be
    # added to render_list.
    render_list.append(Timestamp(start, include_list[-1].end, word=word))

# Print each finished segment once, after it exists.
for segment in render_list:
    print(segment.start, " ", segment.end, " ", segment.word)

In [28]:
# Build the ffmpeg command that trims every segment in render_list out of the
# input and concatenates the pieces (hard cuts, without xfade), then pass it to
# cut.write_to_bat. The command is kept in `ffmpeg_command` so the builtin
# `filter` is not overwritten for the rest of the notebook.
trim = []
duration_list = []
number_of_segment = 0

for ts in render_list:
    duration_list.append(ts.end - ts.start)
    trim.append(
        f'[0:v]trim=start={ts.start}:end={ts.end},setpts=PTS-STARTPTS[v{number_of_segment}]')
    trim.append(
        f'[0:a]atrim=start={ts.start}:end={ts.end},asetpts=PTS-STARTPTS[a{number_of_segment}]')
    number_of_segment += 1

filter_graph = ';'.join(trim) + ';'

# Normal cut feature: list every [vN] [aN] pair as concat inputs.
filter_graph += ''.join(f' [v{n}] [a{n}]' for n in range(number_of_segment))
filter_graph += f'concat=n={number_of_segment}:v=1:a=1 [out]'

ffmpeg_command = (
    f'ffmpeg -y -i {cut.INPUT_FILE} -filter_complex "{filter_graph}"'
    f' -map "[out]" {cut.FILENAME}_COMPLEX.mp4'
)
bat_path = cut.write_to_bat(ffmpeg_command)

In [None]:
# Show every kept word from the notebook-level include_list: start, end, word.
for kept in include_list:
    print(*(kept.start, kept.end, kept.word), sep="   ")

In [None]:
# for ts in include_list:
#     newstart = max(0,   (ts.start - frame_margin) )
#     newend = min(  (ts.end + frame_margin), (cut.audioSampleCount/cut.sampleRate) )
#     ts.start = newstart
#     ts.end = newend

In [70]:
# index_margin = int( (  (1/cut.FRAME_RATE) *4 ) *cut.sampleRate )
# start_index = max(0, int(  df['start'][0] * cut.sampleRate - index_margin))
# end_index = min( int( (df['end'][0]) * cut.sampleRate + index_margin), cut.audioSampleCount)
# print(df['start'][0], " ",start_index)
# print(df['end'][0], " ",end_index)
# print(index_margin)

# frame_margin = ( (1/cut.FRAME_RATE) *cut.FRAME_MARGIN )
# index_margin = int( (  (1/cut.FRAME_RATE) *cut.FRAME_MARGIN ) *cut.sampleRate )
# print(index_margin)
# print(frame_margin)

2133
0.13333333333333333


In [79]:
# for ts in include_list:
#     newstart = max(0,   (ts.start - frame_margin) )
#     newend = min(  (ts.end + frame_margin), (cut.audioSampleCount/cut.sampleRate) )
#     ts.start = newstart
#     ts.end = newend

https://github.com/keras-team/keras/issues/14040
KeyError: 'sample_weight_mode'
Fix: "pip install --upgrade tensorflow --user"
Then restart the kernel

librosa change log

https://stackoverflow.com/questions/63997969/attributeerror-module-librosa-has-no-attribute-output

In [31]:
#overlap technique


# print(len(cut.isInclude))
# print(len(cut.hop_list))
# print(len(cut.ts_list_index))
# print(cut.sampleRate)
# for ts in cut.ts_list_index:
#     print(ts.start," ",ts.end, ((ts.end-ts.start)/cut.sampleRate))

1383
1970
136
16000


https://medium.com/@vvk.victory/audio-processing-librosa-split-on-silence-8e1edab07bbb

In [7]:
# def extract_words(res):
#     jres = json.loads(res)
#     if not 'result' in jres:
#         return []
#     words = jres['result']
#     return words

# def transcribe_words(recognizer, bytes):
#     results = []

#     chunk_size = 4000
#     for chunk_no in range(math.ceil(len(bytes)/chunk_size)):
#         start = chunk_no*chunk_size
#         end = min(len(bytes), (chunk_no+1)*chunk_size)
#         data = bytes[start:end]

#         if recognizer.AcceptWaveform(data):
#             words = extract_words(recognizer.Result())
#             results += words
#     results += extract_words(recognizer.FinalResult())

#     return results

In [None]:
# import time
# start_time = time.time()
# vosk.SetLogLevel(-1)
# ts = cut.ts_list_index[10]
# start = ts.start
# end = ts.end
# int16 = np.int16(cut.audioData * 32768).tobytes()
# model_path='vosk-model-en-us-aspire-0.2'
# model = vosk.Model(model_path)
# recognizer = vosk.KaldiRecognizer(model, 16000)
# res = transcribe_words(recognizer, int16)
# df = pandas.DataFrame.from_records(res)
# df = df.sort_values('start')
# print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# print(ts.start)
# print(ts.end)
# print(((df['start'][0]*16000)+(ts.start))/ cut.sampleRate)
# print((df['end'][0]*16000)+(ts.end))

In [None]:
# print(df)

In [None]:
# out_path='sbcon-filtered.csv'
# df.to_csv(out_path, index=False)
# print('Word segments saved to', out_path)

Another possible transcriber: Kaldi

https://github.com/gooofy/py-kaldi-asr

Possible transcription alternative: pocketsphinx
Reason not to use: it is very hard to use and has low accuracy
Possible problem: the frame rate doesn't match the model
https://stackoverflow.com/questions/64153590/audio-signal-split-at-word-level-boundary

https://stackoverflow.com/questions/38808776/python-pocketsphinx-recognition-from-the-microphone

https://github.com/cmusphinx/pocketsphinx-python/blob/dfca2739c7a32dd474c425dad1cf87c1d6e1a316/readme.md

Solved: can't install pyaudio
https://stackoverflow.com/questions/52283840/i-cant-install-pyaudio-on-windows-how-to-solve-error-microsoft-visual-c-14

In [None]:
from os import environ, path
import pyaudio
import wave
import sys

from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
from pocketsphinx import get_model_path

In [None]:
# print(path.join(MODELDIR, 'en-us.lm.bin'))

In [None]:
# config = Decoder.default_config()
# MODELDIR= get_model_path()
# config.set_string('-hmm', path.join(MODELDIR, 'en-us'))
# config.set_string('-lm', path.join(MODELDIR, 'en-us.lm.bin'))
# config.set_string('-dict', path.join(MODELDIR, 'cmudict-en-us.dict'))
# decoder = Decoder(config)

In [None]:

# # stream = open('sbcon-1.wav', 'rb')
# # p = pyaudio.PyAudio()
# # stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
# # stream.start_stream() 
# wf = wave.open('sbcon-1.wav', 'rb')

# stream = p.open(format =
#                 p.get_format_from_width(wf.getsampwidth()),
#                 channels = 1,
#                 rate = 16000,
#                 input = True,
#                frames_per_buffer=1024)
# stream.start_stream() 
# decoder.start_utt()
# while True:
#     buf = stream.read(1024)
#     if buf:
#         decoder.process_raw(buf, False, False)
#     else:
#         break
# decoder.end_utt()
# print ('Best hypothesis segments: ', [seg.word for seg in decoder.seg()])

https://stackoverflow.com/questions/58841039/i-think-librosa-effect-split-has-some-problem

In [None]:
# x = cut.audioData
# y=librosa.amplitude_to_db(abs(x))
# # short time fourier transform
# # (n_fft and hop length determine frequency/time resolution)
# n_fft = 2048
# S = librosa.stft(x, n_fft=n_fft, hop_length=n_fft//2)
# print(S.shape)
# # convert to db
# D = librosa.amplitude_to_db(np.abs(S), ref=np.max)
# top_db = np.max(abs(D)) * 0.3
# print(np.max(abs(D)))
# nonMuteSections = librosa.effects.split(x,top_db=top_db, ref=np.max)
# print(nonMuteSections)
# counter = 0
# # for nonMute in nonMuteSections:
# #     print(nonMute)
# #     sf.write(f'temp/{counter}.wav',cut.audioData[nonMute[0]:nonMute[1]],cut.sampleRate)
# #     counter = counter + 1 

https://stackoverflow.com/questions/36458214/split-speech-audio-file-on-words-in-python **

https://stackoverflow.com/questions/45526996/split-audio-files-using-silence-detection

https://stackoverflow.com/questions/59102171/getting-timestamps-from-audio-using-python

https://www.reddit.com/r/learnpython/comments/4st8sf/splitting_audio_file_into_segmentswords/

https://www.geeksforgeeks.org/python-speech-recognition-on-large-audio-files/

https://radiant-brushlands-42789.herokuapp.com/medium.com/better-programming/simple-audio-processing-in-python-with-pydub-c3a217dabf11

https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python

https://radiant-brushlands-42789.herokuapp.com/towardsdatascience.com/extracting-speech-from-video-using-python-f0ec7e312d38

https://realpython.com/python-speech-recognition/

In [None]:
# from pydub import AudioSegment
# from pydub.silence import split_on_silence
# from pydub.silence import detect_nonsilent
# def match_target_amplitude(sound, target_dBFS):
#     change_in_dBFS = target_dBFS - sound.dBFS
#     return sound.apply_gain(change_in_dBFS)
# cut.ts_list_index[0]
# # audio_chunks  = split_on_silence()
# cut.audioData[cut.ts_list_index[1].start:cut.ts_list_index[1].end]
# sound_file = AudioSegment.from_wav("example.wav")
# normalized_sound = match_target_amplitude(sound_file, -5.0)
# nonsilent_data = detect_nonsilent(normalized_sound, min_silence_len=100, silence_thresh=-5, seek_step=1)
# print(nonsilent_data)
# nonsilent_index = []
# print("start,Stop")
# for chunks in nonsilent_data:
#      nonsilent_index.append([chunk/1000 for chunk in chunks])

In [None]:
# Show both boundary values of the timestamp list in a single output.
# Written as two separate expression statements, only the last value was
# displayed by the notebook and the first was silently discarded.
# NOTE(review): presumably [0][0] is the start of the first segment and
# [-1][1] the end of the last segment -- confirm against how cut.ts_list is built.
(
    cut.ts_list[0][0].get_seconds(),
    cut.ts_list[-1][1].get_seconds(),
)

In [None]:
# Restore the previously saved Keras model from its HDF5 file and print
# its layer-by-layer summary.
saved_model_file = 'Model/mymodel/mymodel_78_18.h5'
model = keras.models.load_model(saved_model_file)
model.summary()

In [None]:
# Run the loaded model on a single blind-test recording.
ds = Dataset()
feature = ds.get_feature_by_file('Data/blindtest/for.wav')

# Wrap the single feature vector in an outer array so the model receives a
# batch containing one sample.
features = np.array([feature])
print(features.shape)

# The original cell also called `features.reshape(80,1)` and evaluated a bare
# `features.shape` here. ndarray.reshape returns a new array and that result
# was never assigned, so `features` reached model.predict unchanged; the
# mid-cell `features.shape` expression displayed nothing. Both dead statements
# are removed. The only effect of the discarded reshape was to raise if the
# array did not hold exactly 80 values.
label = model.predict(features)
print(label)          # raw model output
print(np.round(label))  # output rounded to the nearest integer

Everything below is the evaluation code.

In [None]:
# class Dataset:
#     def __init__(self,feature,predict):
#         self.feature = feature;
#         self.predict = predict

In [None]:
# class Traindata:
#     def __init__(self,fea,lab,head):
#         self.features = fea
#         self.labels = lab
#         self.header = head

In [None]:
# def plot_confusion_matrixs(cm, classes,
#                         normalize=False,
#                         title='Confusion matrix',
#                         cmap=plt.cm.Blues):
#     """
#     This function prints and plots the confusion matrix.
#     Normalization can be applied by setting `normalize=True`.
#     """
#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()
#     tick_marks = np.arange(len(classes))
#     plt.xticks(tick_marks, classes, rotation=45)
#     plt.yticks(tick_marks, classes)

#     if normalize:
#         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#         print("Normalized confusion matrix")
#     else:
#         print('Confusion matrix, without normalization')

#     print(cm)

#     thresh = cm.max() / 2.
#     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#         plt.text(j, i, cm[i, j],
#             horizontalalignment="center",
#             color="white" if cm[i, j] > thresh else "black")

#     plt.tight_layout()
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')


In [None]:
# def prepare_data(link):
#     dataset_name = link.split('/')[-1]
#     file = tf.keras.utils.get_file(dataset_name,link,file_hash='sha256')
#     CSV_HEADER = np.arange(80)
#     CSV_HEADER = np.hstack((CSV_HEADER,['isStuttered','isSuccess']))
#     label = ['isStuttered','isSuccess']
#     dataframe = pd.read_csv(file, names=CSV_HEADER.tolist())
#     isSuccess = dataframe.pop('isSuccess')
#     labels = dataframe.pop('isStuttered')
#     features = dataframe
#     return (features,labels,CSV_HEADER)                      

In [None]:
# x,y,head = prepare_data('https://raw.githubusercontent.com/ming0520/training_data/main/20201126/20201126-cleanonly.csv')
# cleanOnly = Traindata(x,y,head)
# x,y,head = prepare_data('https://raw.githubusercontent.com/ming0520/training_data/main/20201221/20201221.csv')
# IBM = Traindata(x,y,head)
# x,y,head = prepare_data('https://raw.githubusercontent.com/ming0520/fyp2-data-20201126-fortrain/main/dataset_20201126.csv')
# augmented = Traindata(x,y,head)
# x,y,head = prepare_data('https://raw.githubusercontent.com/ming0520/training_data/main/test/test.csv')
# test = Traindata(x,y,head)

In [None]:
# score, acc = model.evaluate(IBM.features, IBM.labels,batch_size=20)
# print('Test score:', score)
# print('Test accuracy:', acc)

In [None]:
# inputFeatures = test.features
# inputLabels = test.labels
# predictions = model.predict(x=inputFeatures, batch_size=10,verbose=0)
# rounded_predictions = np.round(predictions)
# yt= inputLabels.to_numpy()
# yt = yt.reshape(yt.shape[0],1)
# yTrue = np.round(yt)
# # yTrue = yTrue.astype(dtype=np.float32)
# cm = confusion_matrix(y_true=yTrue, y_pred=rounded_predictions)
# cm_plot_labels = ['NoStuttered','isStuttered']
# plot_confusion_matrix(cm=cm ,classes=cm_plot_labels, title='C Matrix')

In [None]:
# models = glob('Model/*.h5')

In [None]:
# class Predict:
#     def __init__(self,predict,ytrue,cm,title,score,acc):
#         self.predict = predict
#         self.ytrue = ytrue
#         self.cm = cm
#         self.title = title
#         self.score = score
#         self.acc = acc

In [None]:
# predicts = []
# fail_list = []
# fig, axes = plt.subplots(nrows=len(models), ncols=1, figsize=(15,10))
# for model_file in models:
#     try:
#         inputFeatures = test.features
#         inputLabels = test.labels
#         model = tf.keras.models.load_model(model_file)
#         print('Model:',model_file)
#         print('Test score:', score)
#         print('Test accuracy:', acc)
#         predictions = model.predict(x=inputFeatures, batch_size=10,verbose=0)
#         rounded_predictions = np.round(predictions)
#         yt= inputLabels.to_numpy()
#         yt = yt.reshape(yt.shape[0],1)
#         yTrue = np.round(yt)
#         # yTrue = yTrue.astype(dtype=np.float32)
#         cm = confusion_matrix(y_true=yTrue, y_pred=rounded_predictions)
#         cm_plot_labels = ['NoStuttered','isStuttered']
#         score, acc = model.evaluate(inputFeatures, inputLabels,batch_size=20)
#         predicts.append(Predict(rounded_predictions,yTrue,cm,model_file,score,acc))
# #         plot_confusion_matrix(cm=cm ,classes=cm_plot_labels, title=model_file)
#     except:
#         fail_list.append(model_file)
# #     plot_confusion_matrix(cm=cm ,classes=cm_plot_labels, title='C Matrix')

In [None]:
# for pred in predicts:
#     print('accuracy=>',pred.acc,'score=>',pred.score,'file=>',pred.title)

In [None]:
# for fail in fail_list:
#     print(fail)

In [None]:
# fig, axes = plt.subplots(nrows=len(models), ncols=1, figsize=(15,10))
# cm_plot_labels = ['NoStuttered','isStuttered']
# for pred, ax in zip(predicts, axes.flatten()):
#     mod = tf.keras.models.load_model(pred.title)
#     plot_confusion_matrix(cm=pred.cm ,classes=cm_plot_labels, title=pred.title)
#     ax.title.set_text(pred.title)
# plt.tight_layout()  
# plt.show()

In [None]:
# predicts.sort(key=lambda x: x.score, reverse=True)

In [None]:
# newlist = sorted(predicts, key=lambda x: x.score, reverse=False)

In [None]:
# cm_plot_labels = ['NoStuttered','isStuttered']
# for pred in newlist:
#     fig1 = plt.gcf()
#     print('accuracy=>',pred.acc,'score=>',pred.score,'file=>',pred.title)
#     ttl = pred.title + " acc:" + str(pred.acc) + " sco:" + str(pred.score)
#     plot_confusion_matrix(cm=pred.cm ,classes=cm_plot_labels, title=ttl)
#     plt.show()
#     plt.draw()
#     fig1.savefig(f'{pred.title}.png')