In [7]:
# Step 1: There will be an audio file in mp3 format, divide into chunks of lenght CHUNK_LENGTH and 
#         save the chunks in a directory
# Step 2: transcript of the enire file in this format "|अब्राहम|की|सन्तान|दाऊद|की|सन्तान|यीशु|मसीह|की|वंशावली|अब्राहम|से|इसहाक|उत्पन्न|हुआ|"
# Step 3: Create a huggingface dataset with the audio chunks 
# Step 4: Perform inferencing on the entire dataset, either in batches or one by one and then collate the logits and waveforms 
#         end to end to prepare them for CTC decoding on the entire audio file and transcript at once
# Step 5: Perform CTC decoding on the logits to get the timestamps corresponding to each word in the transcript

import argparse
import os

import torch
import torchaudio
from dataclasses import dataclass
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import AutoModelForCTC, AutoProcessor
import torchaudio.functional as F
from datasets import load_dataset, Audio, Dataset

torch.random.manual_seed(0)

from pydub import AudioSegment
import IPython
import warnings
warnings.filterwarnings("ignore")

print(torch.__version__)
print(torchaudio.__version__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


AUDIO_DIR = "/Users/Nishant/Desktop/sarvam-ai/part-two/Hindi_hin_BCS_NT_Non-Drama"
TRANSCRIPT_DIR = (
    "/Users/Nishant/Desktop/sarvam-ai/part-two/Hindi_hin_BCS_NT_Non-Drama_transcripts"
)
AUDIO_FILE_NAMES = [
    os.path.join(AUDIO_DIR, file) for file in os.listdir(AUDIO_DIR)
]
TRANSCRIPT_FILE_NAMES = [
    os.path.join(TRANSCRIPT_DIR, file) for file in os.listdir(TRANSCRIPT_DIR)
]
FILE_PAIRS = list(zip(AUDIO_FILE_NAMES, TRANSCRIPT_FILE_NAMES))
BASE_NAMES = [os.path.basename(file).split(".")[0] for file in AUDIO_FILE_NAMES]

CLEANED_TRANSCRIPT_DIR = "/Users/Nishant/Desktop/sarvam-ai/part-two/Hindi_hin_BCS_NT_Non-Drama_cleaned_transcripts"
CHUNK_LENGTH = 10


def create_chunks(audio_file, chunk_length=CHUNK_LENGTH):

    audio = AudioSegment.from_file(audio_file)
    audio = audio[3100:] # cropping the unnecessary part of the audio, where they say book and chapter number
    audio_chunks = audio[::chunk_length * 1000]
    sampling_rate = audio.frame_rate

    folder = os.path.basename(audio_file).split(".")[0]+f"_chunks{CHUNK_LENGTH}"
    folder = os.path.join(os.path.dirname(audio_file), folder)
    # if folder exists , delete it
    if os.path.exists(folder):
        os.system(f"rm -rf {folder}")
    os.makedirs(folder, exist_ok=True)

    print(f"Saving chunks in {folder}")

    for i, chunk in enumerate(audio_chunks):
        chunk.export(f"{folder}/chunk{i}.mp3", format="mp3")

    return folder,sampling_rate

def get_trellis(emission, tokens, blank_id=0):
    """
    Returns the trellis for the given emission and tokens.
    Trellis is of shape (num_frame, num_tokens)
    blank_id is the index of the blank token in the tokens list
    """
    num_frame = emission.size(0) # 169
    num_tokens = len(tokens) # number of characters in transcript 84

    trellis = torch.zeros((num_frame, num_tokens))
    trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0)
    trellis[0, 1:] = -float("inf")
    trellis[-num_tokens + 1 :, 0] = float("inf")

    for t in range(num_frame - 1):
        trellis[t + 1, 1:] = torch.maximum(
            # Score for staying at the same token
            trellis[t, 1:] + emission[t, blank_id],
            # Score for changing to the next token
            trellis[t, :-1] + emission[t, tokens[1:]],
        )
    return trellis

@dataclass
class Point:
    token_index: int
    time_index: int
    score: float


def backtrack(trellis, emission, tokens, blank_id=0):
    t, j = trellis.size(0) - 1, trellis.size(1) - 1

    path = [Point(j, t, emission[t, blank_id].exp().item())]
    while j > 0:
        # Should not happen but just in case
        assert t > 0

        # 1. Figure out if the current position was stay or change
        # Frame-wise score of stay vs change
        p_stay = emission[t - 1, blank_id]
        p_change = emission[t - 1, tokens[j]]

        # Context-aware score for stay vs change
        stayed = trellis[t - 1, j] + p_stay
        changed = trellis[t - 1, j - 1] + p_change

        # Update position
        t -= 1
        if changed > stayed:
            j -= 1

        # Store the path with frame-wise probability.
        prob = (p_change if changed > stayed else p_stay).exp().item()
        path.append(Point(j, t, prob))

    # Now j == 0, which means, it reached the SoS.
    # Fill up the rest for the sake of visualization
    while t > 0:
        prob = emission[t - 1, blank_id].exp().item()
        path.append(Point(j, t - 1, prob))
        t -= 1

    return path[::-1]

# Merge the labels
@dataclass
class Segment:
    label: str
    start: int
    end: int
    score: float
    timestamp_start : int
    timestamp_end: int

    def __repr__(self):
        return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d})"

    @property
    def length(self):
        return self.end - self.start


def merge_repeats(path):
    i1, i2 = 0, 0
    segments = []
    while i1 < len(path):
        while i2 < len(path) and path[i1].token_index == path[i2].token_index:
            i2 += 1
        score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
        segments.append(
            Segment(
                transcript[path[i1].token_index],
                path[i1].time_index,
                path[i2 - 1].time_index + 1,
                score,
            )
        )
        i1 = i2
    return segments

# Merge words
def merge_words(segments, separator="|"):
    words = []
    i1, i2 = 0, 0
    while i1 < len(segments):
        if i2 >= len(segments) or segments[i2].label == separator:
            if i1 != i2:
                segs = segments[i1:i2]
                word = "".join([seg.label for seg in segs])
                score = sum(seg.score * seg.length for seg in segs) / sum(
                    seg.length for seg in segs
                )
                words.append(
                    Segment(word, segments[i1].start, segments[i2 - 1].end, score)
                )
            i1 = i2 + 1
            i2 = i1
        else:
            i2 += 1
    return words


  from .autonotebook import tqdm as notebook_tqdm


2.2.2
2.2.2
cpu


In [57]:
audio_file_basename = "B01___01_Matthew_____HINBCSN1DA.mp3"
original_file = os.path.join(AUDIO_DIR, audio_file_basename)

sound_file = AudioSegment.from_file(original_file)
sound_file = sound_file[3100:] # cropping the unnecessary part of the audio, where they say book and chapter number
sound_file.export("full_sample.mp3", format="mp3")

<_io.BufferedRandom name='full_sample.mp3'>

In [58]:
audio_file_basename = "B01___01_Matthew_____HINBCSN1DA.mp3"
CHUNK_LENGTH = 20
# audio_file_complete_path = os.path.join(AUDIO_DIR, audio_file_basename)
audio_file_complete_path = "/Users/Nishant/Desktop/sarvam/10s_sample.mp3"
audio_file_complete_path = "/Users/Nishant/Desktop/sarvam/60s_sample.mp3"
audio_file_complete_path = "/Users/Nishant/Desktop/sarvam/full_sample.mp3"

transcript = None
transcript_path = os.path.join(CLEANED_TRANSCRIPT_DIR, audio_file_basename.split(".")[0]+".txt")
with open(transcript_path, 'r') as f:
    transcript = f.read()
# transcript = "|अब्राहम|की|सन्तान|दाऊद|की|सन्तान|यीशु|मसीह|की|वंशावली|अब्राहम|से|इसहाक|उत्पन्न|हुआ|इसहाक|से|याकूब|उत्पन्न|हुआ|और|याकूब|से|यहूदा|और|उसके|भाई|उत्पन्न|हुए|यहूदा|और|तामार|से|पेरेस|व|जेरह|उत्पन्न|हुए|और|पेरेस|से|हेस्रोन|उत्पन्न|हुआ|और|हेस्रोन|से|एराम|उत्पन्न|हुआ|एराम|से|अम्मीनादाब|उत्पन्न|हुआ|और|अम्मीनादाब|से|नहशोन|और|नहशोन|से|सलमोन|उत्पन्न|हुआ|सलमोन|और|राहाब|से|बोअज|उत्पन्न|हुआ|और|बोअज|और|रूत|से|ओबेद|उत्पन्न|हुआ|और|ओबेद|से|यिशै|उत्पन्न|हुआ|और|यिशै|से|दाऊद|राजा|उत्पन्न|हुआ|और|दाऊद|से|सुलैमान|उस|स्त्री|से|उत्पन्न|हुआ|जो|पहले|ऊरिय्याह|की|पत्नी|थी|सुलैमान|से|रहबाम|उत्पन्न|हुआ|और|रहबाम|से|अबिय्याह|उत्पन्न|हुआ|और|अबिय्याह|से|"
print("Transcript: ")
print(transcript)

print(f"Processing Audio file: {audio_file_basename}")

# Creating 10s chunks of the entire audio file. 

# chunk_folder,sampling_rate = create_chunks(audio_file_complete_path, 20)
# chunk_files = os.listdir(chunk_folder)
# chunk_files = [os.path.join(chunk_folder, file) for file in chunk_files]
# print(f"Total number of chunks: {len(chunk_files)}")

# create the huggingface dataset
audio_dataset = Dataset.from_dict(
    {
        "audio": [audio_file_complete_path]
    }
).cast_column("audio", Audio(sampling_rate=16_000))

print(audio_dataset[0]['audio']['array'].shape)

# Load the model and processor
DEVICE_ID = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "ai4bharat/indicwav2vec-hindi"

model = AutoModelForCTC.from_pretrained(MODEL_ID).to(DEVICE_ID)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# fix the labels as derived from the vocab, the blank token should be at the start
labels = list(processor.tokenizer.get_vocab().keys())
first_label = labels[0]
blank_token = labels[4]
labels[0] = blank_token
labels[4] = first_label
labels = tuple(labels)


# Perform inferencing on the entire dataset
emission = []
waveform = []
for i in range(len(audio_dataset)):
    waveform.append(torch.tensor(audio_dataset[i]["audio"]["array"]).unsqueeze(0))
    input_values = processor(
        audio_dataset[i]["audio"]["array"], sampling_rate=16_000, return_tensors="pt"
    ).input_values.to(DEVICE_ID)
    with torch.no_grad():
        emission.append(model(input_values).logits.cpu().squeeze(0))

# Perform CTC decoding on the logits
emission = torch.cat(emission, dim=0)
waveform = torch.cat(waveform, dim=1)
print(f"Shape of emission is {emission.shape}")
print(f"Shape of waveform is {waveform.shape}")

dictionary = {c: i for i, c in enumerate(labels)}

tokens = [dictionary[c] for c in transcript]

trellis = get_trellis(emission, tokens, blank_id=0)

path = backtrack(trellis, emission, tokens)

segments = merge_repeats(path)

word_segments = merge_words(segments)
for word in word_segments:
    print(word)


Transcript: 
|अब्राहम|की|सन्तान|दाऊद|की|सन्तान|यीशु|मसीह|की|वंशावली|अब्राहम|से|इसहाक|उत्पन्न|हुआ|इसहाक|से|याकूब|उत्पन्न|हुआ|और|याकूब|से|यहूदा|और|उसके|भाई|उत्पन्न|हुए|यहूदा|और|तामार|से|पेरेस|व|जेरह|उत्पन्न|हुए|और|पेरेस|से|हेस्रोन|उत्पन्न|हुआ|और|हेस्रोन|से|एराम|उत्पन्न|हुआ|एराम|से|अम्मीनादाब|उत्पन्न|हुआ|और|अम्मीनादाब|से|नहशोन|और|नहशोन|से|सलमोन|उत्पन्न|हुआ|सलमोन|और|राहाब|से|बोअज|उत्पन्न|हुआ|और|बोअज|और|रूत|से|ओबेद|उत्पन्न|हुआ|और|ओबेद|से|यिशै|उत्पन्न|हुआ|और|यिशै|से|दाऊद|राजा|उत्पन्न|हुआ|और|दाऊद|से|सुलैमान|उस|स्त्री|से|उत्पन्न|हुआ|जो|पहले|ऊरिय्याह|की|पत्नी|थी|सुलैमान|से|रहबाम|उत्पन्न|हुआ|और|रहबाम|से|अबिय्याह|उत्पन्न|हुआ|और|अबिय्याह|से|आसा|उत्पन्न|हुआ|आसा|से|यहोशाफात|उत्पन्न|हुआ|और|यहोशाफात|से|योराम|उत्पन्न|हुआ|और|योराम|से|उज्जियाह|उत्पन्न|हुआ|उज्जियाह|से|योताम|उत्पन्न|हुआ|योताम|से|आहाज|उत्पन्न|हुआ|और|आहाज|से|हिजकिय्याह|उत्पन्न|हुआ|हिजकिय्याह|से|मनश्शे|उत्पन्न|हुआ|मनश्शे|से|आमोन|उत्पन्न|हुआ|और|आमोन|से|योशिय्याह|उत्पन्न|हुआ|और|बन्दी|होकर|बाबेल|जाने|के|समय|में|योशिय्याह|से|यकुन्याह|और|उसके|भाई|

(3775808,)


Some weights of the model checkpoint at ai4bharat/indicwav2vec-hindi were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at ai4bharat/indicwav2vec-hindi and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this mode

Shape of emission is torch.Size([11799, 68])
Shape of waveform is torch.Size([1, 3775808])
अब्राहम	(45024.67): [   64,    87)
की	(54941.46): [   91,    98)
सन्तान	(66817.57): [  101,   123)
दाऊद	(61107.56): [  156,   171)
की	(152700.40): [  175,   179)
सन्तान	(47464.74): [  184,   206)
यीशु	(57019.36): [  235,   251)
मसीह	(46932.46): [  256,   272)
की	(309168.54): [  274,   279)
वंशावली	(120858.20): [  282,   312)
अब्राहम	(32540.09): [  364,   386)
से	(288096.31): [  391,   397)
इसहाक	(42238.82): [  406,   428)
उत्पन्न	(425879.14): [  436,   451)
हुआ	(741417.94): [  454,   464)
इसहाक	(34678.90): [  497,   518)
से	(233936.74): [  523,   529)
याकूब	(39569.91): [  538,   557)
उत्पन्न	(278123.79): [  563,   578)
हुआ	(802359.98): [  581,   590)
और	(49351.97): [  621,   626)
याकूब	(71124.39): [  629,   649)
से	(208234.06): [  653,   659)
यहूदा	(58540.27): [  663,   682)
और	(79684.98): [  703,   708)
उसके	(140084.62): [  714,   728)
भाई	(67935.36): [  734,   747)
उत्पन्न	(322298.09): [  753, 

In [59]:
import IPython

def display_segment(i):
    ratio = waveform.size(1) / trellis.size(0)
    word = word_segments[i]
    x0 = int(ratio * word.start)
    x1 = int(ratio * word.end)
    print(x0, x1)
    print(
        f"{word.label} ({word.score:.2f}): {x0 / 16_000:.3f} - {x1 / 16_000:.3f} sec"
    )
    segment = waveform[:, x0:x1]
    return IPython.display.Audio(segment.numpy(), rate=16_000)

In [None]:
ration = waveform.size(1) / trellis.size(0)
for word in word_segments:
    print(word.label, word.start, word.end, word.score)
    word.timestamp_start = int(ration * word.start)*1000/16_000
    word.timestamp_end = int(ration * word.end)*1000/16_000


In [60]:
# Generate the audio for each segment
print(transcript)
IPython.display.Audio(audio_file_complete_path)

|अब्राहम|की|सन्तान|दाऊद|की|सन्तान|यीशु|मसीह|की|वंशावली|अब्राहम|से|इसहाक|उत्पन्न|हुआ|इसहाक|से|याकूब|उत्पन्न|हुआ|और|याकूब|से|यहूदा|और|उसके|भाई|उत्पन्न|हुए|यहूदा|और|तामार|से|पेरेस|व|जेरह|उत्पन्न|हुए|और|पेरेस|से|हेस्रोन|उत्पन्न|हुआ|और|हेस्रोन|से|एराम|उत्पन्न|हुआ|एराम|से|अम्मीनादाब|उत्पन्न|हुआ|और|अम्मीनादाब|से|नहशोन|और|नहशोन|से|सलमोन|उत्पन्न|हुआ|सलमोन|और|राहाब|से|बोअज|उत्पन्न|हुआ|और|बोअज|और|रूत|से|ओबेद|उत्पन्न|हुआ|और|ओबेद|से|यिशै|उत्पन्न|हुआ|और|यिशै|से|दाऊद|राजा|उत्पन्न|हुआ|और|दाऊद|से|सुलैमान|उस|स्त्री|से|उत्पन्न|हुआ|जो|पहले|ऊरिय्याह|की|पत्नी|थी|सुलैमान|से|रहबाम|उत्पन्न|हुआ|और|रहबाम|से|अबिय्याह|उत्पन्न|हुआ|और|अबिय्याह|से|आसा|उत्पन्न|हुआ|आसा|से|यहोशाफात|उत्पन्न|हुआ|और|यहोशाफात|से|योराम|उत्पन्न|हुआ|और|योराम|से|उज्जियाह|उत्पन्न|हुआ|उज्जियाह|से|योताम|उत्पन्न|हुआ|योताम|से|आहाज|उत्पन्न|हुआ|और|आहाज|से|हिजकिय्याह|उत्पन्न|हुआ|हिजकिय्याह|से|मनश्शे|उत्पन्न|हुआ|मनश्शे|से|आमोन|उत्पन्न|हुआ|और|आमोन|से|योशिय्याह|उत्पन्न|हुआ|और|बन्दी|होकर|बाबेल|जाने|के|समय|में|योशिय्याह|से|यकुन्याह|और|उसके|भाई|उत्पन्न|हुए|ब

In [61]:
display_segment(0)

20480 27840
अब्राहम (45024.67): 1.280 - 1.740 sec


In [62]:
display_segment(1)

29120 31361
की (54941.46): 1.820 - 1.960 sec


In [63]:
display_segment(2)

32321 39361
सन्तान (66817.57): 2.020 - 2.460 sec


In [64]:
display_segment(3)

49921 54721
दाऊद (61107.56): 3.120 - 3.420 sec


In [65]:
display_segment(4)

56001 57281
की (152700.40): 3.500 - 3.580 sec


In [66]:
display_segment(5)

58881 65922
सन्तान (47464.74): 3.680 - 4.120 sec


In [67]:
display_segment(6)

75202 80322
यीशु (57019.36): 4.700 - 5.020 sec


In [68]:
display_segment(7)

81922 87042
मसीह (46932.46): 5.120 - 5.440 sec


In [69]:
display_segment(8)

87682 89283
की (309168.54): 5.480 - 5.580 sec


In [78]:
display_segment(9)

90243 99843
वंशावली (120858.20): 5.640 - 6.240 sec


In [71]:
display_segment(10)

116483 123524
अब्राहम (32540.09): 7.280 - 7.720 sec


In [72]:
display_segment(11)

125124 127044
से (288096.31): 7.820 - 7.940 sec


In [73]:
display_segment(12)

129924 136964
इसहाक (42238.82): 8.120 - 8.560 sec


In [74]:
display_segment(16)

167365 169285
से (233936.74): 10.460 - 10.580 sec


In [75]:
display_segment(17)

172165 178246
याकूब (39569.91): 10.760 - 11.140 sec


In [76]:
display_segment(25)

228487 232967
उसके (140084.62): 14.280 - 14.560 sec


In [77]:
display_segment(35)

307210 312330
जेरह (60307.75): 19.201 - 19.521 sec


In [80]:
display_segment(124)

975393 977953
हुआ (41975.82): 60.962 - 61.122 sec


In [84]:
display_segment(314)

2364560 2369040
मसीह (85370.20): 147.785 - 148.065 sec


In [90]:
word = word_segments[0]
word.timestamp_start,word.timestamp_end

(1280.0, 1740.0)

In [88]:
display_segment(0)

20480 27840
अब्राहम (45024.67): 1.280 - 1.740 sec


In [91]:
sound_file[1280:1740]

In [114]:
import pandas as pd
sentences_path = "/Users/Nishant/Desktop/sarvam/transcripts_sentences.csv"
sentences = pd.read_csv(sentences_path)
sentences.head()

Unnamed: 0,book_number,chapter_number,prefix,normalized_text,word_count
0,1,B01,Matthew,|अब्राहम|की|सन्तान|दाऊद|की|सन्तान|यीशु|मसीह|की...,12
1,1,B01,Matthew,|अब्राहम|से|इसहाक|उत्पन्न|हुआ|इसहाक|से|याकूब|उ...,21
2,1,B01,Matthew,|यहूदा|और|तामार|से|पेरेस|व|जेरह|उत्पन्न|हुए|और...,23
3,1,B01,Matthew,|एराम|से|अम्मीनादाब|उत्पन्न|हुआ|और|अम्मीनादाब|...,17
4,1,B01,Matthew,|सलमोन|और|राहाब|से|बोअज|उत्पन्न|हुआ|और|बोअज|और...,23


In [118]:
sentences['book_number'] = sentences['book_number'].apply(lambda x: str(x))
sentences.head()

Unnamed: 0,book_number,chapter_number,prefix,normalized_text,word_count
0,1,B01,Matthew,|अब्राहम|की|सन्तान|दाऊद|की|सन्तान|यीशु|मसीह|की...,12
1,1,B01,Matthew,|अब्राहम|से|इसहाक|उत्पन्न|हुआ|इसहाक|से|याकूब|उ...,21
2,1,B01,Matthew,|यहूदा|और|तामार|से|पेरेस|व|जेरह|उत्पन्न|हुए|और...,23
3,1,B01,Matthew,|एराम|से|अम्मीनादाब|उत्पन्न|हुआ|और|अम्मीनादाब|...,17
4,1,B01,Matthew,|सलमोन|और|राहाब|से|बोअज|उत्पन्न|हुआ|और|बोअज|और...,23


In [105]:
audio_file_complete_path = "/Users/Nishant/Desktop/sarvam-ai/part-two/Hindi_hin_BCS_NT_Non-Drama/B01___01_Matthew_____HINBCSN1DA.mp3"


In [107]:
def extract_book_name(filename):

    print(filename)
    filename = os.path.basename(filename)

    suffix = "HINBCSN1DA.mp3"
    filename = filename[: -len(suffix)]
    print(filename)
    filename = filename.rstrip("_")  # Remove trailing underscores
    filename = filename.lstrip("_")  # Remove preceeding underscores
    filename = filename.replace("___","_")
    
    chapter, book , name = filename.split("_")
    chapter = str(chapter).lstrip('0')
    book = str(book).lstrip('0')
    return chapter, book , name

chapter, book, name = extract_book_name(audio_file_complete_path)
print(chapter, book, name)

/Users/Nishant/Desktop/sarvam-ai/part-two/Hindi_hin_BCS_NT_Non-Drama/B01___01_Matthew_____HINBCSN1DA.mp3
B01___01_Matthew_____
B01 1 Matthew


In [119]:
# slice of sentences where book,chapter and name are equal to the current audio file
sentences = sentences[(sentences["book_number"] == book) & (sentences["chapter_number"] == chapter) & (sentences["prefix"] == name)]
sentences.head()

Unnamed: 0,book_number,chapter_number,prefix,normalized_text,word_count
0,1,B01,Matthew,|अब्राहम|की|सन्तान|दाऊद|की|सन्तान|यीशु|मसीह|की...,12
1,1,B01,Matthew,|अब्राहम|से|इसहाक|उत्पन्न|हुआ|इसहाक|से|याकूब|उ...,21
2,1,B01,Matthew,|यहूदा|और|तामार|से|पेरेस|व|जेरह|उत्पन्न|हुए|और...,23
3,1,B01,Matthew,|एराम|से|अम्मीनादाब|उत्पन्न|हुआ|और|अम्मीनादाब|...,17
4,1,B01,Matthew,|सलमोन|और|राहाब|से|बोअज|उत्पन्न|हुआ|और|बोअज|और...,23


In [120]:
sentences['word_count'] = sentences['word_count']-2
sentences['word_index_end'] = sentences['word_count'].cumsum() - 1
sentences['word_index_start'] = sentences['word_index_end'] - sentences['word_count'] + 1
sentences['word_index_end'] = sentences['word_count'].cumsum() - 1

In [122]:
sentences['start_timestamp_ms'] = sentences['word_index_start'].apply(lambda x: int(word_segments[x].timestamp_start))
sentences['end_timestamp_ms'] = sentences['word_index_end'].apply(lambda x: int(word_segments[x].timestamp_end))
sentences.head()

Unnamed: 0,book_number,chapter_number,prefix,normalized_text,word_count,word_index_end,word_index_start,start_timestamp_ms,end_timestamp_ms
0,1,B01,Matthew,|अब्राहम|की|सन्तान|दाऊद|की|सन्तान|यीशु|मसीह|की...,10,9,0,1280,6240
1,1,B01,Matthew,|अब्राहम|से|इसहाक|उत्पन्न|हुआ|इसहाक|से|याकूब|उ...,19,28,10,7280,15560
2,1,B01,Matthew,|यहूदा|और|तामार|से|पेरेस|व|जेरह|उत्पन्न|हुए|और...,21,49,29,16360,26060
3,1,B01,Matthew,|एराम|से|अम्मीनादाब|उत्पन्न|हुआ|और|अम्मीनादाब|...,15,64,50,26660,33281
4,1,B01,Matthew,|सलमोन|और|राहाब|से|बोअज|उत्पन्न|हुआ|और|बोअज|और...,21,85,65,34221,43301


In [123]:
sentences['audio_duration'] = sentences['end_timestamp_ms'] - sentences['start_timestamp_ms']
sentences['audio_duration'].describe()

count       26.000000
mean      8295.653846
std       2932.517798
min       2700.000000
25%       6900.250000
50%       7520.000000
75%       9025.250000
max      16721.000000
Name: audio_duration, dtype: float64

In [124]:
sentences['word_count'].describe()

count    26.000000
mean     19.923077
std       7.557370
min       7.000000
25%      16.000000
50%      18.000000
75%      21.000000
max      39.000000
Name: word_count, dtype: float64