In [None]:
#visualization tool for displaying long load/processing times
pip install tqdm 
#data processing
pip install pandas 
#workhorse for converting text into embeddings/vectors
pip install sentence-transformers==2.2.2 
#data framework for LLM applications
pip install llama-index==0.9.29
#logging output
pip install loguru==0.7.0 
#convenient pretty printing library
pip install rich 
#openai Tokenizer library
pip install tiktoken 

In [None]:
%load_ext autoreload
%autoreload 2

#standard libraries
import sys
sys.path.append('../')

import os
import time
from typing import List, Tuple
from math import ceil

#external libraries
import pandas as pd
import numpy as np
from rich import print
from rich.pretty import pprint #nifty library for pretty printing
from torch import cuda
from tqdm import tqdm


In [None]:
import tiktoken
from llama_index.text_splitter import SentenceSplitter

In [None]:
import json

In [None]:

# Instantiate the tiktoken encoding registered for the 'gpt-3.5-turbo-0125' model
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0125')

# Set the chunk size (256 tokens) and instantiate the SentenceSplitter.
# chunk_size is also the max_tokens budget passed to chunk_transcriptions below.
# NOTE(review): gpt35_txt_splitter is not referenced by the chunking code in this
# cell — confirm whether a later cell still needs it.
chunk_size = 256
gpt35_txt_splitter = SentenceSplitter(chunk_size=chunk_size, tokenizer=encoding.encode, chunk_overlap=0)

def merge_transcription_text(transcriptions):
    """Join the 'text' of every transcription entry into one string.

    Each entry's text has newlines and carriage returns replaced by spaces
    and is stripped of surrounding whitespace; the cleaned pieces are then
    joined with a single space.

    Args:
        transcriptions: iterable of dicts, each with a 'text' string.

    Returns:
        The merged text ('' when there are no entries).
    """
    cleaned_parts = []
    for entry in transcriptions:
        flattened = entry['text'].replace('\n', ' ').replace('\r', ' ')
        cleaned_parts.append(flattened.strip())
    return ' '.join(cleaned_parts)

def chunk_transcriptions(transcriptions, max_tokens, tokenizer):
    """Group consecutive transcription entries into token-bounded chunks.

    Entries are accumulated in input order; a new chunk is started as soon as
    adding the next entry would push the running token count above max_tokens.
    Entries are never split: one whose own token count exceeds max_tokens
    ends up in a chunk by itself, so that chunk is larger than the budget.

    Args:
        transcriptions: iterable of dicts, each with a 'text' string.
        max_tokens: maximum summed token count allowed per chunk.
        tokenizer: callable that takes a string and returns a sized
            collection of tokens (only its length is used).

    Returns:
        A list of chunks. Each chunk is a non-empty list holding the original
        entry dicts (not copies). Empty input gives an empty list.
    """
    chunks = []
    current_chunk = []
    current_tokens = 0

    for t in transcriptions:
        n_tokens = len(tokenizer(t['text']))
        # Close the running chunk when this entry would overflow it. The
        # non-empty guard keeps an oversized first entry from emitting an
        # empty chunk.
        if current_chunk and current_tokens + n_tokens > max_tokens:
            chunks.append(current_chunk)
            current_chunk = []
            current_tokens = 0
        current_chunk.append(t)
        current_tokens += n_tokens

    # Flush whatever is still being accumulated
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def process_chunks(chunks):
    """Collapse each chunk of transcription entries into one summary record.

    Args:
        chunks: iterable of non-empty lists of dicts carrying 'start',
            'duration' and 'text' keys.

    Returns:
        A list with one dict per chunk:
            "start"    - the 'start' of the chunk's first entry,
            "duration" - the sum of the 'duration' of all its entries,
            "text"     - the entries' texts, each with newlines/carriage
                         returns turned into spaces and stripped, joined
                         by single spaces.
    """
    def _clean(raw):
        # Same whitespace normalisation for every entry's text
        return raw.replace('\n', ' ').replace('\r', ' ').strip()

    return [
        {
            "start": group[0]['start'],
            "duration": sum(seg['duration'] for seg in group),
            "text": ' '.join(_clean(seg['text']) for seg in group),
        }
        for group in chunks
    ]

# Read the raw JSON dataset. The encoding is stated explicitly so the read does
# not depend on the platform's default and matches the UTF-8 used for the output.
with open('raw_dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Build one record per video: its metadata plus the token-bounded transcript chunks.
# Assumes data is a list of dicts with 'video_id', 'title', 'guest',
# 'likes_count' and 'content' keys.
video_data = []
for item in data:
    video_id, title, guest, likes_count = (
        item[field] for field in ('video_id', 'title', 'guest', 'likes_count')
    )
    # Group the transcript entries by token budget, then collapse each group
    # into a single start/duration/text record
    chunks = chunk_transcriptions(item['content'], chunk_size, encoding.encode)
    transcription_splits = process_chunks(chunks)
    video_data.append(dict(
        video_id=video_id,
        title=title,
        guest=guest,
        likes_count=likes_count,
        content=transcription_splits,
    ))

# Flatten the per-video records: every chunk becomes its own standalone JSON
# object that repeats the metadata of the video it came from.
final_data = []
for item in video_data:
    video_id = item['video_id']
    title = item['title']
    guest = item['guest']
    likes_count = item['likes_count']
    content = item['content']
    # doc_id is "<video_id>-<chunk number>", numbered from 1 and zero-padded to 4 digits
    for index, con in enumerate(content, start=1):
        final_data.append({
            "doc_id": f"{video_id}-{index:04d}",
            "video_id": video_id,
            "title": title,
            "guest": guest,
            "likes_count": int(likes_count),
            "start": con['start'],
            "duration": con['duration'],
            "content": con['text'],
        })



# Save the flattened records as pretty-printed JSON. The file name is derived
# from chunk_size so it cannot drift from the chunking actually applied
# (with chunk_size = 256 this is still 'chunked_256_token.json').
output_file_path = f'chunked_{chunk_size}_token.json'
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes
    json.dump(final_data, outfile, ensure_ascii=False, indent=4)

print(f"Data saved to {output_file_path}")
