In [98]:
from langchain.document_loaders import JSONLoader, DirectoryLoader
import jq
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:

    metadata["video_id"] = record.get('video_id')
    metadata["episode_title"] = record.get("Episode Title")
    metadata["guest"] = record.get("Guest")
    metadata["video_url"] = record.get("URL")
    metadata["date_posted"] = record.get("Date")
    
    return metadata

loader = DirectoryLoader(
    'Final Transcripts', 
    glob='*.json',
    loader_cls=JSONLoader,
    loader_kwargs={'jq_schema' : '.[]', 'content_key' : "text", 'metadata_func' : metadata_func},
    show_progress = True   
)

docs = loader.load()

100%|██████████| 1/1 [00:00<00:00, 18.85it/s]


In [99]:
len(docs)

3

In [100]:
print(docs[0])

page_content="welcome back here we go again great to see you and congratulations thank you um you will never forget what is going on in the world when you think about when your child is born you will know for the rest of this child's life you were born during a weird time that's for sure that is for sure they're probably the weirdest that i can remember uh yeah yeah um and he was born on uh may the fourth and yeah that's hilarious too yeah may the fourth be with him yeah exactly it has to be hopefully i sure hope so perfect yes i mean that was the perfect day for you and how do you say the name well uh is it a placeholder first of all my partner is the one that actually mostly came up with the name congratulations to her yeah yeah she's great at names um so i mean it's just x the letter x um and then the ae is like pronounced ash yeah and then a12 a12 is my contribution oh why a12 uh archangel 12 the precursor to the sr-71 coolest plane ever that's true i i agree with you i don't know 

Now that they are loaded into docs we need to go through each of them and split and run through the function. We will then save the Results of the function in a json file and load that into the retriever_creation notebook

Elon Musk Episode:

In [None]:
#Gonna run through with just one transcript to see how it goes
document = docs[0]

print(document.metadata)

In [103]:
import tiktoken 
tokenizer = tiktoken.get_encoding('cl100k_base')

#Create function to check token length
def tiktoken_len(text):
    tokens= tokenizer.encode(
        text,
        disallowed_special = ()
    )
    return len(tokens)

from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 0)

if tiktoken_len(document.page_content) > 1000:
    # batches = splitter.split_text(document.page_content)
    batches = splitter.split_documents([document])

else:
    batches = document

In [104]:
len(batches)

23

In [105]:
def create_system_message(text_type, text_title):
    system_message = f'''Given some text, which is part of a {text_type} from {text_title}, your goal is to split the text in half so that no thought or topic is cutoff and the split is performed at the end of a complete thought/topic.

        You will be given steps to follow until the final desired result is achieved. 

        Some important things to note:
        - '!MIDPOINT!' denotes the midpoint of the text.
        - To complete this task effectively you must adhere to strictly to the instructions at each step
        - You must be exact with your output, whenever providing words from the text, copy exactly what is written, even if there are missing words, repeated words or misspellings, it does not matter.

        Step 1 - Determine what the main topics are before and after the !MIDPOINT! label. Main topics are overall topics to which the text is about, not brief things that are mentioned in passing. Create a list of topics as such:
        Before Midpoint = Topic 1, Topic 2, Topic 3
        After Midpoint = Topic 4, Topic 5, Topic 6

        Step 2 - Based on the the lists of Topic Labels you have created identify if either of these conditions are true. 
        - The last topic of the first section is semantically related to the first topic of the second section
        - The last topic of the first section continues on into the text of the second section. 

        Step 3 - Depending on the condition that you have identified from Step 2, decide which of the following course of actions must be taken:
        - If there was semantic overlap between the two topics then the words you split on should be located at the conclusion of overlapping topics. When the transition to the next semantically unrelated topic begins.
        - If there is a continuation of the last topic of the first section past the !MIDPOINT! then the text needs to be split where this continuation ends.
        - If neither of these conditions where meet then you must check if the current location of the !MIDPOINT! interrupts the completion of a thought, if this is the case then the split point should be where the interrupted thought concludes.
        
        ​​Step 4- Based on the course of action you have identified in Step 3, perform this course of action and locate a small set of exact words on which to split the text. The words must be exact and should not be long. 

        Step 5 - Given the exact words on where to split reorganize the topics so that they match the new sections which are determined by the split location.
        For example if in the example from Step 1, Topic 3 and Topic 4 have overlap then the lists would now be as follows:
        Before Split = Topic 1, Topic 2, Topic 3, Topic 4
        After Split = Topic 5, Topic 6

        Final Step - Now that we know where the text should be split and the new organization of topics, provide the final output which is a python dictionary with 3 key-value pairs:
        - "before_split_topics" will be the 'Before Split' list that you identified in Step 3.
        -  "after_split_topics" will be the 'After Split' list that you identified in Step 3.
        - "split_key" will be the exact words that identify where the text should be split.
        An example of what the final output should look like structurally:

        {{"before_split_topics : ["Topic 1", "Topic 2", "Topic 3", "Topic 4",], "after_split_topics" : ["Topic 5", "Topic 6"],"split_key" : "split the text here"}}

        Begin!'''
    
    return system_message

from langchain.prompts import PromptTemplate
def create_user_message(bigtext):
    user_template = PromptTemplate.from_template("TEXT: \n  {bigtext} \n Remember follow the outlined 6 step plan. Write the out the result of each step and then the final output: ")
    return user_template.format(bigtext = bigtext)



In [106]:
from langchain.schema import SystemMessage, HumanMessage
from langchain.chat_models import ChatOpenAI
import time
import ast

def smart_chunking(documents):
    #First we pass the LangChain Documents that have already gone through a simple splitter in.

    #Then we create our system message based on the metedata of the documents.

    #Get the Title of the Text so the LLM has some context
    text_title = documents[0].metadata["episode_title"]

    #Give the Type of Text for context
    text_type = "Transcript"

    chat = ChatOpenAI(model="gpt-4", temperature=0)

    #Use this for the two chunks
    final_docs = {}
    changed = []
    dicts_of_changes = []

    for i, chunk in enumerate(documents):
        print(f"Index = {i}")
        if i + 1 < len(documents):
            first = documents[i].page_content
            second = documents[i+1].page_content
            full_text = first + ' !MIDPOINT! ' + second
            hum_message = create_user_message(full_text)
            # hum_message = user_template.format(bigtext = full_text)
            sys_message = create_system_message(text_title=text_title,text_type=text_type)
            messages = [
                SystemMessage(content = sys_message),
                HumanMessage(content = hum_message)
                ]
            response = chat(messages)
            output = response.content

            print(output)

            #Process to extract the dictionary
            
            #Extract index of brackets
            open_bracket_index = output.find('{')
            closed_bracket_index = output.find('}') + 1

            #Slice those indicies
            dictionary_string = output[open_bracket_index:closed_bracket_index]

            #Convert the string to dictionary literal
            boundary_dict = ast.literal_eval(dictionary_string)
            dicts_of_changes.append(boundary_dict)

            total_text = first + ' ' + second

            end_index_first_chunk = total_text.find(boundary_dict['split_key'])
            print(end_index_first_chunk)

            new_first_chunk = total_text[0:end_index_first_chunk]
            new_second_chunk = total_text[end_index_first_chunk:]

            changed.append(new_first_chunk)
            changed.append(new_second_chunk)

            #ADD TO THE DICT THAT KEEPS TRACK OF THE DOCS
            final_docs[f'Chunk {i}'] = {
                'Text' : new_first_chunk, 'Topics' : boundary_dict['before_split_topics']
            }
            final_docs[f'Chunk {i + 1}'] = {
                'Text' : new_second_chunk, 'Topics' : boundary_dict['after_split_topics']
            }

            print(f"Old first: \n {first} \n New first: \n {new_first_chunk} \n Old Second: \n {second} \n New Second: \n {new_second_chunk}")

            documents[i].page_content = new_first_chunk
            documents[i].metadata["Topics"] = boundary_dict['before_split_topics']
            documents[i+1].page_content = new_second_chunk
            documents[i+1].metadata["Topics"] = boundary_dict['after_split_topics']


            time.sleep(5)
    return final_docs

In [107]:
elon_musk_dictionary = smart_chunking(batches)

Index = 0
Step 1 - Determine the main topics before and after the !MIDPOINT! label.
Before Midpoint = Elon Musk's new child and its name, AI and neural nets, Elon Musk's view on babies, Comparison between AI and human brain, Elon Musk's thoughts on AI becoming smarter than humans
After Midpoint = Elon Musk selling his houses and possessions, Elon Musk's thoughts on being defined by wealth, Elon Musk's views on capital allocation, Elon Musk's views on manufacturing, Elon Musk's decision to get rid of material possessions

Step 2 - Identify if either of these conditions are true.
The last topic of the first section does not continue on into the text of the second section and is not semantically related to the first topic of the second section.

Step 3 - Decide which course of action must be taken.
The current location of the !MIDPOINT! does not interrupt the completion of a thought, so the split point should be where the interrupted thought concludes.

Step 4 - Locate a small set of exac

In [109]:
len(batches)

23

In [111]:
total_sum = 0
for i,batch in enumerate(batches):
    print(f'Chunk {i}: {tiktoken_len(batch.page_content)}')
    total_sum += tiktoken_len(batch.page_content)

print(f'Total Tokens: {total_sum}')

Chunk 0: 992
Chunk 1: 999
Chunk 2: 993
Chunk 3: 1609
Chunk 4: 785
Chunk 5: 599
Chunk 6: 1017
Chunk 7: 847
Chunk 8: 2143
Chunk 9: 991
Chunk 10: 993
Chunk 11: 874
Chunk 12: 126
Chunk 13: 992
Chunk 14: 1984
Chunk 15: 1000
Chunk 16: 85
Chunk 17: 1057
Chunk 18: 871
Chunk 19: 999
Chunk 20: 988
Chunk 21: 1003
Chunk 22: 18
Total Tokens: 21965


In [117]:
#Code to write the Chunks to JSON file:
final_doc_list = []
for batch in batches:
    temp = {'text' : batch.page_content}
    temp.update(batch.metadata)
    final_doc_list.append(temp)
    


In [121]:
import json
with open(f'Final Transcripts/Elon_Episode.json','w') as f:
    f.write(json.dumps(final_doc_list))

Snowden Epsiode:

In [122]:
#Gonna run through with just one transcript to see how it goes
document = docs[1]

print(document.metadata)

import tiktoken 
tokenizer = tiktoken.get_encoding('cl100k_base')

#Create function to check token length
def tiktoken_len(text):
    tokens= tokenizer.encode(
        text,
        disallowed_special = ()
    )
    return len(tokens)

from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 0)

if tiktoken_len(document.page_content) > 1000:
    # batches = splitter.split_text(document.page_content)
    batches = splitter.split_documents([document])

else:
    batches = document

print(len(batches))

{'source': '/Users/werneckprojects/Documents/GitHub/StreamlitHackathonPGLW/Final Transcripts/rogan_transcripts.json', 'seq_num': 2, 'video_id': 'efs3QRr8LWw', 'episode_title': 'The Joe Rogan Experience #1368 - Edward Snowden', 'guest': 'Edward Snowden', 'video_url': 'www.youtube.com/watch?v=efs3QRr8LWw', 'date_posted': '2019-10-23T13:00:07Z'}
31


In [123]:
total_sum = 0
for i,batch in enumerate(batches):
    print(f'Chunk {i}: {tiktoken_len(batch.page_content)}')
    total_sum += tiktoken_len(batch.page_content)

print(f'Total Tokens: {total_sum}')

Chunk 0: 1004
Chunk 1: 1008
Chunk 2: 1001
Chunk 3: 1008
Chunk 4: 1008
Chunk 5: 1003
Chunk 6: 1003
Chunk 7: 1005
Chunk 8: 1006
Chunk 9: 999
Chunk 10: 998
Chunk 11: 1000
Chunk 12: 997
Chunk 13: 1009
Chunk 14: 1012
Chunk 15: 1002
Chunk 16: 1003
Chunk 17: 1017
Chunk 18: 1005
Chunk 19: 1000
Chunk 20: 1006
Chunk 21: 1002
Chunk 22: 1004
Chunk 23: 999
Chunk 24: 1004
Chunk 25: 1007
Chunk 26: 997
Chunk 27: 993
Chunk 28: 1000
Chunk 29: 1004
Chunk 30: 765
Total Tokens: 30869


In [124]:
snowden_dictionary = smart_chunking(batches)

Index = 0
Step 1 - Determine the main topics before and after the !MIDPOINT! label.

Before Midpoint = Edward Snowden's life and work, Edward Snowden's book and lawsuit, Joe Rogan's show and its impact, Edward Snowden's initial impression of Joe Rogan, Edward Snowden's experience with smear campaigns
After Midpoint = Edward Snowden's identity and role in the revelations of global mass surveillance in 2013, The government's violation of the law and Constitution, The concept of government surveillance, The aftermath of 9/11 and the government's response, The mass surveillance program called stellar wind

Step 2 - Identify if either of these conditions are true. 

The last topic of the first section is not semantically related to the first topic of the second section. The last topic of the first section does not continue on into the text of the second section. 

Step 3 - Decide which course of action must be taken.

The current location of the !MIDPOINT! interrupts the completion of a tho

In [125]:
#Code to write the Chunks to JSON file:
final_doc_list = []
for batch in batches:
    temp = {'text' : batch.page_content}
    temp.update(batch.metadata)
    final_doc_list.append(temp)


In [126]:
len(final_doc_list)

31

In [127]:
import json
with open(f'LangChain Documents/Snowden_Episode.json','w') as f:
    f.write(json.dumps(final_doc_list))

Left off here:
- Need to make the notebook neat with functions
- finish doing the 3rd episode
- check the chunks more in depth

DEVELOPMENT SCRATCH WORK

In [93]:
#Gonna run through with just one transcript to see how it goes
document = docs[0]

In [47]:
print(document)

page_content="welcome back here we go again great to see you and congratulations thank you um you will never forget what is going on in the world when you think about when your child is born you will know for the rest of this child's life you were born during a weird time that's for sure that is for sure they're probably the weirdest that i can remember uh yeah yeah um and he was born on uh may the fourth and yeah that's hilarious too yeah may the fourth be with him yeah exactly it has to be hopefully i sure hope so perfect yes i mean that was the perfect day for you and how do you say the name well uh is it a placeholder first of all my partner is the one that actually mostly came up with the name congratulations to her yeah yeah she's great at names um so i mean it's just x the letter x um and then the ae is like pronounced ash yeah and then a12 a12 is my contribution oh why a12 uh archangel 12 the precursor to the sr-71 coolest plane ever that's true i i agree with you i don't know 

In [94]:
document.metadata

{'source': '/Users/werneckprojects/Documents/GitHub/StreamlitHackathonPGLW/Final Transcripts/rogan_transcripts.json',
 'seq_num': 1,
 'video_id': 'RcYjXbSJBN8',
 'episode_title': 'The Joe Rogan Experience #1470 - Elon Musk',
 'guest': 'Elon Musk',
 'video_url': 'www.youtube.com/watch?v=RcYjXbSJBN8',
 'date_posted': '2020-05-07T13:00:07Z'}

In [95]:
import tiktoken 
tokenizer = tiktoken.get_encoding('cl100k_base')

#Create function to check token length
def tiktoken_len(text):
    tokens= tokenizer.encode(
        text,
        disallowed_special = ()
    )
    return len(tokens)

from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 0)

if tiktoken_len(document.page_content) > 1000:
    # batches = splitter.split_text(document.page_content)
    batches = splitter.split_documents([document])

else:
    batches = document


In [96]:
len(batches)

23

In [85]:
batches[0]

Document(page_content="welcome back here we go again great to see you and congratulations thank you um you will never forget what is going on in the world when you think about when your child is born you will know for the rest of this child's life you were born during a weird time that's for sure that is for sure they're probably the weirdest that i can remember uh yeah yeah um and he was born on uh may the fourth and yeah that's hilarious too yeah may the fourth be with him yeah exactly it has to be hopefully i sure hope so perfect yes i mean that was the perfect day for you and how do you say the name well uh is it a placeholder first of all my partner is the one that actually mostly came up with the name congratulations to her yeah yeah she's great at names um so i mean it's just x the letter x um and then the ae is like pronounced ash yeah and then a12 a12 is my contribution oh why a12 uh archangel 12 the precursor to the sr-71 coolest plane ever that's true i i agree with you i do

Putting Everything together in one function

In [64]:
def create_system_message(text_type, text_title):
    system_message = f'''Given some text, which is part of a {text_type} from {text_title}, your goal is to split the text in half so that no thought or topic is cutoff and the split is performed at the end of a complete thought/topic.

        You will be given steps to follow until the final desired result is achieved. 

        Some important things to note:
        - '!MIDPOINT!' denotes the midpoint of the text.
        - To complete this task effectively you must adhere to strictly to the instructions at each step
        - You must be exact with your output, whenever providing words from the text, copy exactly what is written, even if there are missing words, repeated words or misspellings, it does not matter.

        Step 1 - Determine what the main topics are before and after the !MIDPOINT! label. Main topics are overall topics to which the text is about, not brief things that are mentioned in passing. Create a list of topics as such:
        Before Midpoint = Topic 1, Topic 2, Topic 3
        After Midpoint = Topic 4, Topic 5, Topic 6

        Step 2 - Based on the the lists of Topic Labels you have created identify if either of these conditions are true. 
        - The last topic of the first section is semantically related to the first topic of the second section
        - The last topic of the first section continues on into the text of the second section. 

        Step 3 - Depending on the condition that you have identified from Step 2, decide which of the following course of actions must be taken:
        - If there was semantic overlap between the two topics then the words you split on should be located at the conclusion of overlapping topics. When the transition to the next semantically unrelated topic begins.
        - If there is a continuation of the last topic of the first section past the !MIDPOINT! then the text needs to be split where this continuation ends.
        - If neither of these conditions where meet then you must check if the current location of the !MIDPOINT! interrupts the completion of a thought, if this is the case then the split point should be where the interrupted thought concludes.
        
        ​​Step 4- Based on the course of action you have identified in Step 3, perform this course of action and locate a small set of exact words on which to split the text. The words must be exact and should not be long. 

        Step 5 - Given the exact words on where to split reorganize the topics so that they match the new sections which are determined by the split location.
        For example if in the example from Step 1, Topic 3 and Topic 4 have overlap then the lists would now be as follows:
        Before Split = Topic 1, Topic 2, Topic 3, Topic 4
        After Split = Topic 5, Topic 6

        Final Step - Now that we know where the text should be split and the new organization of topics, provide the final output which is a python dictionary with 3 key-value pairs:
        - "before_split_topics" will be the 'Before Split' list that you identified in Step 3.
        -  "after_split_topics" will be the 'After Split' list that you identified in Step 3.
        - "split_key" will be the exact words that identify where the text should be split.
        An example of what the final output should look like structurally:

        {{"before_split_topics : ["Topic 1", "Topic 2", "Topic 3", "Topic 4",], "after_split_topics" : ["Topic 5", "Topic 6"],"split_key" : "split the text here"}}

        Begin!'''
    
    return system_message

In [75]:
from langchain.prompts import PromptTemplate
def create_user_message(bigtext):
    user_template = PromptTemplate.from_template("TEXT: \n  {bigtext} \n Remember follow the outlined 6 step plan. Write the out the result of each step and then the final output: ")
    return user_template.format(bigtext = bigtext)

In [86]:
batches[0]

Document(page_content="welcome back here we go again great to see you and congratulations thank you um you will never forget what is going on in the world when you think about when your child is born you will know for the rest of this child's life you were born during a weird time that's for sure that is for sure they're probably the weirdest that i can remember uh yeah yeah um and he was born on uh may the fourth and yeah that's hilarious too yeah may the fourth be with him yeah exactly it has to be hopefully i sure hope so perfect yes i mean that was the perfect day for you and how do you say the name well uh is it a placeholder first of all my partner is the one that actually mostly came up with the name congratulations to her yeah yeah she's great at names um so i mean it's just x the letter x um and then the ae is like pronounced ash yeah and then a12 a12 is my contribution oh why a12 uh archangel 12 the precursor to the sr-71 coolest plane ever that's true i i agree with you i do

In [87]:
from langchain.schema import SystemMessage, HumanMessage
from langchain.chat_models import ChatOpenAI
import time
import ast

def smart_chunking(documents):
    #First we pass the LangChain Documents that have already gone through a simple splitter in.

    #Then we create our system message based on the metedata of the documents.

    #Get the Title of the Text so the LLM has some context
    text_title = documents[0].metadata["episode_title"]

    #Give the Type of Text for context
    text_type = "Transcript"

    chat = ChatOpenAI(model="gpt-4", temperature=0)

    #Use this for the two chunks
    final_docs = {}
    changed = []
    dicts_of_changes = []

    for i, chunk in enumerate(documents):
        print(f"Index = {i}")
        if i + 1 < len(documents):
            first = documents[i].page_content
            second = documents[i+1].page_content
            full_text = first + ' !MIDPOINT! ' + second
            hum_message = create_user_message(full_text)
            # hum_message = user_template.format(bigtext = full_text)
            sys_message = create_system_message(text_title=text_title,text_type=text_type)
            messages = [
                SystemMessage(content = sys_message),
                HumanMessage(content = hum_message)
                ]
            response = chat(messages)
            output = response.content

            print(output)

            #Process to extract the dictionary
            
            #Extract index of brackets
            open_bracket_index = output.find('{')
            closed_bracket_index = output.find('}') + 1

            #Slice those indicies
            dictionary_string = output[open_bracket_index:closed_bracket_index]

            #Convert the string to dictionary literal
            boundary_dict = ast.literal_eval(dictionary_string)
            dicts_of_changes.append(boundary_dict)

            total_text = first + ' ' + second

            end_index_first_chunk = total_text.find(boundary_dict['split_key'])
            print(end_index_first_chunk)

            new_first_chunk = total_text[0:end_index_first_chunk]
            new_second_chunk = total_text[end_index_first_chunk:]

            changed.append(new_first_chunk)
            changed.append(new_second_chunk)

            #ADD TO THE DICT THAT KEEPS TRACK OF THE DOCS
            final_docs[f'Chunk {i}'] = {
                'Text' : new_first_chunk, 'Topics' : boundary_dict['before_split_topics']
            }
            final_docs[f'Chunk {i + 1}'] = {
                'Text' : new_second_chunk, 'Topics' : boundary_dict['after_split_topics']
            }

            print(f"Old first: \n {first} \n New first: \n {new_first_chunk} \n Old Second: \n {second} \n New Second: \n {new_second_chunk}")

            documents[i].page_content = new_first_chunk
            documents[i].metadata["Topics"] = boundary_dict['before_split_topics']
            documents[i+1].page_content = new_second_chunk
            documents[i+1].metadata["Topics"] = boundary_dict['after_split_topics']


            time.sleep(5)
    return final_docs

The above function performs the smart chunking on a set of documents. So it goes through and modifies each chunk following the logic we provided it with and then adds the topics for each chunk as well. WE can easily separate the docs into two sets, one more the transcripts with the metadata and one with the topics and the metadata.

We can create separate JSONS from these documents.

In [88]:
final_test = smart_chunking(batches[0:5])

Index = 0
Step 1 - Determine the main topics before and after the !MIDPOINT! label.
Before Midpoint = Elon Musk's new child and its name, AI and neural nets, Elon Musk's view on babies, Comparison between AI and human brain, Future of AI
After Midpoint = Elon Musk selling his houses, Elon Musk's view on possessions, Elon Musk's view on wealth and billionaires, Elon Musk's view on finance and law, Elon Musk's view on manufacturing, Elon Musk's decision to get rid of material possessions

Step 2 - Identify if either of these conditions are true.
The last topic of the first section does not seem to be semantically related to the first topic of the second section. The last topic of the first section does not continue on into the text of the second section.

Step 3 - Decide which course of action must be taken.
The current location of the !MIDPOINT! does not interrupt the completion of a thought. Therefore, the split point should be where the !MIDPOINT! is located.

Step 4 - Locate a small 