In [5]:
import os
import re

def clean_filename(filename):
    """
    Clean the podcast name by removing or replacing unwanted characters like underscores.
    """
    cleaned_name = filename.replace('_', ' ')  # Replace underscores with spaces
    cleaned_name = re.sub(r'[^\w\s]', '', cleaned_name)  # Remove any non-alphanumeric characters except spaces
    return cleaned_name.strip()

def split_document_by_timestamps(document, podcast_name):
    # Regular expression to capture timestamps in the format of "0:00", "1:10", etc.
    timestamp_pattern = r"\d{1,2}:\d{2}"
    
    # Find all the timestamps in the document
    timestamps = re.findall(timestamp_pattern, document)
    
    # Split the document by timestamps
    chunks = re.split(timestamp_pattern, document)
    
    # Remove any empty strings and pair each chunk with its timestamp
    parsed_document = []
    for i in range(len(timestamps)):
        parsed_document.append({
            'timestamp': timestamps[i],
            'text': chunks[i + 1].strip(),  # Remove leading/trailing whitespace
            'podcast_name': podcast_name  # Add the cleaned podcast name field
        })
    
    return parsed_document

# Function to process all text files in a given folder
def process_all_podcasts_in_folder(folder_path):
    all_parsed_documents = []
    
    # Loop through each file in the specified folder
    for filename in os.listdir(folder_path):
        # Check if the file is a text file (you can adjust this condition if needed)
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            podcast_name = os.path.splitext(filename)[0]  # Get the file name without extension
            podcast_name = clean_filename(podcast_name)  # Clean the podcast name
            
            # Read the content of the file
            with open(file_path, 'r') as file:
                document = file.read()
            
            # Split the document and get the parsed data
            parsed_document = split_document_by_timestamps(document, podcast_name)
            
            # Add the parsed document data to the final list
            all_parsed_documents.extend(parsed_document)
    
    return all_parsed_documents

# Specify the folder containing the podcast transcripts
folder_path = '../resources'  # Replace this with the actual folder path

# Process all files in the folder
all_podcasts_data = process_all_podcasts_in_folder(folder_path)

# Output the result (you can further process this or save it to a file)
print(all_podcasts_data[0])

{'timestamp': '0:07', 'text': "Welcome Back To The Head heart and Boots podcast.\nI'm Chris and I'm Brandon.\nJoin us as we wrestle with what it takes to transform ourselves and the businesses we leave this industry.\nHello.\nHow are you?", 'podcast_name': 'Head Heart  Boots  Ep 74   Who Are You Really'}


In [8]:
parsed_document = all_podcasts_data

In [3]:
# installing weavite and openai dependencies

In [34]:
%pip install -U openai==0.28 weaviate-client


Note: you may need to restart the kernel to use updated packages.


In [4]:
import weaviate

In [5]:
%pip freeze | grep openai

openai==1.51.0


  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.


In [29]:
import os
OPENAI_API_KEY=os.environ["OPENAI_API_KEY"]

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [13]:
client = weaviate.connect_to_custom(
            http_host="localhost",
            http_port=8080,
            http_secure=False,
            grpc_host="localhost",
            grpc_port=50051,
            grpc_secure=False,
            headers={
                "X-OpenAI-Api-Key": OPENAI_API_KEY
            },
        )

In [20]:
from openai import OpenAI
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [9]:
# Creating embedings with openai
def get_embeddings(text, model="text-embedding-3-large"):
    text = text.replace("\n", " ")
    return openai_client.embeddings.create(input = [text], model=model).data[0].embedding

# Generate embeddings for each chunk
for chunk in parsed_document:
    chunk['embedding'] = get_embeddings(chunk['text'])



In [26]:
import weaviate

# Initialize the Weaviate client
weaviate_client = weaviate.Client("http://localhost:8080")  # Change the URL to your Weaviate instance


Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  weaviate_client = weaviate.Client("http://localhost:8080")  # Change the URL to your Weaviate instance
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                -

In [27]:
# Define the schema if not already set up
class_schema = {
    "class": "PodcastChunk",
    "properties": [
        {
            "name": "timestamp",
            "dataType": ["string"]
        },
        {
            "name": "text",
            "dataType": ["text"]
        },
        {
            "name": "podcast_name",
            "dataType": ["text"]
        },
        {
            "name": "embedding",
            "dataType": ["number[]"]
        }
    ]
}

weaviate_client.schema.create_class(class_schema)



UnexpectedStatusCodeError: Create class! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "PodcastChunk" already exists'}]}.

In [15]:
wcollection = weaviate_client.collections.get("PodcastChunk")

In [16]:
wcollection

<weaviate.collections.collection.sync.Collection at 0x1097caba0>

In [18]:
for chunk in parsed_document:
    wcollection.data.insert(
        properties={
            "timestamp": chunk['timestamp'],
            "text": chunk['text'],
            "podcast_name": chunk['podcast_name']
        },
        vector=chunk['embedding']
     )

In [28]:
from typing import Optional

def _retrieve_transcripts( query: str, n: Optional[int] = 20):
        query_embedding = openai_client.embeddings.create(input=[query], model='text-embedding-3-large').data[0].embedding
        results = (
            weaviate_client.query
            .get("PodcastChunk", ["timestamp",  "text", "podcast_name"])
            .with_hybrid(
                query=query,  # Replace with relevant search term
                vector=query_embedding,
                alpha=0.75  # Adjust this value to balance between vector and keyword search
            )
            .with_limit(n)  # Increased limit for more results
            .with_additional(["score"])  # Include the score to see relevance
            .do()
        )
        return results['data']['Get']['PodcastChunk']

In [29]:
_retrieve_transcripts("What are common challenges faced in estimating?")

[{'_additional': {'score': '0.8291863'},
  'text': "And it can do anything from helping a new team member assimilate some estimating Beck's practices.\nAnd it also helps the grizzled vets add back that few percent that we've just forgot over time.\nSo, Actionable Insights.\nGet insights.org/floodlight and take a look at what the Actionable Insights Exactimate profile could be doing for you and your team.",
  'timestamp': '41:13'},
 {'_additional': {'score': '0.75'},
  'text': "That's the game changer.\nIt's essentially an AI tool that's walking alongside of you as you write your estimate, bringing things to your attention that should be added that could be considered all of them.\nItems that increase our profitability, increase the effectiveness and the consistency of that scope.",
  'timestamp': '40:55'},
 {'_additional': {'score': '0.69543576'},
  'text': "Maybe that one project was 50% of their net for the year.\nDo you think that changes the discussion?\nIt absolutely does.\nSo, an

In [30]:
# Installing Langchain

In [7]:
%pip install langchain langchain_openai langchain_community



Collecting langchain_community
  Downloading langchain_community-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.5.2-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.1-py3-none-any.whl (2.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━

In [125]:
import asyncio
from typing import Dict, Any, List, Optional
from langchain.chains.base import Chain
from langchain_core.language_models import BaseLanguageModel
from langchain.callbacks.manager import CallbackManagerForChainRun
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Weaviate
import json


TRANSCRIPT_RETRIEVAL_SYSTEM_PROMPT = """You are an AI assistant tasked with retrieving, interpreting, and presenting information from podcast transcript to answer a user's question. Your goal is to provide a detailed answer to user requests.

CRITICAL INSTRUCTIONS:

1. NEVER make up or infer information not present in the provided transcripts.
2. If the transcript don't provide all the information needed to answer the question fully, clearly state what's missing.

Guidelines for retrieval:

1. Analyze all the provided transcripts carefully. 

Your response should help the user understand not only the content of the transcript but also the reasoning behind the selection of each transcript, providing a comprehensive and interpretable overview of the available information."""

RETREIVAL_FORMAT_PROMPT = """
You are an AI language model designed to transform user requests into optimized search queries for a Weaviate index of poscast transcripts, utilizing vector-based search powered by OpenAI embeddings.

Your Task:

Given a user request:

Identify Terms: Extract all terms mentioned in the transcripts.

Expand Concepts: For each term, generate related aspects.

Compose a Descriptive Query: Create a concise and informative sentence or phrase that encompasses the term and its related aspects, effectively describing the kind of transcript being searched for.

Guidelines:

Content: Include key aspects of the term to aid in retrieving relevant transcripts.
Style: Write in a tone typical for podcasts.
Relevance: Ensure the query reflects the user's interest in the podcasts and related.
Conciseness: Keep the query short and focused.
Example:

Only return the formatted query, nothing else.
"""

def parse_llm_output(response: str):
    """
    Parse the LLM output into HTML and summary parts.
    """
    # Split the response into HTML and summary parts
    parts = re.split(r'(</html>)', response, maxsplit=1)
    
    if len(parts) > 1:
        html_content = parts[0] + parts[1]
        summary = parts[2].strip() if len(parts) > 2 else ""
    else:
        html_content = ""
        summary = response.strip()
    
    return html_content, summary

def clean_html(html_content: str):
    """
    Clean and format the HTML content.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    
    # Remove comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    
    # Ensure the table has border and cellpadding
    table = soup.find('table')
    if table:
        table['border'] = '1'
        table['cellpadding'] = '5'
    
    return str(soup)


def format_retreival_query(llm: BaseLanguageModel, user_query: str) -> str:

    human_message = f"Format this user query for articles retreival: {user_query}"

    prompt = ChatPromptTemplate.from_messages([
        ("system", RETREIVAL_FORMAT_PROMPT),
        ("human", human_message),
    ])

    response = llm(prompt.format_messages())
    # import pdb; pdb.set_trace()
    
    return response.content.strip()



class TranscriptRetrievalChain(Chain):
    llm: BaseLanguageModel
    weaviate_client: Any
    openai_client: Any
    cl_instance: Any
    alpha: float = 0.75
    
    class Config:
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        return ["question"]

    @property
    def output_keys(self) -> List[str]:
        return ["answer"]

    def sort_by_timestamp(self, results: List[Dict]) -> List[Dict]:
        def timestamp_to_seconds(timestamp: str) -> int:
            minutes, seconds = map(int, timestamp.split(':'))
            return minutes * 60 + seconds
        return sorted(results, key=lambda transcript: timestamp_to_seconds(transcript['timestamp']))

    def _retrieve_transcripts(self, query: str, n: int = 20, rich_author_meta=False) -> List[Dict]:

        query_formatted = format_retreival_query(self.llm, query)

        print("Formatted query: ", query_formatted)
        
        #self.cl_instance
        

        query_embedding = self.openai_client.embeddings.create(input=[query_formatted], model='text-embedding-3-large').data[0].embedding

        fields = ["timestamp", "text", "podcast_name"]
    
        results = (
            self.weaviate_client.query
            .get("PodcastChunk", fields)
            .with_hybrid(
                query=query,
                vector=query_embedding,
                alpha=self.alpha
            )
            .with_limit(n)
            .with_additional(["score"])
            .do()
        )

        return results['data']['Get']['PodcastChunk']

    def transcript_synthesize(self, results: List[Dict], original_question: str, task: str) -> str:
        system_prompt = TRANSCRIPT_RETRIEVAL_SYSTEM_PROMPT
        
        transcripts = results

        # Format the transcripts for the prompt
        formatted_transcripts = json.dumps(transcripts, indent=2)
        human_message_template = """Original question: {question}
        PodcastChunk:
        {transcripts}
        Please {task} the information from these transcripts to answer the original question."""
        
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", human_message_template),
        ])

        # import pdb; pdb.set_trace()

        response = self.llm(prompt.format_messages(
            question=original_question,
            transcripts=formatted_transcripts,
            task = task,
        ))

        return response.content

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None
    ) -> Dict[str, str]:
        question = inputs["question"]
        task = inputs.get("task", "summary")  # Default to summary if task is not provided
        
        # Retrieve transcripts
        results = await asyncio.to_thread(self._retrieve_transcripts, question)
        
        results_filtered = await asyncio.to_thread(self.sort_by_timestamp, results)
       
        # Synthesize final answer
        final_answer = await asyncio.to_thread(self.transcript_synthesize, results_filtered, question, task)
        
        return {"answer": final_answer}

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None
    ) -> Dict[str, str]:
        question = inputs["question"]
        task = inputs.get("task", "summary")  # Default to summary if task is not provided

        # Retrieve articles
        results = self._retrieve_transcripts(question)
       
        # Synthesize final answer
        final_answer = self.transcript_synthesize(results_filtered, question, task)
        
        return {"answer": final_answer}

/opt/homebrew/Cellar/jupyterlab/4.2.5_1/libexec/lib/python3.12/site-packages/pydantic/_internal/_config.py:291: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/


In [126]:
from typing import Dict, Any
from openai import OpenAI
import os
import weaviate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory


class TranscriptAgent:
    def __init__(self, cl_instance=None, alpha=.75):
        self.openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
        self.weaviate_client = weaviate.Client("http://localhost:8080")
        # self.cl_instance = cl_instance
        # self.logger = setup_logging()
        # self.logger.info("Transcipt initialized")

        self.llm = ChatOpenAI(model="gpt-4o", temperature=0)
        self.memory = ConversationBufferMemory(return_messages=True)
        
        self.transcript_chain = TranscriptRetrievalChain(
            llm=self.llm,
            weaviate_client=self.weaviate_client,
            openai_client=self.openai_client,
            cl_instance=cl_instance,
            alpha=alpha
        )
        

    async def route_and_execute(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        question = inputs["question"]
        result = await self.transcript_chain.ainvoke({"question": question, "task": "retrieve"})
        
        print(f"Query: {question}")
        print(f"Response: {result['answer']}")
        
        return result

    async def process_query(self, query: str, user_id: str) -> str:
        self.memory.chat_memory.add_user_message(query)
        result = await self.route_and_execute({"question": query})
        answer = result.get("answer", "I'm sorry, I couldn't generate a response for this query.")
        print(f"User {user_id} - Query: {query}")
        print(f"User {user_id} - Response: {answer}")
        return answer

In [127]:
# Setting up chain and testing

In [128]:
agent = TranscriptAgent()


Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  self.weaviate_client = weaviate.Client("http://localhost:8080")
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weavi

In [132]:
response = await agent.process_query("Which roles typically deal with the estimating process?", 1)

Formatted query:  Exploring roles like project managers, cost estimators, and quantity surveyors involved in the estimating process.
Query: Which roles typically deal with the estimating process?
Response: The provided podcast transcripts do not explicitly mention which roles typically deal with the estimating process. However, there are some relevant insights that can be inferred:

1. **Department Heads and GMs**: One transcript mentions department heads and general managers (GMs) becoming part of the production cycle, which could imply involvement in processes like estimating, especially if it relates to production efficiency and cost management (timestamp: 1:07).

2. **Finance Directors**: The same transcript also mentions finance directors, who are likely involved in the estimating process due to their role in managing financial aspects, including cost estimation and budgeting (timestamp: 1:07).

3. **Chief Engineer or Director of Facilities**: Another transcript mentions creating 