In [6]:
import os
import chromadb
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chromadb.config import Settings
from typing import List, Dict, Any
# Use Sentence Transformers for free embeddings
from sentence_transformers import SentenceTransformer



In [7]:
def load_stories(directory):
    """
    function to read stories from text files.
    """
    stories = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory, filename)
            loader = TextLoader(filepath, encoding='utf-8')
            documents = loader.load()
            
            # Extract story title from filename (remove .txt)
            story_title = os.path.splitext(filename)[0]
            
            # Combine document contents
            full_text = " ".join([doc.page_content for doc in documents])
            
            stories.append({
                'title': story_title,
                'content': full_text
            })
    
    return stories

In [8]:
a = load_stories("./data")
a[0]

{'title': 'a-mother',
 'content': 'A Mother\n\nMr Holohan, assistant secretary of the Eire Abu Society, had been walking up and down Dublin for nearly a month, with his hands and pockets full of dirty pieces of paper, arranging about the series of concerts. He had a game leg and for this his friends called him Hoppy Holohan. He walked up and down constantly, stood by the hour at street corners arguing the point and made notes; but in the end it was Mrs Kearney who arranged everything.\n\nMiss Devlin had become Mrs Kearney out of spite. She had been educated in a high-class convent, where she had learned French and music. As she was naturally pale and unbending in manner she made few friends at school. When she came to the age of marriage she was sent out to many houses where her playing and ivory manners were much admired. She sat amid the chilly circle of her accomplishments, waiting for some suitor to brave it and offer her a brilliant life. But the young men whom she met were ordina

In [9]:
def compute_embeddings(stories: List[Dict[str, Any]], 
                        persist_directory: str = './chroma_db'):

    # Initialize embedding model
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    
    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(
        path=persist_directory, 
        settings=Settings(anonymized_telemetry=False)
    )
    
    # Create or get collection
    collection = chroma_client.get_or_create_collection(name="stories")
    
    # Process and embed stories
    for story in stories:
        # Split story into chunks for better embedding
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, 
            chunk_overlap=200
        )
        
        # Split story content into chunks
        text_chunks = text_splitter.split_text(story['content'])
        
        # Embed and store chunks
        for chunk_idx, chunk in enumerate(text_chunks):
            # Generate embedding using Sentence Transformers
            embedding = model.encode(chunk).tolist()
            
            collection.add(
                embeddings=[embedding],
                documents=[chunk],
                metadatas=[{
                    'story_title': story['title'], 
                    'chunk_index': chunk_idx
                }],
                ids=[f"{story['title']}_{chunk_idx}"]
            )
    
    print(f"Embedded {len(stories)} stories in ChromaDB")


In [10]:

a = load_stories("./data")
compute_embeddings(a)

Add of existing embedding ID: a-mother_0
Insert of existing embedding ID: a-mother_0
Add of existing embedding ID: a-mother_1
Insert of existing embedding ID: a-mother_1
Add of existing embedding ID: a-mother_2
Insert of existing embedding ID: a-mother_2
Add of existing embedding ID: a-mother_3
Insert of existing embedding ID: a-mother_3
Add of existing embedding ID: a-mother_4
Insert of existing embedding ID: a-mother_4
Add of existing embedding ID: a-mother_5
Insert of existing embedding ID: a-mother_5
Add of existing embedding ID: a-mother_6
Insert of existing embedding ID: a-mother_6
Add of existing embedding ID: a-mother_7
Insert of existing embedding ID: a-mother_7
Add of existing embedding ID: a-mother_8
Insert of existing embedding ID: a-mother_8
Add of existing embedding ID: a-mother_9
Insert of existing embedding ID: a-mother_9
Add of existing embedding ID: a-mother_10
Insert of existing embedding ID: a-mother_10
Add of existing embedding ID: a-mother_11
Insert of existing em

Embedded 5 stories in ChromaDB


## load information

In [14]:
import os
import json
from typing import List, Dict, Any

import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings

# Use Groq for text generation
from groq import Groq

In [16]:
def extract_relevant_chunks(character_name: str, 
                            persist_directory: str = './chroma_db', 
                            model_name: str = 'all-MiniLM-L6-v2') -> Dict[str, Any]:
    """
    Extract character information from embedded stories.
    """
    # Initialize embedding model
    model = SentenceTransformer(model_name)
    
    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(
        path=persist_directory, 
        settings=Settings(anonymized_telemetry=False)
    )
    
    # Get collection
    collection = chroma_client.get_or_create_collection(name="stories")
    
    # Embed character name
    character_embedding = model.encode(character_name).tolist()
    
    # Search for character mentions
    results = collection.query(
        query_embeddings=[character_embedding],
        n_results=5  # Top 5 most relevant chunks
    )
    
    return results

In [None]:
result = extract_relevant_chunks("Eliza")
print(result)

{'ids': [['the-lantern-keepers_5', 'the-lantern-keepers_2', 'the-lantern-keepers_1', 'the-lantern-keepers_3', 'the-schoolmistress_19']], 'embeddings': None, 'documents': [['One evening, as the first snow of winter fell, Eliza stood by the lantern and gazed out at the town below. She thought of Callum and the countless others who had tended the light before her. She wondered about the keepers yet to come.\n\nAnd as she lit the lantern, its beam stretching far into the night, she whispered, “May it always guide those who are lost.”', 'That night, Eliza walked back up to the lighthouse. Callum was inside, adjusting the mechanism. “Tell me,” she asked, “do you believe it’s magic?”\n\nCallum chuckled, though it sounded more like a cough. “Magic, faith, coincidence—it doesn’t matter what you call it. It works.”\n\nBefore Eliza could ask more, a boy burst in. His name was Ollie, the baker’s son, and his face was pale. “Please,” he gasped, “my sister’s gone missing in the woods!”\n\nCallum did

In [18]:
# Access the 'metadatas' key
data = result['metadatas']
print(data)

[[{'chunk_index': 5, 'story_title': 'the-lantern-keepers'}, {'chunk_index': 2, 'story_title': 'the-lantern-keepers'}, {'chunk_index': 1, 'story_title': 'the-lantern-keepers'}, {'chunk_index': 3, 'story_title': 'the-lantern-keepers'}, {'chunk_index': 19, 'story_title': 'the-schoolmistress'}]]


In [19]:
result = extract_relevant_chunks("john")
# print(result)
data = result['metadatas']
# print(data)

# Initialize an empty dictionary to store the counts
title_counts = {}

# Iterate through the list of dictionaries
for sublist in data:
    for item in sublist:
        title = item['story_title']
        if title in title_counts:
            title_counts[title] += 1
        else:
            title_counts[title] = 1

# Sort the dictionary by the count of occurrences in descending order
sorted_title_counts = dict(sorted(title_counts.items(), key=lambda item: item[1], reverse=True))

# Print the resulting dictionary
print(sorted_title_counts)


{'the-poor-relations-story': 5}


In [21]:
import os
import glob
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq

# Load environment variables
load_dotenv()


# Define a Pydantic model for structured output
class CharacterVerification(BaseModel):
    """
    Structured model for character verification in a story
    """
    character_present: int = Field(
        description="1 if character exists in the story, 0 if not",
        ge=0,  # Greater than or equal to 0
        le=1   # Less than or equal to 1
    )

def verify_character_in_story(story_text: str, character_name: str) -> int:
    """
    Verify if a character exists in a given story using LangChain and Groq
    """
    # Initialize JSON output parser
    output_parser = JsonOutputParser(pydantic_object=CharacterVerification)
    
    # Create prompt template
    prompt = PromptTemplate(
        template="""
        Analyze the following story and determine if the character '{character_name}' is present.
        
        Story:
        {story_text}
        
        {format_instructions}
        
        Respond with 1 if the character is in the story, 0 if not.
        """,
        input_variables=["story_text", "character_name"],
        partial_variables={
            "format_instructions": output_parser.get_format_instructions()
        }
    )
    
    # Initialize Groq LLM
    llm = ChatGroq(
        temperature=0,  # Most precise output
        model_name="llama3-8b-8192",
        api_key=os.getenv('GROQ_API_KEY')
    )
    
    # Create the chain
    chain = prompt | llm | output_parser
    
    try:
        # Invoke the chain
        result = chain.invoke({
            "story_text": story_text, 
            "character_name": character_name
        })
        
        # Return the verification result
        return result.get('character_present', 0)
    
    except Exception as e:
        print(f"Error in character verification: {e}")
        return 0




For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [22]:
stories = load_stories("./data")
story = stories[0]['content']
story

character = "mayur"
result = verify_character_in_story(story, character)
print(f"Character '{character}': {result}")

Character 'mayur': 0


In [23]:
import os
import json
import glob
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq

# Load environment variables
load_dotenv()


# Define a Pydantic model for structured character information
class CharacterInfo(BaseModel):
    """
    Structured model for character information in a story
    """
    name: str = Field(description="Full name of the character")
    storyTitle: str = Field(description="Title of the story")
    summary: str = Field(description="Brief summary of the character's story")
    relations: list = Field(description="List of character's relationships")
    characterType: str = Field(description="Character's role in the story")

def extract_character_info(story_text: str, character_name: str) -> dict:
    """
    Extract detailed information about a character from a story
    
    :param story_text: Full text of the story
    :param character_name: Name of the character to extract info about
    :return: Structured character information as a dictionary
    """
    # Initialize JSON output parser
    output_parser = JsonOutputParser(pydantic_object=CharacterInfo)
    
    # Create prompt template
    prompt = PromptTemplate(
        template="""
        Analyze the following story and extract comprehensive information about the character '{character_name}'.
        
        Story:
        {story_text}
        
        {format_instructions}
        
        Provide a detailed JSON response with:
        - Character's full name
        - Story title
        - Character's role and journey summary
        - Relationships with other characters
        - Character type (protagonist, antagonist, side character, etc.)
        
        If the character is not found, return an empty JSON object.
        """,
        input_variables=["story_text", "character_name"],
        partial_variables={
            "format_instructions": output_parser.get_format_instructions()
        }
    )
    
    # Initialize Groq LLM
    llm = ChatGroq(
        temperature=0.7,  # Allow some creativity in summarization
        model_name="llama3-8b-8192",
        api_key=os.getenv('GROQ_API_KEY')
    )
    
    # Create the chain
    chain = prompt | llm | output_parser
    
    try:
        # Invoke the chain
        result = chain.invoke({
            "story_text": story_text, 
            "character_name": character_name
        })
        
        return result
    
    except Exception as e:
        print(f"Error in character information extraction: {e}")
        return {}



In [27]:
stories = load_stories("./data")

# Convert the list of dictionaries into a dictionary for quick access
content_dict = {item['title']: item['content'] for item in stories}


# Get the content for the provided title
story = content_dict.get("a-mother")


character = "Holohan"
result = extract_character_info(story, character)
print(f"Character '{character}': {result}")

Character 'Holohan': {'name': 'Mr. Holohan', 'storyTitle': 'A Mother', 'summary': 'Mr. Holohan is an assistant secretary of the Eire Abu Society, responsible for organizing a series of concerts. He is a novice in the music business and relies on Mrs. Kearney for advice and guidance.', 'relations': [{'name': 'Mrs. Kearney', 'relationship': 'Collaborator'}], 'characterType': 'Side Character'}


## ChromaDB output format

In [82]:
# Provided dictionary
data = {
  'documents': [[
      'This is a document about pineapple',
      'This is a document about oranges'
  ]],
  'ids': [['id1', 'id2']],
  'distances': [[1.0404009819030762, 1.243080496788025]],
  'uris': None,
  'data': None,
  'metadatas': [[None, None]],
  'embeddings': None,
}

# Access the 'metadatas' key
metadatas = data['metadatas']

# Print the metadatas
print(metadatas)


[[None, None]]
