In [5]:
import os
import chromadb
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chromadb.config import Settings
from typing import List, Dict, Any
# Use Sentence Transformers for free embeddings
from sentence_transformers import SentenceTransformer



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def load_stories(directory):
    """
    function to read stories from text files.
    """
    stories = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory, filename)
            loader = TextLoader(filepath, encoding='utf-8')
            documents = loader.load()
            
            # Extract story title from filename (remove .txt)
            story_title = os.path.splitext(filename)[0]
            
            # Combine document contents
            full_text = " ".join([doc.page_content for doc in documents])
            
            stories.append({
                story_title : full_text
            })
    
    return stories

In [7]:
a = load_stories("./data")
a[:3]

[{'a-mother': 'A Mother\n\nMr Holohan, assistant secretary of the Eire Abu Society, had been walking up and down Dublin for nearly a month, with his hands and pockets full of dirty pieces of paper, arranging about the series of concerts. He had a game leg and for this his friends called him Hoppy Holohan. He walked up and down constantly, stood by the hour at street corners arguing the point and made notes; but in the end it was Mrs Kearney who arranged everything.\n\nMiss Devlin had become Mrs Kearney out of spite. She had been educated in a high-class convent, where she had learned French and music. As she was naturally pale and unbending in manner she made few friends at school. When she came to the age of marriage she was sent out to many houses where her playing and ivory manners were much admired. She sat amid the chilly circle of her accomplishments, waiting for some suitor to brave it and offer her a brilliant life. But the young men whom she met were ordinary and she gave them

In [13]:
def compute_embeddings(stories: List[Dict[str, Any]], 
                        persist_directory: str = './chroma_db'):

    # Initialize embedding model
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    
    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(
        path=persist_directory, 
        settings=Settings(anonymized_telemetry=False)
    )
    
    # Create or get collection
    collection = chroma_client.get_or_create_collection(name="stories")
    
    # Process and embed stories
    for story in stories:
        # Split story into chunks for better embedding
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, 
            chunk_overlap=200
        )
        
        # Split story content into chunks
        text_chunks = text_splitter.split_text(story['content'])
        
        # Embed and store chunks
        for chunk_idx, chunk in enumerate(text_chunks):
            # Generate embedding using Sentence Transformers
            embedding = model.encode(chunk).tolist()
            
            collection.add(
                embeddings=[embedding],
                documents=[chunk],
                metadatas=[{
                    'story_title': story['title'], 
                    'chunk_index': chunk_idx
                }],
                ids=[f"{story['title']}_{chunk_idx}"]
            )
    
    print(f"Embedded {len(stories)} stories in ChromaDB")


In [8]:

def compute_embeddings(stories: List[Dict[str, Any]], 
                        persist_directory: str = './chroma_db'):

    # Initialize embedding model
    model = SentenceTransformer("all-MiniLM-L6-v2")
    
    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(
        path=persist_directory, 
        settings=Settings(anonymized_telemetry=False)
    )
    
    # Create or get collection
    collection = chroma_client.get_or_create_collection(name="stories")
    
    # Process and embed stories
    for story in stories:
        # Since each story is a dictionary with a single key-value pair, unpack it
        for title, content in story.items():
            # Split story into chunks for better embedding
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, 
                chunk_overlap=200
            )
            
            # Split the story content into chunks
            text_chunks = text_splitter.split_text(content)
            
            # Embed and store chunks
            for chunk_idx, chunk in enumerate(text_chunks):
                # Generate embedding using Sentence Transformers
                embedding = model.encode(chunk).tolist()
                
                collection.add(
                    embeddings=[embedding],
                    documents=[chunk],
                    metadatas=[{
                        'story_title': title,  # Use the title (key) for metadata
                        'chunk_index': chunk_idx
                    }],
                    ids=[f"{title}_{chunk_idx}"]  # Create unique ID for each chunk
                )
    
    print(f"Embedded {len(stories)} stories in ChromaDB")


In [4]:
data = [
    {'a-mother':'A Mother\n\nMr Holohan, assistant secretary of the Eire Abu Society, had been walking up and down Dublin for nearly a month, with his hands and pockets full of dirty pieces of paper, arranging aboutal.\n'},
    {'sorrow' : 'Sorrow\n\nTHE turner, Grigory Petrov'}
]


for story in data:
    # Since each story is a dictionary with a single key-value pair, unpack it
    for title, content in story.items():
        # Split story into chunks for better embedding
        # print(title)
        print(content)

        print("\n############")




A Mother

Mr Holohan, assistant secretary of the Eire Abu Society, had been walking up and down Dublin for nearly a month, with his hands and pockets full of dirty pieces of paper, arranging aboutal.


############
Sorrow

THE turner, Grigory Petrov

############


In [9]:

a = load_stories("./data")
compute_embeddings(a)

Embedded 5 stories in ChromaDB


## load information

In [10]:
import os
import json
from typing import List, Dict, Any

import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings

# Use Groq for text generation
from groq import Groq

In [11]:
def extract_character_info(character_name: str, 
                            persist_directory: str = './chroma_db', 
                            model_name: str = 'all-MiniLM-L6-v2') -> Dict[str, Any]:
    """
    Extract character information from embedded stories.
    """
    # Initialize embedding model
    model = SentenceTransformer(model_name)
    
    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(
        path=persist_directory, 
        settings=Settings(anonymized_telemetry=False)
    )
    
    # Get collection
    collection = chroma_client.get_or_create_collection(name="stories")
    
    # Embed character name
    character_embedding = model.encode(character_name).tolist()
    
    # Search for character mentions
    results = collection.query(
        query_embeddings=[character_embedding],
        n_results=5  # Top 5 most relevant chunks
    )
    
    return results

In [12]:
result = extract_character_info("Eliza")
print(result)

{'ids': [['the-lantern-keepers_5', 'the-lantern-keepers_2', 'the-lantern-keepers_1', 'the-lantern-keepers_3', 'the-schoolmistress_19']], 'embeddings': None, 'documents': [['One evening, as the first snow of winter fell, Eliza stood by the lantern and gazed out at the town below. She thought of Callum and the countless others who had tended the light before her. She wondered about the keepers yet to come.\n\nAnd as she lit the lantern, its beam stretching far into the night, she whispered, “May it always guide those who are lost.”', 'That night, Eliza walked back up to the lighthouse. Callum was inside, adjusting the mechanism. “Tell me,” she asked, “do you believe it’s magic?”\n\nCallum chuckled, though it sounded more like a cough. “Magic, faith, coincidence—it doesn’t matter what you call it. It works.”\n\nBefore Eliza could ask more, a boy burst in. His name was Ollie, the baker’s son, and his face was pale. “Please,” he gasped, “my sister’s gone missing in the woods!”\n\nCallum did

In [13]:
# Access the 'metadatas' key
data = result['metadatas']
print(data)

[[{'chunk_index': 5, 'story_title': 'the-lantern-keepers'}, {'chunk_index': 2, 'story_title': 'the-lantern-keepers'}, {'chunk_index': 1, 'story_title': 'the-lantern-keepers'}, {'chunk_index': 3, 'story_title': 'the-lantern-keepers'}, {'chunk_index': 19, 'story_title': 'the-schoolmistress'}]]


In [14]:
result = extract_character_info("john")
# print(result)
data = result['metadatas']
# print(data)

# Initialize an empty dictionary to store the counts
title_counts = {}

# Iterate through the list of dictionaries
for sublist in data:
    for item in sublist:
        title = item['story_title']
        if title in title_counts:
            title_counts[title] += 1
        else:
            title_counts[title] = 1

# Sort the dictionary by the count of occurrences in descending order
sorted_title_counts = dict(sorted(title_counts.items(), key=lambda item: item[1], reverse=True))

# Print the resulting dictionary
print(sorted_title_counts)


{'the-poor-relations-story': 5}


In [15]:
stories = load_stories("./data")
story = stories[:2]
story


[{'a-mother': 'A Mother\n\nMr Holohan, assistant secretary of the Eire Abu Society, had been walking up and down Dublin for nearly a month, with his hands and pockets full of dirty pieces of paper, arranging about the series of concerts. He had a game leg and for this his friends called him Hoppy Holohan. He walked up and down constantly, stood by the hour at street corners arguing the point and made notes; but in the end it was Mrs Kearney who arranged everything.\n\nMiss Devlin had become Mrs Kearney out of spite. She had been educated in a high-class convent, where she had learned French and music. As she was naturally pale and unbending in manner she made few friends at school. When she came to the age of marriage she was sent out to many houses where her playing and ivory manners were much admired. She sat amid the chilly circle of her accomplishments, waiting for some suitor to brave it and offer her a brilliant life. But the young men whom she met were ordinary and she gave them

In [21]:
import os
import glob
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq

# Load environment variables
load_dotenv()


# Define a Pydantic model for structured output
class CharacterVerification(BaseModel):
    """
    Structured model for character verification in a story
    """
    character_present: int = Field(
        description="1 if character exists in the story, 0 if not",
        ge=0,  # Greater than or equal to 0
        le=1   # Less than or equal to 1
    )

def verify_character_in_story(story_text: str, character_name: str) -> int:
    """
    Verify if a character exists in a given story using LangChain and Groq
    """
    # Initialize JSON output parser
    output_parser = JsonOutputParser(pydantic_object=CharacterVerification)
    
    # Create prompt template
    prompt = PromptTemplate(
        template="""
        Analyze the following story and determine if the character '{character_name}' is present.
        
        Story:
        {story_text}
        
        {format_instructions}
        
        Respond with 1 if the character is in the story, 0 if not.
        """,
        input_variables=["story_text", "character_name"],
        partial_variables={
            "format_instructions": output_parser.get_format_instructions()
        }
    )
    
    # Initialize Groq LLM
    llm = ChatGroq(
        temperature=0,  # Most precise output
        model_name="llama3-8b-8192",
        api_key=os.getenv('GROQ_API_KEY')
    )
    
    # Create the chain
    chain = prompt | llm | output_parser
    
    try:
        # Invoke the chain
        result = chain.invoke({
            "story_text": story_text, 
            "character_name": character_name
        })
        
        # Return the verification result
        return result.get('character_present', 0)
    
    except Exception as e:
        print(f"Error in character verification: {e}")
        return 0



In [30]:
stories = load_stories("./data")
abc = "a-mother"
for title, content in stories.items():
    print(title)


AttributeError: 'list' object has no attribute 'items'

In [23]:
stories = load_stories("./data")
story = stories[0]['sorrow']
story

character = "mayur"
result = verify_character_in_story(story, character)
print(f"Character '{character}': {result}")

KeyError: 'sorrow'

In [80]:
import os
import json
import glob
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq

# Load environment variables
load_dotenv()


# Define a Pydantic model for structured character information
class CharacterInfo(BaseModel):
    """
    Structured model for character information in a story
    """
    name: str = Field(description="Full name of the character")
    storyTitle: str = Field(description="Title of the story")
    summary: str = Field(description="Brief summary of the character's story")
    relations: list = Field(description="List of character's relationships")
    characterType: str = Field(description="Character's role in the story")

def extract_character_info(story_text: str, character_name: str) -> dict:
    """
    Extract detailed information about a character from a story
    
    :param story_text: Full text of the story
    :param character_name: Name of the character to extract info about
    :return: Structured character information as a dictionary
    """
    # Initialize JSON output parser
    output_parser = JsonOutputParser(pydantic_object=CharacterInfo)
    
    # Create prompt template
    prompt = PromptTemplate(
        template="""
        Analyze the following story and extract comprehensive information about the character '{character_name}'.
        
        Story:
        {story_text}
        
        {format_instructions}
        
        Provide a detailed JSON response with:
        - Character's full name
        - Story title
        - Character's role and journey summary
        - Relationships with other characters
        - Character type (protagonist, antagonist, side character, etc.)
        
        If the character is not found, return an empty JSON object.
        """,
        input_variables=["story_text", "character_name"],
        partial_variables={
            "format_instructions": output_parser.get_format_instructions()
        }
    )
    
    # Initialize Groq LLM
    llm = ChatGroq(
        temperature=0.7,  # Allow some creativity in summarization
        model_name="llama3-8b-8192",
        api_key=os.getenv('GROQ_API_KEY')
    )
    
    # Create the chain
    chain = prompt | llm | output_parser
    
    try:
        # Invoke the chain
        result = chain.invoke({
            "story_text": story_text, 
            "character_name": character_name
        })
        
        return result
    
    except Exception as e:
        print(f"Error in character information extraction: {e}")
        return {}

# def main():
#     # Load stories from data directory
#     stories = load_stories('./data')
    
#     # Characters to extract information about
#     characters_to_extract = [
#         "Jon Snow",
#         "Arya Stark",
#         "Tyrion Lannister"
#     ]
    
#     # Extract character information across all loaded stories
#     for story_name, story_text in stories.items():
#         print(f"\n--- Story: {story_name} ---")
        
#         for character in characters_to_extract:
#             # Extract character information
#             character_info = extract_character_info(story_text, character)
            
#             # Print formatted JSON output
#             print(f"Character: {character}")
#             print(json.dumps(character_info, indent=2))
#             print("-" * 50)

# if __name__ == '__main__':
#     main()

In [85]:
stories = load_stories("./data")
story = stories[2]['content']
story

character = "Eliza"
result = extract_character_info(story, character)
print(f"Character '{character}': {result}")

Character 'Eliza': {'name': 'Eliza', 'storyTitle': 'The Lantern Keepers', 'summary': 'Eliza, a young artist, becomes the keeper of the mysterious lighthouse in Bramblewick, guiding those who are lost and learning the stories of those the light has saved.', 'relations': [{'name': 'Callum', 'relationship': 'mentor'}, {'name': 'Ollie', 'relationship': 'friend'}, {'name': 'Tilly', 'relationship': 'helped'}], 'characterType': 'protagonist'}


## ChromaDB output format

In [82]:
# Provided dictionary
data = {
  'documents': [[
      'This is a document about pineapple',
      'This is a document about oranges'
  ]],
  'ids': [['id1', 'id2']],
  'distances': [[1.0404009819030762, 1.243080496788025]],
  'uris': None,
  'data': None,
  'metadatas': [[None, None]],
  'embeddings': None,
}

# Access the 'metadatas' key
metadatas = data['metadatas']

# Print the metadatas
print(metadatas)


[[None, None]]


In [83]:
import chromadb
from chromadb.config import Settings

def print_collection_structure(collection_name='stories', persist_directory='./chroma_db'):
    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(
        path=persist_directory, 
        settings=Settings(anonymized_telemetry=False)
    )
    
    # Get or create collection
    collection = chroma_client.get_or_create_collection(name=collection_name)
    
    # Print collection details
    print("Collection Name:", collection_name)
    
    # Get all document IDs in the collection
    document_ids = collection.get_all_document_ids()
    print("\nDocument IDs:", document_ids)
    
    # Get all documents in the collection
    documents = collection.get_all_documents()
    print("\nDocuments:")
    for doc in documents:
        print(f"ID: {doc['id']}, Document: {doc['document']}, Metadata: {doc['metadata']}")
    
    # Get all embeddings in the collection
    embeddings = collection.get_all_embeddings()
    print("\nEmbeddings:")
    for embedding in embeddings:
        print(embedding)
    
    # If needed, you can also print metadata and other details
    metadata = collection.get_all_metadata()
    print("\nMetadata:")
    for meta in metadata:
        print(meta)

# Example usage
print_collection_structure(collection_name="stories", persist_directory="./chroma_db")


Collection Name: stories


AttributeError: 'Collection' object has no attribute 'get_all_document_ids'