In [25]:
from rag_tests.db.mongo import connection
from rag_tests.core import get_logger
# Logger for this notebook, obtained from the project's get_logger helper under this module's name.
logger = get_logger(__name__)
# Alias for the imported Mongo connection object; the fetch helper below calls get_database on it.
mongo_db_connector = connection

In [26]:
def fetch_posts_from_mongo_by_author_firstname(firstname:str):
    """Find posts whose ``author_id`` contains ``firstname``, ignoring case.

    The lookup runs against the ``posts`` collection of the ``scrabble``
    database, reached through the module-level ``mongo_db_connector``.

    Args:
        firstname: Text to look for inside ``author_id``. It is matched
            literally: regex metacharacters are escaped before the value is
            used as a ``$regex`` pattern.

    Returns:
        Whatever ``collection.find`` returns for the query (iterated as a
        cursor of post documents elsewhere in this notebook).
    """
    import re  # local import so this cell does not depend on another cell's imports

    _database = mongo_db_connector.get_database("scrabble")
    collection = _database["posts"]
    # Escape the input so characters such as "." or "(" are matched as plain
    # text instead of being interpreted as regular-expression syntax.
    literal_pattern = re.escape(firstname)
    return collection.find({"author_id": {"$regex": literal_pattern, "$options": "i"}})

In [27]:
from rag_tests.models.base import BaseModel
from rag_tests.models import PostsRawModel, PostChunkModel, PostCleanedModel, PostEmbeddedChunkModel
from qdrant_client.models import Batch
from rag_tests.models.clean import PostCleanedModel
from rag_tests.data_logic.cleaning_data_handlers import PostCleaningHandler
from rag_tests.data_logic.chunking_data_handlers import PostChunkingHandler
from rag_tests.data_logic.embedding_data_handlers import PostEmbeddingDataHandler
from rag_tests.qdrant_db import QdrantDatabaseConnector
from rag_tests.config import settings
# Name of the vector collection used by the later cells: a fixed prefix plus
# the embedding model id with every "/" replaced by "_".
_embedding_model_slug = settings.EMBEDDING_MODEL_ID.replace('/', '_')
collection_name = f"TEST_VARUN_{_embedding_model_slug}"

In [4]:
# def fetch_posts_from_warehouse_by_linkedin_id(linkedin_id:str):
#     _data_warehouse= mongo_db_connector.get_database("srabble_raw")
#     collection = _data_warehouse["linkedin_users_all_data"]

In [5]:
# Fetch every post whose author_id contains "varun" and wrap each distinct
# document (by _id) in a PostsRawModel.
cursor = fetch_posts_from_mongo_by_author_firstname("varun")
raw_posts = []
seen_posts = set()
for position, post in enumerate(cursor):
    if position == 0:
        # Show the field names of the first document only.
        print(post.keys())
    entry_id = post["_id"]
    if entry_id in seen_posts:
        continue
    seen_posts.add(entry_id)
    # _id is passed separately as entry_id, so it is left out of the keyword fields.
    post_fields = {key: value for key, value in post.items() if key != "_id"}
    raw_posts.append(PostsRawModel(entry_id=entry_id, **post_fields))


dict_keys(['_id', 'platform', 'content', 'author_id'])


In [6]:
# Cap the number of posts carried through the rest of the pipeline.
MAX_RAW_POSTS = 110
raw_posts = raw_posts[:MAX_RAW_POSTS]

In [None]:
# Run every raw post through the cleaning handler, preserving order.
cleaning_handler = PostCleaningHandler()
cleaned_posts = [cleaning_handler.clean(raw_post) for raw_post in raw_posts]

print(f"Number of cleaned posts: {len(cleaned_posts)}")


Number of cleaned posts: 110


In [None]:

# Chunk every cleaned post; each entry of chunked_posts is the chunking
# result for one post (indexed with [0] by the embedding cell).
chunking_handler = PostChunkingHandler()
chunked_posts = [chunking_handler.chunk(cleaned_post) for cleaned_post in cleaned_posts]

print(f"Number of chunked posts: {len(chunked_posts)}")


In [None]:

# Embed one item per post with the embedding handler.
# NOTE(review): only element [0] of each chunking result is embedded; any
# further elements are not used here -- confirm this is intended.
embedding_handler = PostEmbeddingDataHandler()
embedded_posts:list[PostEmbeddedChunkModel] = [
    embedding_handler.embedd(chunked_post[0]) for chunked_post in chunked_posts
]

print(f"Number of embedded posts: {len(embedded_posts)}")


In [None]:

# Upload all embedded posts to the vector collection in a single batch.
qdrant_db = QdrantDatabaseConnector()
payloads = [post.to_payload() for post in embedded_posts]
if not payloads:
    # With no payloads, zip(*payloads) yields nothing and the three-way
    # unpacking below would fail with an unhelpful "not enough values to
    # unpack" ValueError; fail with a clear message instead.
    raise ValueError("No embedded posts to upload; run the embedding cell first.")
# Each payload is unpacked as an (id, vector, metadata) triple. Transpose them
# into three parallel lists: zip() produces tuples, and the Batch fields are
# list-typed (see the qdrant-client Batch model).
ids, vectors, metadatas = (list(column) for column in zip(*payloads))

collection_name = f"TEST_VARUN_{settings.EMBEDDING_MODEL_ID.replace('/','_')}"
qdrant_db.create_vector_collection(collection_name)
qdrant_db.write_data(collection_name, points=Batch(ids=ids, vectors=vectors, payloads=metadatas))
print(f"Chala bhai daaldi {len(embedded_posts)} posts qdrant mei")

In [4]:
from dotenv import load_dotenv
from rag_tests.llm.chain import GeneralQueryChain
from rag_tests.rag.retreiver import VectorRetriever
from langchain_openai import ChatOpenAI
from rag_tests.config import settings
from rag_tests.llm.prompt_templates import GetToneUnderstandingPromptTemplate, ExtractToneFromHitsPromptTemplate


In [28]:
# Load environment variables, then retrieve the posts most similar to the
# drafting request for this author.
load_dotenv()
print("Starting the process")

author_id = "https://www.linkedin.com/in/varunaggarwal2"
# The query embeds the author id so it is written only once in this cell.
query = f"""
    My author_id is {author_id}
    i want to draft a LinkedIn post discussing what are common mistakes that entrepreneurs make??.
    """
retriever = VectorRetriever(query=query)
hits = retriever.retrieve_top_k(k=6, to_expand_to_n_queries=5, collection_name=collection_name)
# Format the message once and send it to both the logger and stdout.
hits_message = f"Retrieved hits: {hits}"
logger.info(hits_message)
print(hits_message)



Starting the process


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are an AI language model assistant. You are being used a topic expanding service, in a 
    Rag system. The user query would describe a topic, user wants to generate a linkedin post about. 
    based on the user query, you need to generate 5 topics as "english statements" aka queries to make a search in a vector database.
    These queries should be similar to the user query.
    
    Your task is to generate 5
    different versions of the given user's topic to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user topics, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search in a vector database.
    The vector database contains metadata of posts, along with their embeddings. The embeddings are based on the posts' content.
    Provide these queries to search in the vector databas

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
[A


[A[A[A

[A[A

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
[A


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]

Retrieved hits: [ScoredPoint(id='a4565651-7165-2189-a1b2-180aabb1e10d', version=0, score=0.48996496, payload={'id': 'e88f6f1a-8894-4bcc-89d3-3c057d2d3d62', 'platform': 'linkedin', 'owner_id': 'https://www.linkedin.com/in/varunaggarwal2', 'content': 'at change engine, we talk to several ai startups. < doublelinebreak / > what do i look for in startups when making an investing decision? < doublelinebreak / > 1. problem understanding : do the founders truly understand the problem? have they done value chain mapping to know how the process works today, where the inefficiencies are, and why they exist? < doublelinebreak / > 2. product demo : the demo is crucial. is the ui / ux seamless? how well does the ai solve the problem? is it a painkiller or just a nice - to - have? i love hearing user stories - - successes and failures. < doublelinebreak / > 3. team : i prefer a balance of tech and business co - founders. how driven are they? how well do they listen to customers and', 'original_conte




In [29]:
# Display each hit's content under a "Post <index>" label.
[{f"Post {index}": hit.payload["content"]} for index, hit in enumerate(hits)]

[{'Post 0': 'at change engine, we talk to several ai startups. < doublelinebreak / > what do i look for in startups when making an investing decision? < doublelinebreak / > 1. problem understanding : do the founders truly understand the problem? have they done value chain mapping to know how the process works today, where the inefficiencies are, and why they exist? < doublelinebreak / > 2. product demo : the demo is crucial. is the ui / ux seamless? how well does the ai solve the problem? is it a painkiller or just a nice - to - have? i love hearing user stories - - successes and failures. < doublelinebreak / > 3. team : i prefer a balance of tech and business co - founders. how driven are they? how well do they listen to customers and'},
 {'Post 1': 'the fight for the ace fellowship is heating up. we have 200 + applications from aspiring entrepreneurs who wants to participate in our 6 month program to build a < newline / > hashtag < newline / > # < newline / > jobtech < newline / > st

In [30]:
# Extract the author's tone/style rules from all retrieved hits.
model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, temperature=0)
get_tone_understanding_prompt_template = ExtractToneFromHitsPromptTemplate()
prompt_template = get_tone_understanding_prompt_template.create_template()
chain = GeneralQueryChain().get_chain(
    llm=model, output_key="tone_understanding", template=prompt_template
)
# One single-entry dict per hit, keyed "Post <index>", holding the hit's content.
labelled_posts = [{f"Post {index}": hit.payload['content']} for index, hit in enumerate(hits)]
# Reuse author_id from the retrieval cell instead of repeating the URL literal,
# so the retrieval query and the tone prompt cannot drift apart.
response = chain.invoke({"author_id": author_id, "posts": labelled_posts})
# The result is read under the output_key configured on the chain above.
logger.info(f"Get Tone Understanding.generate_response() result: \033[93m{response['tone_understanding']}\033[0m")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a content assistant specialized in analyzing text to extract tone, style, and formatting guidelines. 
    Your goal is to help identify consistent patterns and rules in a user’s writing style by examining their past content.
      Based on these insights, generate specific rules for style, tone, vocabulary, sentence structure, and formatting to ensure 
      future content aligns with the user's established voice.
      Analyze the following posts and derive a set of rules for generating content that aligns with my writing style and tone. Include guidelines for:  
        1.Style and Tone (formal, conversational, persuasive).
        2.Vocabulary Preferences (preferred words/phrases, jargon to avoid).
        3.Sentence Structure (active vs. passive voice, sentence length variation).
        4.Platform-Specific Formatting (LinkedIn, blog posts, emails).
        5.Inclusivity & Accessibility (gender

In [31]:
# Show the tone/style rules returned by the tone-extraction chain.
tone_understanding_text = response['tone_understanding']
print(tone_understanding_text)
# print("{\n\"Style and Tone\": {\n    \"Description\": \"The user's writing style is formal and informative, with a conversational tone. The user often shares insights, advice, and personal experiences related to startups, entrepreneurship, and investment. The tone is generally positive and encouraging.\",\n    \"Examples\": [\"building deeptech startups is challenging!\", \"what do i look for in startups when making an investing decision?\"]\n},\n\"Vocabulary Preferences\": {\n    \"Description\": \"The user prefers using industry-specific jargon related to startups, entrepreneurship, and technology. The user also uses hashtags frequently to highlight key topics.\",\n    \"Examples\": [\"deeptech\", \"startups\", \"entrepreneur\", \"investing decision\", \"problem understanding\", \"product demo\", \"team\", \"#deeptech\", \"#startups\", \"#entrepreneur\", \"#success\", \"#hacks\"]\n},\n\"Sentence Structure\": {\n    \"Description\": \"The user prefers using active voice and varies sentence length. The user often uses questions to engage the audience and lists to break down complex ideas.\",\n    \"Examples\": [\"we looked into 10 deeptech startups to understand what they did right and where they went wrong.\", \"what do i look for in startups when making an investing decision?\"]\n},\n\"Platform-Specific Formatting\": {\n    \"Description\": \"The user's posts are formatted for social media platforms like LinkedIn. The user uses line breaks and double line breaks to separate ideas and paragraphs. 
The user also uses hashtags to highlight key topics and tag relevant people or organizations.\",\n    \"Examples\": [\"< newline / >\", \"< doublelinebreak / >\", \"#deeptech\", \"#startups\", \"#entrepreneur\", \"#success\", \"#hacks\"]\n},\n\"Inclusivity & Accessibility\": {\n    \"Description\": \"The user uses gender-neutral language and avoids complex jargon to ensure readability and inclusivity.\",\n    \"Examples\": [\"if you are a deeptech entrepreneur, a wannabe, or work at one, these are the hacks you need to apply every day to the business.\"]\n},\n\"Semantic Rules\": {\n    \"Description\": \"The user adjusts the tone based on the context and avoids repetition by using synonyms or rephrasing sentences. The user also uses metaphors to explain complex ideas.\",\n    \"Examples\": [\"standing on the shoulders of others, make you see further, and ahead of time.\", \"is it a painkiller or just a nice - to - have?\"]\n},\n\"Consistency\": {\n    \"Description\": \"The user consistently uses lowercase letters, even when referring to oneself as 'i'. The user also consistently uses spaces around hyphens and slashes.\",\n    \"Examples\": [\"i love hearing user stories - - successes and failures.\", \"< newline / >\", \"< doublelinebreak / >\"]\n}\n}"
# )

{
"Style_and_Tone": {
    "Description": "The user's writing style is formal and informative, with a focus on sharing insights and knowledge. The tone is authoritative and confident, often providing advice or sharing personal experiences.",
    "Examples": ["at change engine, we talk to several ai startups.", "the fight for the ace fellowship is heating up."]
},
"Vocabulary_Preferences": {
    "Description": "The user prefers professional and industry-specific vocabulary, often related to startups, entrepreneurship, and technology. Jargon and acronyms are frequently used, but they are generally well-explained or commonly understood within the industry.",
    "Examples": ["problem understanding", "product demo", "business model", "ace fellowship", "deeptech startups"]
},
"Sentence_Structure": {
    "Description": "The user primarily uses active voice and varies sentence length to maintain reader interest. Lists and bullet points are often used to break down complex ideas or processes.",

In [32]:

# Rerank the retrieved hits, keep at most five, and list them with their rank.
reranked_hits = retriever.rerank(hits=hits, keep_top_k=5)
for position, reranked_hit in enumerate(reranked_hits):
    print(f"{position}: {reranked_hit}")
# reranked_hits = ["himanshu aggarwal < newline / > and i will talk through the aspiring minds story today at 530pm! < newline / > we will bring you insights using change engine ' s ace framework of building successful startups. < newline / > a. identifying the problem to solve < newline / > b. finding the right business model / markets < newline / > c. developing escape velocity < newline / > do not miss, register here : < newline / > [ url ] out ace here : < newline / > [ url ]", "join < newline / > manish gupta < newline / > and me today evening for an insightful conversation on ' building an ai startup in times of llm '. whether you are a seasoned entrepreneur or just starting your journey, this session is designed to be valuable for both sets of individuals. do not miss.....", 'at change engine, we talk to several ai startups. < doublelinebreak / > what do i look for in startups when making an investing decision? < doublelinebreak / > 1. problem understanding : do the founders truly understand the problem? have they done value chain mapping to know how the process works today, where the inefficiencies are, and why they exist? < doublelinebreak / > 2. product demo : the demo is crucial. is the ui / ux seamless? how well does the ai solve the problem? is it a painkiller or just a', 'the fight for the ace fellowship is heating up. we have 200 + applications from aspiring entrepreneurs who wants to participate in our 6 month program to build a < newline / > hashtag < newline / > # < newline / > jobtech < newline / > startup. < newline / > in parallel, we have started nurturing these entrepreneurs to fine tune their ideas to build scalable and impactful startups. < newline / > mukul singhal < newline / > from < newline / > pravega ventures < newline / > and i will discuss this thursday, " what business models attract', 'congratulations to the inaugural ace startup cohort 2023! 
< newline / > we are thrilled to announce that we found these kickass entrepreneurs and will be joining their exciting journey to success. with our passion for innovation and years of entrepreneurial experience, we will nurture their thoughts and collectively brew some groundbreaking ideas that can become thriving organizations over the next 6 months. < newline / > watch this space. we will keep you updated as these new founders make progress.']
# model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, temperature=0)   
# get_tone_understanding_prompt_template = GetToneUnderstandingPromptTemplate()
# prompt_template = get_tone_understanding_prompt_template.create_template()
# chain = GeneralQueryChain().get_chain(
#     llm=model, output_key="tone_understanding", template=prompt_template
# )
# response = chain.invoke({"author_id":"https://www.linkedin.com/in/varunaggarwal2", "posts":reranked_hits})
# logger.info(f"Get Tone Understanding.generate_response() result: \033[93m{response['tone_understanding']}\033[0m")





[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are an AI language model assistant. Your task is to rerank passages related to a 
    query based on their relevance. 
    The most relevant passages should be put at the beginning. 
    You should only pick at max 5 passages.
    The provided and reranked documents are separated by '
#next-document#
'.
    
    The following are passages related to this query: 
    My author_id is https://www.linkedin.com/in/varunaggarwal2
    i want to draft a LinkedIn post discussing what are common mistakes that entrepreneurs make??.
    .
    
    Passages: 
    at change engine, we talk to several ai startups. < doublelinebreak / > what do i look for in startups when making an investing decision? < doublelinebreak / > 1. problem understanding : do the founders truly understand the problem? have they done value chain mapping to know how the process works today, where the inefficiencies are, and why they exist

In [33]:

# Build the drafting prompt from the extracted tone rules and ask the model for a post.
tone_rules = response['tone_understanding']
prompt = f"""
You are a helpful assistant that can draft a LinkedIn post.
USING ONLY THE FOLLOWING tone, patterns and voice of the author {author_id} extracted from similar posts to the current topic, draft me a LinkedIn post discussing what are common mistakes that entrepreneurs make?.

The tone, patterns and voice of the author is: {tone_rules}
"""
# NOTE(review): this rebinds `response` from the tone-extraction result to the
# model's reply, so re-running this cell without first re-running the
# tone-extraction cell will presumably fail on response['tone_understanding'].
response = model.invoke(prompt)
logger.info(f"Response: \033[93m{response.content}\033[0m")



In [34]:
# Show the drafted post text returned by the model.
drafted_post_text = response.content
print(drafted_post_text)

In the dynamic world of entrepreneurship, we often see common mistakes that can hinder the growth and success of startups. Here are a few that I've observed:

1. Lack of Market Understanding: Many entrepreneurs dive into their ventures without a comprehensive understanding of the market. This can lead to a misalignment between the product or service and the actual needs of the customers.

2. Inadequate Business Model: A robust business model is the backbone of any successful startup. Entrepreneurs often underestimate its importance, leading to unsustainable growth or even failure.

3. Ignoring Customer Feedback: How well an entrepreneur listens to their customers can make or break their startup. Ignoring customer feedback can result in a product that doesn't meet market needs.

4. Poor Financial Management: Startups need to have a clear financial plan. Mismanagement of funds can lead to a quick burnout.

5. Not Adapting to Change: The business landscape is constantly evolving. Entrepre