In [None]:
from kinconnect_api.config import MONGO_CONNECTION_STRING, load_dotenv
import os
from pymongo import MongoClient
import os, pymongo, pprint
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymongo import MongoClient

# Names of the database, collection and vector index this notebook works against.
db_name = "kinconnect"
collection_name = "app"
vector_search_index = "vector_index"

# Open a client on the Atlas cluster and resolve the target collection handle.
client = MongoClient(MONGO_CONNECTION_STRING)
atlas_collection = client[db_name][collection_name]

In [None]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi


uri = MONGO_CONNECTION_STRING

# Rebuild the client, this time pinned to server API version '1'.
client = MongoClient(uri, server_api=ServerApi('1'))

# A successful `ping` admin command confirms the connection works; any
# failure is printed rather than raised so the notebook keeps running.
try:
    client.admin.command('ping')
except Exception as e:
    print(e)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

In [None]:
# Load the PDF at the given URL; `data` holds whatever PyPDFLoader.load() returns.
loader = PyPDFLoader("https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP")
data = loader.load()

# --- Markdown header splitting demo -------------------------------------
# NOTE(review): everything down to `md_header_splits` operates on the
# hard-coded `markdown_document` string, not on the PDF loaded above, and
# none of these names are used again in this cell.
from langchain_text_splitters import MarkdownHeaderTextSplitter
markdown_document = "# Foo\n\n    ## Bar\n\nHi this is Jim\n\nHi this is Joe\n\n ### Boo \n\n Hi this is Lance \n\n ## Baz\n\n Hi this is Molly"

# (header marker, label) pairs handed to the splitter.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)
# Bare expression that is not the last line of the cell, so it is not displayed.
md_header_splits

# --- Split the PDF into chunks ------------------------------------------
# chunk_size=200 / chunk_overlap=20 are passed straight to the splitter.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
docs = text_splitter.split_documents(data)
# Show the first chunk as the cell output.
docs[0]

In [None]:
from langchain_fireworks import FireworksEmbeddings
# load_dotenv comes from kinconnect_api.config; presumably it loads the
# Fireworks credentials from a .env file -- TODO confirm.
load_dotenv()

# Embedding client used for every chunk in `docs`.
embedding_model = FireworksEmbeddings(model="nomic-ai/nomic-embed-text-v1.5")

# Build the vector store from the chunks, targeting the Atlas collection
# and index name configured earlier in the notebook.
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=embedding_model,
    collection=atlas_collection,
    index_name=vector_search_index,
)

In [None]:
from kinconnect_api.config import load_dotenv
load_dotenv()
from typing import List
from langchain_core.messages import HumanMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_fireworks import ChatFireworks

def _read_prompt(path):
    """Return the full text of the prompt template stored at ``path``."""
    # A context manager closes the handle deterministically; the previous
    # open(...).read() form left the file open until garbage collection.
    # The encoding is explicit so the result does not depend on the locale.
    with open(path, "r", encoding="utf-8") as prompt_file:
        return prompt_file.read()


# NOTE(review): these are absolute paths on one developer's machine, so the
# cell fails anywhere else -- TODO resolve them relative to the package.
# NOTE(review): "proile" in the second filename looks like a typo, but it is
# kept because it must match the file on disk -- confirm before renaming.
portfolio_prompt_string = _read_prompt("/Users/nehiljain/code/kinconnect/kinconnect_api/prompts/prompt_extract_portfolio.txt")
profile_prompt_string = _read_prompt("/Users/nehiljain/code/kinconnect/kinconnect_api/prompts/prompt_extract_proile_attributes.txt")
career_history_prompt_string = _read_prompt("/Users/nehiljain/code/kinconnect/kinconnect_api/prompts/prompt_extract_career_firefunc.txt")

# Fireworks model identifiers passed to call_api by the extract_* helpers.
firefunc_model = "accounts/fireworks/models/firefunction-v2"
mistral_model = "accounts/fireworks/models/mistral-7b-instruct-v3"

# Structured-output schema for a participant's profile attributes; this is the
# class extract_profile_details passes to call_api.
# Documented with `#` comments rather than a class docstring on purpose: the
# docstring presumably ends up in the schema description sent to the model,
# which would change runtime behavior -- TODO confirm before adding one.
class ProfileModel(BaseModel):
    name: str = Field(..., title="Name of the person")
    # Only field with a None default; the other three use `...` (required).
    honors: list[str] = Field(None, title="Honors, Awards and recognition they have received in life")
    interests: list[str] = Field(..., title="Interests and current focus of theirs the work or the event")
    skills: list[str] = Field(..., title="Skills they have")


# One role in a career history. All five fields are required strings; the
# dates are plain `str`, so no date parsing or validation happens here.
# Used as the element type of CareerHistory.history.
class CareerEntry(BaseModel):
    company: str = Field(..., description="Company they worked at")
    title: str = Field(..., description="Title of the role they held")
    description: str = Field(..., description="Description of the role they held")
    start_date: str = Field(..., description="Start date of the role")
    end_date: str = Field(..., description="End date of the role")

# Wrapper around a required list of CareerEntry items; this is the class
# extract_career_history passes to call_api.
class CareerHistory(BaseModel):
    history: List[CareerEntry] = Field(..., description="All the companies you have been at as part of your career")


# A single portfolio project: required title and description strings.
# Used as the element type of Portfolio.projects.
class ProjectEntry(BaseModel):
    title: str = Field(..., title="Title of the project")
    description: str = Field(..., title="Description of the project")

# Wrapper around a required list of ProjectEntry items; this is the class
# extract_portfolio passes to call_api.
class Portfolio(BaseModel):
    projects: List[ProjectEntry] = Field(..., description="All the projects you have worked on")

def call_api(prompt, structed_class, model):
    """Send ``prompt`` to a Fireworks chat model and request structured output.

    Args:
        prompt: Text sent as the content of a single HumanMessage.
        structed_class: Class handed to ``with_structured_output``.
        model: Fireworks model identifier.

    Returns:
        ``{"output": <result of .dict() on the response>, "error": None}`` on
        success, or ``{"output": None, "error": <exception>}`` when anything
        raised while invoking the model or converting its response. The
        exception is returned, not re-raised.
    """
    structured_llm = ChatFireworks(model=model).with_structured_output(structed_class)

    try:
        parsed = structured_llm.invoke([HumanMessage(content=prompt)])
        result = {"output": parsed.dict(), "error": None}
    except Exception as exc:
        result = {"output": None, "error": exc}
    return result


# print(call_api("I am a google engineer with 2 years of experience", Portfolio, mistral_model))

In [None]:
question_answer_pair = {"Timestamp":"01/07/2024 10:51:13","What is your name? ":"Alex Chi","What are your interests? (ie technical topic, coding language, business problem).":"I'm passionate about AI ethics, natural language processing, and creating accessible technology. I'm proficient in Python, TensorFlow, and have experience with large language models.","If you have a project idea, describe your idea . Please include whether what sector it is in, and what business problem it is solving and for whom. If you don’t have a project, skip this question.":"I have an idea for an AI-powered language learning assistant. It's in the edtech sector, solving the problem of personalized language acquisition for adult learners. The project would use GPT-based models to create interactive, context-aware conversations tailored to each user's proficiency level and learning style.","What is your strongest functional role (such as developer, UX, business, product)? Please share one or two things about your experience in role your experience, for example a success, companies you worked for, how many years experience, a challenging project, etc.":"My strongest role is as an AI researcher and developer. I have 5 years of experience, including 3 years at Google AI, where I contributed to the development of BERT. One of my biggest successes was implementing a bias detection and mitigation system for large language models, which is now used across multiple Google products.","Describe your career history? Think of it like a snapshot of your LinkedIn that is relevant for your teammates at this hackathon.":"- AI Research Scientist at OpenAI (Current, 2 years)\n- Senior AI Developer at Google AI (3 years)\n- Machine Learning Engineer at Coursera (2 years)\n- Ph.D. in Computer Science, specializing in NLP, from Stanford University","What are some of the projects you are proud of? 
Share links and description of what you did and why you are so proud of them":"Project 1: Bias Mitigation in LLMs\nDescription: Developed a system to detect and mitigate biases in large language models. This project involved creating a comprehensive framework for identifying various types of biases (gender, racial, socioeconomic) and implementing techniques to reduce these biases during model training and inference.\nLink: github.com/alexchi/bias-mitigation-llm\n\nProject 2: Multilingual NLP Toolkit\nDescription: Created an open-source toolkit for multilingual natural language processing tasks. This project supports over 100 languages and includes modules for tokenization, named entity recognition, sentiment analysis, and machine translation.\nLink: github.com/alexchi/multilingual-nlp-toolkit\n\nProject 3: AI-Powered Code Explanation Tool\nDescription: Developed a tool that uses GPT-3 to generate human-readable explanations of complex code snippets. This project aims to make programming more accessible to beginners and non-technical stakeholders.\nLink: github.com/alexchi/code-explainer-ai\n","Are you interested in meeting people with a specific skill set (either one that you lack or one that you already have but want to clone yourself to speed up building). What is the skills sets that you are looking to meet?":"I'm particularly interested in meeting UX designers and product managers who have experience in educational technology. I'd also love to connect with frontend developers who can create intuitive, accessible interfaces for AI-powered applications."}

In [None]:
def convert_question_answer_pair_to_markdown(question_answer_pair):
    """Render a question -> answer mapping as one Markdown string.

    Each pair becomes ``"## <question>\\n<answer>\\n\\n"``, emitted in the
    mapping's iteration order. An empty mapping yields an empty string.
    """
    sections = [
        f"## {question}\n{answer}\n\n"
        for question, answer in question_answer_pair.items()
    ]
    return "".join(sections)

def extract_career_history(question_answer_pair_markdown):
    """Extract a CareerHistory from a Markdown form submission.

    Substitutes the Markdown for the ``{{bio}}`` placeholder in the career
    history prompt and returns call_api's result dict for the firefunction
    model.
    """
    filled_prompt = career_history_prompt_string.replace(
        "{{bio}}", question_answer_pair_markdown
    )
    return call_api(filled_prompt, CareerHistory, firefunc_model)


def extract_profile_details(question_answer_pair_markdown):
    """Extract a ProfileModel from a Markdown form submission.

    Substitutes the Markdown for the ``{{bio}}`` placeholder in the profile
    prompt and returns call_api's result dict for the Mistral model.
    """
    filled_prompt = profile_prompt_string.replace(
        "{{bio}}", question_answer_pair_markdown
    )
    return call_api(filled_prompt, ProfileModel, mistral_model)

def extract_portfolio(question_answer_pair_markdown):
    """Extract a Portfolio from a Markdown form submission.

    Substitutes the Markdown for the ``{{bio}}`` placeholder in the portfolio
    prompt and returns call_api's result dict for the Mistral model.
    """
    filled_prompt = portfolio_prompt_string.replace(
        "{{bio}}", question_answer_pair_markdown
    )
    return call_api(filled_prompt, Portfolio, mistral_model)

# Run the three extractions on the sample submission.
markdown_profile = convert_question_answer_pair_to_markdown(question_answer_pair)
career_history = extract_career_history(markdown_profile)
profile_details = extract_profile_details(markdown_profile)
portfolio = extract_portfolio(markdown_profile)

# call_api reports failures as {"output": None, "error": exc}. Without this
# check a failed profile extraction surfaced as an opaque
# "'NoneType' object does not support item assignment" a few lines later.
if profile_details['output'] is None:
    raise RuntimeError(
        f"Profile extraction failed: {profile_details['error']!r}"
    ) from profile_details['error']

# The other two results are stored as-is (possibly None), but a failure is
# reported so a missing section is not mistaken for an empty one.
for label, result in (("career history", career_history), ("portfolio", portfolio)):
    if result['error'] is not None:
        print(f"Warning: {label} extraction failed: {result['error']!r}")

# Merge everything into one profile dict, keeping the raw submission too.
profile = profile_details['output']
profile['career_history'] = career_history['output']
profile['portfolio'] = portfolio['output']
profile['form_submission'] = markdown_profile
print(profile)

In [1]:
import logging
# Set the root logger's level to INFO.
logging.getLogger().setLevel(logging.INFO)

In [2]:
import openai

import os

# SECURITY: this cell previously embedded a literal Fireworks API key. That
# key has been committed with the notebook and should be revoked. The key is
# now read from the same environment variable the chat-completions request
# later in this notebook uses.
# NOTE(review): this rebinds `client`, which earlier cells use for the
# MongoDB connection.
client = openai.OpenAI(
    base_url="https://api.fireworks.ai/inference/v1",
    api_key=os.environ["FIREWORKS_API_KEY"],
)

# Request an embedding for one sample document string.
response = client.embeddings.create(
    model="nomic-ai/nomic-embed-text-v1.5",
    input="search_document: Spiderman was a particularly entertaining movie with...",
)

print(response)

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.03369140625, 0.03814697265625, -0.130859375, 0.022735595703125, 0.03460693359375, 0.0712890625, -0.011383056640625, -0.0523681640625, 0.0016880035400390625, -0.002918243408203125, -0.027923583984375, 0.033447265625, 0.04351806640625, 0.0233154296875, -0.039276123046875, -0.115478515625, -0.0016450881958007812, -0.075439453125, 0.022125244140625, -0.0017156600952148438, -0.0771484375, -0.06463623046875, 0.06201171875, -0.03533935546875, 0.00534820556640625, 0.034149169921875, -0.06451416015625, -0.0272674560546875, -0.00991058349609375, 0.03228759765625, 0.0113983154296875, -0.017181396484375, -0.00899505615234375, -0.05438232421875, 0.0121612548828125, -0.047607421875, 0.0171356201171875, 0.041778564453125, 0.026580810546875, 0.0211944580078125, 0.0186767578125, 0.02899169921875, -0.0253448486328125, -0.0198516845703125, 0.08709716796875, -0.02374267578125, 0.04827880859375, 0.025360107421875, 0.0186309814453125, -0.046539306640625,

In [None]:
# NOTE(review): `res_query` is not assigned in any cell visible in this
# notebook, so this raises NameError on a fresh kernel -- TODO define it or
# remove this cell.
len(res_query)

In [None]:
synthetic_data_prompt = '''
Generate a new profile by answering the questions. The form is a set of questions that a person (fictitious profile) answers before going to a hackathon. The profile is a complex data object. Use the examples below as inspiration for the type of answers. Each example is a concatenated string of question and answer in markdown format.

The goal is to create a good representation of a participant in the hackathon in Silicon Valley, US.

<form_questions>
1. **What is your name?** 
2. **What are your interests?  (ie technical topic, coding language, business problem).**
3. **If you have a project idea, describe your idea . Please include whether what sector it is in, and what business problem it is solving and for whom. If you don’t have a project, skip this question.**
4. **If you have a project idea, describe your idea . Please include whether what sector it is in, and what business problem it is solving and for whom. If you don’t have a project, skip this question.**
5. **What is your strongest functional role (such as developer, UX, business, product)? Please share one or two things about your experience in role your experience, for example a success, companies you worked for, how many years experience, a challenging project, etc.**
6. **Describe your career history? Think of it like a snapshot of your LinkedIn that is relevant for your teammates at this hackathon.**
7. **What are some of the projects you are proud of? Share links and description of what you did and why you are so proud of them**
8. **Are you interested in meeting people with a specific skill set (either one that you lack or one that you already have but want to clone yourself to speed up building). What is the skills sets that you are looking to meet?**
</form_questions>

<examples>
1. "## What is your name? : \n Chloe Wong\n\n## What are your interests?  (ie technical topic, coding language, business problem).: \n I'm interested in Rags and AI LLMs\n\n## If you have a project idea, describe your idea . Please include whether what sector it is in, and what business problem it is solving and for whom. If you don’t have a project, skip this question.: \n nan\n\n## What is your strongest functional role (such as developer, UX, business, product)? Please share one or two things about your experience in role your experience, for example a success, companies you worked for, how many years experience, a challenging project, etc.: \n Back end developer, 10 years, build and deployed backend databases for Netflix including adding AI functionality to Netflix recommendation engines. \n\n## Career path from Linkedin: \n Senior Software EngineerSenior Software Engineer, Netflix, Netflix , Jun 2018 - Present · 6 yrs 1 mo, PayPal 3 yrs 11 mos3 yrs 11 mos\n\n## Are you interested in meeting people with a specific skill set (either one that you lack or one that you already have but want to clone yourself to speed up building). What is the skills sets that you are looking to meet?: \n Product, frontend, \n\n## Past Projects Portfolio: \n ### Project 1: **Netflix Recommendation Engine Enhancement**\n\n**Title:** AI-Powered Recommendation Engine for Netflix\n\n**Description:** Led the development and deployment of an advanced recommendation engine for Netflix. This project aimed to enhance the accuracy and personalization of content recommendations for users by integrating machine learning algorithms. The system utilized user behavior data, viewing history, and ratings to predict and suggest content that matched user preferences. 
The project included a real-time processing pipeline to ensure recommendations were updated dynamically as user interactions occurred.\n\n**Skills:** \n- Python\n- Machine Learning\n- TensorFlow/PyTorch\n- Apache Spark\n- AWS (S3, EC2, Lambda)\n- SQL\n- Big Data (Hadoop)\n- Data Engineering\n- API Development\n- Docker/Kubernetes\n\n### Project 2: **PayPal Fraud Detection System**\n\n**Title:** Real-Time Fraud Detection System for PayPal\n\n**Description:** Developed and deployed a robust fraud detection system for PayPal. This project involved creating a machine learning-based system to detect and prevent fraudulent transactions in real-time. The system analyzed transaction patterns, user behavior, and historical fraud data to identify suspicious activities. By implementing advanced algorithms and a scalable architecture, the system significantly reduced the incidence of fraud and enhanced the security of PayPal’s platform.\n\n**Skills:** \n- Java\n- Python\n- Machine Learning\n- Apache Kafka\n- NoSQL Databases (MongoDB, Cassandra)\n- SQL\n- Data Engineering\n- Real-Time Processing\n- Microservices Architecture\n- AWS (S3, EC2, Lambda)\n- Docker/Kubernetes\n\n### Project 3: **Netflix Data Lake**\n\n**Title:** Scalable Data Lake Infrastructure for Netflix\n\n**Description:** Designed and implemented a scalable data lake infrastructure for Netflix to store and manage vast amounts of data efficiently. The project involved setting up a distributed data storage system that could handle petabytes of structured and unstructured data. The data lake facilitated efficient data ingestion, storage, processing, and retrieval for various analytics and machine learning applications. This infrastructure played a crucial role in enabling data-driven decision-making across Netflix.\n\n**Skills:** \n- Java\n- Python\n- Apache Hadoop\n- Apache Spark\n- AWS (S3, EMR)\n- SQL\n- Data Engineering\n- ETL Processes\n- Distributed Systems\n- Docker/Kubernetes"

2. "## What is your name? : \n Rehka Mehta\n\n## What are your interests?  (ie technical topic, coding language, business problem).: \n Fashion \n\n## If you have a project idea, describe your idea . Please include whether what sector it is in, and what business problem it is solving and for whom. If you don’t have a project, skip this question.: \n E-commerce\n\n## What is your strongest functional role (such as developer, UX, business, product)? Please share one or two things about your experience in role your experience, for example a success, companies you worked for, how many years experience, a challenging project, etc.: \n Product manager.  I am an expert in personal recommendation.\n\n## Career path from Linkedin: \n As a product manager at Wayfair, they lead the development of innovative e-commerce solutions to enhance customer experience. With expertise in data-driven decision-making, they drive projects that optimize the online shopping journey.\n\n## Are you interested in meeting people with a specific skill set (either one that you lack or one that you already have but want to clone yourself to speed up building). What is the skills sets that you are looking to meet?: \n Data engineer\n\n## Past Projects Portfolio: \n ### Project 1: **Personalized Recommendation Engine for Wayfair**\n\n**Title:** Personalized Recommendation Engine for Wayfair\n\n**Description:** Led the development of a personalized recommendation engine for Wayfair's e-commerce platform. The project focused on leveraging customer data and advanced machine learning algorithms to provide tailored product recommendations. 
By analyzing user behavior, preferences, and purchase history, the engine delivered highly relevant suggestions, significantly increasing customer engagement and sales.\n\n**Skills:** \n- Machine Learning\n- Data Analysis\n- Product Management\n- Personalization\n\n### Project 2: **Enhanced Product Search and Discovery**\n\n**Title:** Enhanced Product Search and Discovery for Wayfair\n\n**Description:** Spearheaded the enhancement of Wayfair's product search and discovery features to improve the online shopping experience. The project involved optimizing search algorithms, implementing advanced filtering options, and integrating visual search capabilities. These improvements allowed customers to find products more easily and accurately, resulting in higher conversion rates and customer satisfaction.\n\n**Skills:** \n- Search Engine Optimization (SEO)\n- Data-Driven Decision Making\n- User Experience (UX) Design\n- Product Management\n\n### Project 3: **Customer Insights and Analytics Platform**\n\n**Title:** Customer Insights and Analytics Platform for Wayfair\n\n**Description:** Developed a comprehensive customer insights and analytics platform to support data-driven decision-making across Wayfair. The platform aggregated and analyzed customer data, providing actionable insights to inform marketing strategies, product development, and personalized customer experiences. This project enabled the company to better understand customer needs and preferences, driving more effective and targeted initiatives.\n\n**Skills:** \n- Data Analytics\n- Business Intelligence\n- Product Management\n- Customer Experience"
</examples>

Generate a new example, like a profile, as a markdown string of question-answer pairs. Pick from a sample of product managers, engineers, managers, investors. For engineers choose from variety of profiles like startup founding engineers, data scientists, data engineer, platform engineer, frontend engineer, designer, UX etc.

Consider this while creating the profile answers:
1. Foundational Roles should be mix bag of product managers, engineers, managers, investors across various profiles
2. For engineers, choose from a variety of profiles like startup founding engineers, data scientists, data engineers, platform engineers, frontend engineers, designers, UX, etc.
3. For project ideas for hackathon, the idea should be small. The core fundamental of the project related to generative AI, LLMs, diffusion models or applications of AI
4. For meeting people, it should align with other profiles of participants in the hackathon.
5. The career should be made up of real companies that exist in the world. Use innovative famous companies from variety of sectors. Feel free to choose companies from Y combinator and a16z portfolio. Format it like Title, Company Name, Start Date - End Date.
6. Project portfolio can have projects related classical machine learning (regression and classification), blockchain, e-commerce, recsys, search, web, mobile, ar, vr etc
'''

import requests
import json
import os

# Fireworks chat-completions endpoint targeted by the (currently disabled)
# synthetic-profile request below.
url = "https://api.fireworks.ai/inference/v1/chat/completions"

# JSON in, JSON out; the bearer token is read from the environment, so this
# raises KeyError when FIREWORKS_API_KEY is not set.
headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.environ['FIREWORKS_API_KEY']}",
}

# Sampling settings plus a single user message carrying the generation prompt.
payload = {
    "model": "accounts/fireworks/models/mixtral-8x22b-instruct",
    "max_tokens": 8192,
    "top_p": 1,
    "top_k": 40,
    "presence_penalty": 0,
    "frequency_penalty": 0,
    "temperature": 0.6,
    "messages": [
        {"role": "user", "content": synthetic_data_prompt},
    ],
}
# response = requests.request("POST", url, headers=headers, data=json.dumps(payload))


In [None]:

from kinconnect_api.config import load_dotenv
load_dotenv()
from typing import List
from langchain_core.messages import HumanMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_fireworks import ChatFireworks


# Fireworks function-calling model id. The same value is also assigned to this
# name in the earlier profile-extraction cell.
firefunc_model = "accounts/fireworks/models/firefunction-v2"

# One question/answer pair from a form submission; both fields are required
# strings. Used as the element type of FormSubmission.questions.
class QuestionAnswer(BaseModel):
    question: str = Field(..., title="Question asked by the user")
    answer: str = Field(..., title="Answer given by the user")
    

# A whole form submission as a required list of QuestionAnswer items. In this
# notebook it is only referenced by the commented-out synthetic-data parsing
# code.
class FormSubmission(BaseModel):
    questions: List[QuestionAnswer] = Field(..., description="All the questions and answers of a profile")


def call_api(prompt, structed_class, model):
    """Send ``prompt`` to a Fireworks chat model and request structured output.

    A definition with the same name and the same body appears in an earlier
    cell of this notebook; running this cell rebinds ``call_api`` to this copy.

    Args:
        prompt: Text sent as the content of a single HumanMessage.
        structed_class: Class handed to ``with_structured_output``.
        model: Fireworks model identifier.

    Returns:
        ``{"output": <result of .dict() on the response>, "error": None}`` on
        success, or ``{"output": None, "error": <exception>}`` when anything
        raised while invoking the model or converting its response.
    """
    structured_llm = ChatFireworks(model=model).with_structured_output(structed_class)

    try:
        parsed = structured_llm.invoke([HumanMessage(content=prompt)])
        result = {"output": parsed.dict(), "error": None}
    except Exception as exc:
        result = {"output": None, "error": exc}
    return result
# extract_synthetic_qna_prompt = f'''
# Separate the questions and answers into a structured format.

# {response.json()['choices'][0]['message']['content']}
# '''

# qna_parsed = call_api(extract_synthetic_qna_prompt, FormSubmission, firefunc_model)


In [None]:
# synth_data = []

In [None]:
# from tqdm import tqdm
# for i in tqdm(range(0, 20)):
#     synth_form_submission = requests.request("POST", url, headers=headers, data=json.dumps(payload))
#     extract_synthetic_qna_prompt = f'''
#         Separate the questions and answers into a structured format.

#         {synth_form_submission.json()['choices'][0]['message']['content']}
#     '''
#     qna_parsed = call_api(extract_synthetic_qna_prompt, FormSubmission, firefunc_model)
#     synth_data.append(qna_parsed['output'])



In [None]:
# import pickle
# from datetime import datetime
# pickle.dump(synth_data, open(f"synth_data_{datetime.now().strftime('%Y%m%d%H%M%S')}.pkl", "wb"))

In [None]:
import pickle
# Load the previously generated synthetic submissions from the working
# directory. The context manager closes the file handle, which the bare
# open(...) call did not.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open("synth_data_20240701164925.pkl", "rb") as synth_data_file:
    synth_data = pickle.load(synth_data_file)

In [None]:
import pandas as pd
from thefuzz import process

def get_form_question(question, ideal_questions):
    """Return the entry of ``ideal_questions`` that best fuzzy-matches ``question``.

    Uses ``thefuzz.process.extractOne`` and discards the score it returns
    alongside the match.
    """
    closest_question, _match_score = process.extractOne(question, ideal_questions)
    return closest_question
# Reference form submission. Only its keys are used in this notebook: they
# are the canonical question texts that get_form_question matches against, so
# every key is reproduced exactly (including the trailing space in the name
# question and the typographic apostrophe).
# The personal email address that used to be stored under "Email address" was
# replaced with a placeholder; notebooks should not carry real contact data.
ideal_qna_pair = {
    "Timestamp": "01/07/2024 12:42:44",
    "What is your name? ": "Nehil",
    "What are your interests? (ie technical topic, coding language, business problem).": "test",
    "If you have a project idea, describe your idea . Please include whether what sector it is in, and what business problem it is solving and for whom. If you don’t have a project, skip this question.": "test",
    "What is your strongest functional role (such as developer, UX, business, product)? Please share one or two things about your experience in role your experience, for example a success, companies you worked for, how many years experience, a challenging project, etc.": "test",
    "Describe your career history? Think of it like a snapshot of your LinkedIn that is relevant for your teammates at this hackathon.": "test",
    "What are some of the projects you are proud of? Share links and description of what you did and why you are so proud of them": "test",
    "Are you interested in meeting people with a specific skill set (either one that you lack or one that you already have but want to clone yourself to speed up building). What is the skills sets that you are looking to meet?": "test",
    "What your email? (we will send you matching profiles there)": "",
    "Email address": "user@example.com",
}
# Live view over the keys above.
ideal_questions = ideal_qna_pair.keys()



In [None]:
# One dict per non-None synthetic submission, keyed by the canonical form
# question that best matches each generated question. When two generated
# questions map to the same canonical question, the later answer wins.
qna_pairs = [
    {
        get_form_question(entry['question'], ideal_questions): entry['answer']
        for entry in submission['questions']
    }
    for submission in synth_data
    if submission is not None
]

In [None]:
# Turn every synthetic submission into a merged profile dict.
processed_profiles = []

for qna_pair in qna_pairs:
    print(qna_pair)
    markdown_profile = convert_question_answer_pair_to_markdown(qna_pair)
    career_history = extract_career_history(markdown_profile)
    profile_details = extract_profile_details(markdown_profile)
    portfolio = extract_portfolio(markdown_profile)

    profile = profile_details['output']
    if profile is None:
        # call_api returns output=None when the model call failed. Previously
        # the item assignment below raised TypeError and aborted the whole
        # batch; now the failure is reported and only this submission is
        # skipped.
        print(f"Skipping submission: profile extraction failed: {profile_details['error']!r}")
        continue

    # Career history and portfolio are stored as returned (possibly None).
    profile['career_history'] = career_history['output']
    profile['portfolio'] = portfolio['output']
    profile['form_submission'] = markdown_profile
    print(profile)
    processed_profiles.append(profile)

In [None]:
import random

# Structured-output schema holding a required list of name strings; passed to
# call_api for the two name-generation requests that follow in this cell.
class Names(BaseModel):
    names: List[str] = Field(..., title="Names of a hackathon participants in Silicon Valley. In 2024. It should have diverisity of gender, race, ethnicity in the software engineering world.")
# Ask the model for two batches of names to hand out to the synthetic profiles.
resp = call_api('Give 15 full name. Have the names of people who are Indian, Chinese, Korean, Japanese, Afrian american, White software engineer common names. Only give proper names.', Names, mistral_model)
resp2 = call_api('Give 20 full name. Have the names of people software engineer common names. Only give proper names.', Names, mistral_model)

# call_api signals failure with output=None; fail with the underlying error
# instead of a "'NoneType' object is not subscriptable" on the next line.
for name_response in (resp, resp2):
    if name_response['output'] is None:
        raise RuntimeError(f"Name generation failed: {name_response['error']!r}")

# De-duplicate across both batches. Sorting makes the pool order independent
# of set iteration order, so the draws below depend only on the random state.
unique_names = sorted(set(resp2['output']['names'] + resp['output']['names']))

for processed_profile in processed_profiles:
    print(processed_profile)
    if unique_names:
        original_name = processed_profile['name']
        processed_profile['name'] = unique_names.pop(random.randrange(len(unique_names)))
        # str.replace with an empty search string inserts the replacement
        # between every character, so only rewrite the submission text when
        # there is an original name to look for.
        if original_name:
            processed_profile['form_submission'] = processed_profile['form_submission'].replace(original_name, processed_profile['name'])
    else:
        # The pool ran out: this profile keeps its generated name.
        print("No unused names left; keeping original name.")
    

In [None]:
# Tabulate the processed profiles (one row per profile dict) and display the frame.
process_profiles_df = pd.DataFrame(processed_profiles)
process_profiles_df

In [None]:
from kinconnect_api.config import MONGO_CONNECTION_STRING
from pymongo import MongoClient
# Connect to MongoDB and select the `profiles` collection of the `kinconnect` database.
client = MongoClient(MONGO_CONNECTION_STRING)
db = client['kinconnect']
profiles_collection = db['profiles']
# Upsert each profile keyed on its name: an existing document with the same
# name has its fields overwritten via $set, otherwise a new one is inserted.
# Profiles that share a name therefore end up in a single document.
for profile in processed_profiles:
    profiles_collection.update_one(
        {"name": profile['name']},
        {"$set": profile},
        upsert=True
    )

In [None]:
import pickle
from datetime import datetime
# NOTE(review): absolute path on one developer's machine -- TODO make it
# relative to a configurable data directory.
fp = '/Users/nehiljain/code/kinconnect/kinconnect_api/data/processed/processed_profiles_20240702115056.pkl'
# The context manager closes the file handle, which the bare open(...) call
# did not.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
# NOTE(review): `data` is also the name used earlier for the loaded PDF pages.
with open(fp, "rb") as processed_profiles_file:
    data = pickle.load(processed_profiles_file)
len(data)

In [None]:
# Collect the 'form_submission' value of every loaded profile.
# NOTE(review): the name is misspelled ("sumissions") but a later cell
# iterates over it under this exact spelling, so rename both together.
form_sumissions = [profile['form_submission'] for profile in data]

In [None]:


# Parsing the specific question and its answer
import re

def get_question_answer_from_form_submission(question, form_submission):
    """Pull the answer to ``question`` out of a Markdown form submission.

    The submission is expected to consist of sections shaped like
    ``"## <question>\\n<answer>\\n\\n"``. ``question`` must include the
    leading ``"## "`` exactly as it appears in the text.

    The answer runs from the line after the question up to the next
    second-level heading (a newline followed by ``"## "``) or the end of the
    text. Two problems of the earlier pattern are fixed:

    * it required a following ``"\\n##"``, so the last question of a
      submission was never found;
    * it stopped at any ``"\\n##"``, so an answer containing a ``###``
      sub-heading was cut short.

    Args:
        question: Heading line to look for, matched literally.
        form_submission: Markdown text to search.

    Returns:
        ``{"question": question, "answer": <captured text>}`` when found,
        otherwise ``None`` (after printing a notice).
    """
    # The single newline that directly precedes the next heading or the end of
    # the text is treated as the terminator. For the usual blank-line
    # separator this leaves one trailing "\n" on the answer, as before.
    pattern = re.compile(
        rf"{re.escape(question)}\n(.*?)(?:\n## |\n\Z|\Z)",
        re.DOTALL,
    )
    match = pattern.search(form_submission)

    if match is None:
        print("Question not found or no answer available.")
        return None

    return {
        "question": question,
        "answer": match.group(1),
    }


In [None]:
# Heading lines (including the "## " prefix) of the two intent questions.
PEOPLE_TO_MEET_QUESTION = "## Are you interested in meeting people with a specific skill set (either one that you lack or one that you already have but want to clone yourself to speed up building). What is the skills sets that you are looking to meet?"
PROJECT_IDEA_QUESTION = "## If you have a project idea, describe your idea . Please include whether what sector it is in, and what business problem it is solving and for whom. If you don’t have a project, skip this question."

# For every submission keep the raw text plus the extracted answers to the two
# intent questions. An entry is None when the question was not found.
intent_questions = [
    {
        'form_submission': form_submission,
        'people_to_meet': get_question_answer_from_form_submission(
            question=PEOPLE_TO_MEET_QUESTION,
            form_submission=form_submission,
        ),
        'project_idea': get_question_answer_from_form_submission(
            question=PROJECT_IDEA_QUESTION,
            form_submission=form_submission,
        ),
    }
    for form_submission in form_sumissions
]



In [None]:
import pandas as pd
def _format_intent_context(intent_question):
    """Render the two extracted question/answer pairs as one context string."""
    people_to_meet = intent_question['people_to_meet']
    project_idea = intent_question['project_idea']
    return (
        f"Topic: {people_to_meet['question']}\n"
        f"Request: {people_to_meet['answer']}\n"
        "\n"
        f"Topic: {project_idea['question']}\n"
        f"Request: {project_idea['answer']}"
    )


# One row per submission with a single "context" column.
df = pd.DataFrame(
    [
        {"context": _format_intent_context(intent_question)}
        for intent_question in intent_questions
    ]
)

In [None]:

from kinconnect_api.config import load_dotenv
from langchain_core.messages import HumanMessage
from langchain_fireworks import ChatFireworks
load_dotenv()


def call_api(prompt, model):
    """Send ``prompt`` to a Fireworks chat model and return its text reply.

    This version takes two arguments, unlike the three-argument ``call_api``
    defined in earlier cells; running this cell rebinds the name to this
    definition.

    Args:
        prompt: Text sent as the content of a single HumanMessage.
        model: Fireworks model identifier.

    Returns:
        ``{"output": <response .content>, "error": None}`` on success, or
        ``{"output": None, "error": <exception>}`` when the call raised.
    """
    chat_model = ChatFireworks(model=model)
    try:
        reply = chat_model.invoke([HumanMessage(content=prompt)])
        result = {"output": reply.content, "error": None}
    except Exception as exc:
        result = {"output": None, "error": exc}
    return result


In [None]:
# Fireworks model identifiers. The first two hold the same values as the
# lowercase `firefunc_model` / `mistral_model` names assigned in earlier cells.
FIREFUNC_MODEL: str = "accounts/fireworks/models/firefunction-v2"
MISTRAL_MODEL: str = "accounts/fireworks/models/mistral-7b-instruct-v3"
LLAMA_70B_MODEL: str = 'accounts/fireworks/models/llama-v3-70b-instruct'



In [None]:
query_prompt = """I have a request to match with a database of engineers, designers, project managers etc. This request in the form of answers to two questions. 

Each profile in database includes their skills, projects, career history, and interests. 
The goal is to expand this request into one cohesive ask to ensure it captures all relevant aspects and nuances needed for accurate matching. 

Here is the request in question answer pairs:

# Query Context
"{context}"

Please expand this request to include additional relevant details, such as specific skills, roles, project types, and any other contextual information that would improve the accuracy of matching with the profiles in the database. Consider what someone might need in a hackathon context, including complementary skills, leadership qualities, and relevant experience. This request will be used to do semantic search on the database. So the description and content of the request is very very crucial for accuracy.

Respond in <expanded_request> xml tags. The request should have all the answers and details. No yapping. No other text."""

rewrite_summary_query = '''
Understand the request and rewrite it in a easy to understand manner. Make sure to capture all the details. Write in 2 paragraphs like a formal and professional tone. Describe technologies, skills and details in specific details.

Request: {context}

The response should be just the request, no yapping.
'''

# Fill the expansion prompt template with each row's context (adds a column to `df`).
df['query_prompt'] = df['context'].apply(lambda x: query_prompt.format(context=x))
# Stays '' when the sample below is empty.
query_string = ''
# Take a random 10% sample of rows; the `break` at the end means only the
# first sampled row is ever processed. No random seed is set here.
for index, row in df.sample(frac=0.1).iterrows():
    # Step 1: expand the request with the Mistral model.
    resp = call_api(row['query_prompt'], MISTRAL_MODEL)
    print(row['query_prompt'])
    print("-"*100)
    print(resp['output'])
    print("-"*100)
    # Step 2: have the Llama model rewrite the expanded request.
    # NOTE(review): resp['output'] is None when step 1 failed, in which case
    # the literal text "None" is formatted into the rewrite prompt.
    query_string = call_api(rewrite_summary_query.format(context=resp['output']), LLAMA_70B_MODEL)['output']
    print(query_string)
    break

In [None]:
from kinconnect_api.db import get_vector_store, PROFILES_COLLECTION
import pandas as pd

# Vector store handle shared by get_match_profiles below.
vs = get_vector_store()


def get_match_profiles(query_text):
    """Return the profile documents whose names appear in the top-5 vector hits.

    Args:
        query_text: Text passed to the vector store's similarity search.

    Returns:
        A list of documents from PROFILES_COLLECTION whose 'name' is among the
        names found in the metadata of the five search hits.
    """
    scored_hits = vs.similarity_search_with_score(query_text, k=5)
    matched_names = []
    for hit, _score in scored_hits:
        matched_names.append(hit.metadata['name'])
    name_filter = {'name': {'$in': matched_names}}
    return list(PROFILES_COLLECTION.find(name_filter))

profile_matches = get_match_profiles(query_string)
# Tabulate the matches, leaving out the Mongo id and the raw form text.
df = pd.DataFrame(profile_matches).drop(columns=['_id', 'form_submission'])
df

In [None]:
# Prompt template, filled with str.format(expanded_query=..., matched_profile=...).
# It asks for an explanation of at most 100 words, wrapped in <summary> tags, of why
# the profile matches the query.
summary_explain_prompt_template = '''
You are given a detailed, expanded query and a matched profile of an engineer. Your task is to generate a summary explanation that highlights why the profile is a good match for the query. The summary should include key points about the engineer's skills, experiences, and projects that align with the needs and goals outlined in the query. Ensure that the explanation is clear, concise, and compelling, emphasizing the most relevant aspects of the match.

Expanded Query:

Copy code
{expanded_query}
Matched Profile:

Copy code
{matched_profile}

Your response should be technical, consise and fun to read. It should be no more than 100 words. No yapping.
Enclose your response in <summary> tags.
'''
# Build the explanation prompt for the match at position 3 of profile_matches.
matched_profile_text = profile_matches[3]['form_submission']
summary_explain_prompt = summary_explain_prompt_template.format(
    expanded_query=query_string,
    matched_profile=matched_profile_text,
)

resp = call_api(summary_explain_prompt, LLAMA_70B_MODEL)
resp['output']

In [None]:
import pandas as pd
fp = '/Users/nehiljain/code/kinconnect/kinconnect_api/data/processed/matches_David_Johnson__Fake_Profile__20240702_170146_parquet'
df = pd.read_parquet(fp)
# Show the portfolio and the generated explanation of the first stored match,
# each followed by a divider line.
first_match = df.iloc[0]
for column_name in ('portfolio', 'summary_explanation'):
    print(first_match[column_name])
    print('-'*100)


In [11]:
from kinconnect_api.db import delete_profile_and_chunks
# Example usage: remove one test profile by name.
# NOTE(review): judging by the helper's name and the "delete" operation in this cell's
# log output, this is destructive against the connected database; confirm the target
# before re-running.
profile_name_to_delete = "Nehil Jain (TEST)"
delete_profile_and_chunks(profile_name_to_delete)

2024-07-04 13:03:24,128 - DEBUG - {"message": "Server selection started", "selector": "<function writable_server_selector at 0x12013ff60>", "operation": "delete", "topologyDescription": "<TopologyDescription id: 6687000c97c4f2f1c260e798, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('ac-n01rlqs-shard-00-00.5e8k2is.mongodb.net', 27017) server_type: Unknown, rtt: None>, <ServerDescription ('ac-n01rlqs-shard-00-01.5e8k2is.mongodb.net', 27017) server_type: Unknown, rtt: None>, <ServerDescription ('ac-n01rlqs-shard-00-02.5e8k2is.mongodb.net', 27017) server_type: Unknown, rtt: None>]>", "clientId": {"$oid": "6687000c97c4f2f1c260e798"}}
2024-07-04 13:03:24,130 - DEBUG - {"message": "Waiting for suitable server to become available", "selector": "<function writable_server_selector at 0x12013ff60>", "operation": "delete", "topologyDescription": "<TopologyDescription id: 6687000c97c4f2f1c260e798, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('ac-n01rlqs-sha

In [None]:
import re
from datetime import datetime
from typing import Dict, Any, Optional, List
from langchain_core.messages import HumanMessage
from langchain_fireworks import ChatFireworks
from langchain_core.pydantic_v1 import BaseModel, Field
import pandas as pd
import os
from langchain_fireworks import ChatFireworks
from langchain_core.messages import HumanMessage
from pymongo import MongoClient
from typing import List, Dict
from langchain_fireworks import FireworksEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch


def get_vector_store() -> MongoDBAtlasVectorSearch:
    """Build the Atlas vector-search handle over the profile chunks.

    The connection string is read from the MONGO_CONNECTION_STRING environment
    variable; queries are embedded with the nomic-embed-text-v1.5 Fireworks model.

    Returns:
        MongoDBAtlasVectorSearch: store bound to the "kinconnect.profile_chunks"
        namespace and the "profile_chunks" index.
    """
    mongo_uri = os.getenv('MONGO_CONNECTION_STRING')
    embedder = FireworksEmbeddings(model="nomic-ai/nomic-embed-text-v1.5")
    return MongoDBAtlasVectorSearch.from_connection_string(
        connection_string=mongo_uri,
        namespace="kinconnect.profile_chunks",
        embedding=embedder,
        index_name="profile_chunks",
    )

def get_profile_by_name(name: str) -> Optional[Dict]:
    """Fetch a single profile document by its exact name.

    Args:
        name (str): Value of the profile's 'name' field.

    Returns:
        Optional[Dict]: The matching document from the 'profiles' collection of the
        'kinconnect' database, or None when no document has that name.
    """
    # A client is opened per call, so release it when done instead of leaking the
    # connection; MongoClient closes itself on leaving the `with` block.
    with MongoClient(os.getenv('MONGO_CONNECTION_STRING')) as client:
        return client['kinconnect']['profiles'].find_one({'name': name})



# Constants
# API key read from the environment; os.getenv yields None when the variable is unset.
FIREWORKS_API_KEY: Optional[str] = os.getenv('FIREWORKS_API_KEY')
# Fireworks model identifiers. In this cell FIREFUNC_MODEL is used for the structured
# parsing call, MISTRAL_MODEL for request expansion and LLAMA_70B_MODEL for the
# rewrite and match-explanation calls.
FIREFUNC_MODEL: str = "accounts/fireworks/models/firefunction-v2"
MISTRAL_MODEL: str = "accounts/fireworks/models/mistral-7b-instruct-v3"
LLAMA_70B_MODEL: str = 'accounts/fireworks/models/llama-v3-70b-instruct'

def call_fireworks_api_no_structure(prompt: str, model: str) -> Dict[str, Optional[Any]]:
    """
    Send a single human message to a Fireworks chat model and return its text.

    Args:
        prompt (str): Text sent as the only message.
        model (str): Fireworks model identifier.

    Returns:
        Dict[str, Optional[Any]]: {"output": <reply text>, "error": None} on success,
        or {"output": None, "error": <exception>} when the call raises (the failure
        is also printed).
    """
    chat_model = ChatFireworks(model=model)
    result: Dict[str, Optional[Any]] = {"output": None, "error": None}
    try:
        reply = chat_model.invoke([HumanMessage(content=prompt)])
        result["output"] = reply.content
    except Exception as exc:
        print(f"API call failed: {exc}")
        result["error"] = exc
    return result


def call_fireworks_api_with_structure(prompt: str, structured_class: Any, model: str) -> Dict[str, Any]:
    """Ask a Fireworks chat model for output shaped like ``structured_class``.

    Args:
        prompt (str): Text sent as the only human message.
        structured_class (Any): Schema class handed to ``with_structured_output``.
        model (str): Fireworks model identifier.

    Returns:
        Dict[str, Any]: {"output": <result of .dict() on the reply>, "error": None}
        on success, or {"output": None, "error": <exception>} when the call raises
        (the failure is also printed).
    """
    structured_llm = ChatFireworks(model=model).with_structured_output(structured_class)
    try:
        parsed = structured_llm.invoke([HumanMessage(content=prompt)])
        payload = parsed.dict()
    except Exception as exc:
        print(f"Error calling API: {exc}")
        return {"output": None, "error": exc}
    return {"output": payload, "error": None}

# Prompt template used by generate_expanded_request (filled with
# str.format(context=...)). It asks the model to expand the request and reply inside
# <expanded_request> tags.
QUERY_PROMPT: str = """I have a request to match with a database of engineers, designers, project managers etc. This request in the form of answers to two questions. 

Each profile in database includes their skills, projects, career history, and interests. 
The goal is to expand this request into one cohesive ask to ensure it captures all relevant aspects and nuances needed for accurate matching. 

Here is the request in question answer pairs:

# Query Context
"{context}"

Please expand this request to include additional relevant details, such as specific skills, roles, project types, and any other contextual information that would improve the accuracy of matching with the profiles in the database. Consider what someone might need in a hackathon context, including complementary skills, leadership qualities, and relevant experience. This request will be used to do semantic search on the database. So the description and content of the request is very very crucial for accuracy.

Respond in <expanded_request> xml tags. The request should have all the answers and details. No yapping. No other text."""

# Prompt template used by summarize_expanded_request (filled with
# str.format(context=...)). It asks for a two-paragraph formal rewrite.
REWRITE_SUMMARY_QUERY: str = '''
Understand the request and rewrite it in a easy to understand manner. Make sure to capture all the details. Write in 2 paragraphs like a formal and professional tone. Describe technologies, skills and details in specific details.

Request: {context}
'''


def extract_answer_from_submission(question: str, form_submission: str) -> Optional[Dict[str, str]]:
    """
    Extracts the answer to a specific question from the form submission.

    The answer is the text between the line holding ``question`` and either the
    next "##" heading or the end of the submission. The end-of-text case matters
    because the final question of a submission has no heading after it.

    Args:
        question (str): The question heading to search for, matched literally.
        form_submission (str): The form submission text.

    Returns:
        Optional[Dict[str, str]]: A dictionary containing the question and its answer, or None if not found.
    """
    # The lazy group stops at the first terminator: the newline before the next
    # heading, or the end of the text (optionally preceded by one final newline).
    # For answers followed by a heading this captures what "\n(.*?)\n##" did.
    pattern = re.compile(rf"{re.escape(question)}\n(.*?)(?:\n##|\n?\Z)", re.DOTALL)
    match = pattern.search(form_submission)
    if match is None:
        return None
    return {
        "question": question,
        "answer": match.group(1)
    }

def create_profile_matching_request(profile: Dict[str, Any]) -> Optional[str]:
    """
    Turn a profile's form submission into an expanded, rewritten matching query.

    Two answers are pulled from the submission (the skill sets the person wants to
    meet and their project idea), combined into one context string, expanded by
    generate_expanded_request and then rewritten by summarize_expanded_request.

    Args:
        profile (Dict[str, Any]): Profile holding the markdown under 'form_submission'.

    Returns:
        Optional[str]: The rewritten query, or None when either answer cannot be
        extracted or the expansion step fails.
    """
    skills_question = "## Are you interested in meeting people with a specific skill set (either one that you lack or one that you already have but want to clone yourself to speed up building). What is the skills sets that you are looking to meet?"
    idea_question = "## If you have a project idea, describe your idea . Please include whether what sector it is in, and what business problem it is solving and for whom. If you don’t have a project, skip this question."

    submission_text: str = profile['form_submission']
    people_to_meet: Optional[Dict[str, str]] = extract_answer_from_submission(
        question=skills_question,
        form_submission=submission_text,
    )
    project_idea: Optional[Dict[str, str]] = extract_answer_from_submission(
        question=idea_question,
        form_submission=submission_text,
    )

    # Both answers are required before a request can be built.
    if not (people_to_meet and project_idea):
        print("Failed to extract necessary information from form submission.")
        return None

    context: str = f"""Topic: {people_to_meet['question']}
        Request: {people_to_meet['answer']}

        Topic: {project_idea['question']}
        Request: {project_idea['answer']}"""

    expanded_request: Optional[str] = generate_expanded_request(context)
    if expanded_request is not None:
        return summarize_expanded_request(expanded_request)
    return None

def generate_expanded_request(context: str) -> Optional[str]:
    """
    Expand the raw request context using QUERY_PROMPT and the Mistral model.

    Args:
        context (str): Question/answer context substituted into QUERY_PROMPT.

    Returns:
        Optional[str]: The model's reply, or None when the API helper reports an
        error (a failure message is printed in that case).
    """
    prompt: str = QUERY_PROMPT.format(context=context)
    result: Dict[str, Optional[Any]] = call_fireworks_api_no_structure(prompt, MISTRAL_MODEL)
    if not result['error']:
        return result['output']
    print("Failed to generate expanded request.")
    return None

def summarize_expanded_request(expanded_request: str) -> Optional[str]:
    """
    Rewrite an expanded request using REWRITE_SUMMARY_QUERY and the Llama 70B model.

    Args:
        expanded_request (str): Text substituted into REWRITE_SUMMARY_QUERY.

    Returns:
        Optional[str]: The model's reply, or None when the API helper reports an error.
    """
    prompt: str = REWRITE_SUMMARY_QUERY.format(context=expanded_request)
    result: Dict[str, Optional[Any]] = call_fireworks_api_no_structure(prompt, LLAMA_70B_MODEL)
    return None if result['error'] else result['output']


def get_match_summary_explanation(expanded_request: str, profile: Dict[str, Any]) -> Optional[str]:
    """
    Get the summary explanation for a match.

    Two model calls are made: the Llama 70B model writes the explanation, then the
    function-calling model parses that reply into a structured ``summary`` field.

    Args:
        expanded_request (str): The expanded query the profile was matched against.
        profile (Dict[str, Any]): Matched profile; its 'form_submission' text is
            placed in the prompt.

    Returns:
        Optional[str]: The parsed summary, or None when either API call fails or
        the parsed output has no summary.
    """
    summary_explain_prompt_template = '''
    You are given a detailed, expanded query and a matched profile of an engineer. Your task is to generate a summary explanation that highlights why the profile is a good match for the query. The summary should include key points about the engineer's skills, experiences, and projects that align with the needs and goals outlined in the query. Ensure that the explanation is clear, concise, and compelling, emphasizing the most relevant aspects of the match.

    Expanded Query:

    Copy code
    {expanded_query}
    Matched Profile:

    Copy code
    {matched_profile}

    Your response should be technical, consise and fun to read. It should be no more than 100 words. No yapping.
    Enclose your response in <summary> tags.
    '''
    summary_explain_prompt = summary_explain_prompt_template.format(expanded_query=expanded_request, matched_profile=profile['form_submission'])
    response: Dict[str, Optional[Any]] = call_fireworks_api_no_structure(summary_explain_prompt, LLAMA_70B_MODEL)
    # Do not ask the parser model to "parse" a failed call; there is nothing to extract.
    if response['error'] or response['output'] is None:
        print("Failed to generate match summary explanation.")
        return None

    class SummaryExplanation(BaseModel):
        summary: str = Field(description="The summary explanation for a matched profile to the request.")

    # NOTE(review): the whole response dict (not just response['output']) is
    # interpolated below; kept as-is so the parser prompt is unchanged on success.
    structure_parse_prompt = f'''
    You are given a response from an LLM. Your task is to parse the response and return the summary explanation.
    Response:
    {response}
    '''
    summary_response = call_fireworks_api_with_structure(structure_parse_prompt, SummaryExplanation, FIREFUNC_MODEL)
    parsed_output = summary_response['output']
    # The structured helper returns output=None on failure; indexing it would raise
    # TypeError, so report the failure as None instead.
    if summary_response['error'] or not parsed_output:
        print("Failed to parse match summary explanation.")
        return None
    return parsed_output.get('summary')

def get_match_profiles(query_text: str) -> List[Dict[str, Any]]:
    """
    Retrieves matching profiles based on the query text.

    The five nearest entries are fetched from the vector store, the profile names
    in their metadata are de-duplicated, and the corresponding documents are
    loaded from the 'profiles' collection of the 'kinconnect' database.

    Args:
        query_text (str): The query text to search for matching profiles.

    Returns:
        List[Dict[str, Any]]: Matching profiles, ordered by the position at which
        each name first appeared in the search results. Empty when there are no hits.
    """
    vector_store = get_vector_store()
    documents_with_scores = vector_store.similarity_search_with_score(query_text, k=5)

    # Several hits may carry the same profile name; remember each name once,
    # keyed by the rank of its first occurrence.
    rank_by_name: Dict[str, int] = {}
    for document, _score in documents_with_scores:
        profile_name = document.metadata.get('name')
        if profile_name is not None and profile_name not in rank_by_name:
            rank_by_name[profile_name] = len(rank_by_name)
    if not rank_by_name:
        return []

    # No module-level collection handle is defined in this cell, so open (and
    # close) a client here, the same way get_profile_by_name reaches the database.
    with MongoClient(os.getenv('MONGO_CONNECTION_STRING')) as client:
        profiles_collection = client['kinconnect']['profiles']
        profiles = list(profiles_collection.find({'name': {'$in': list(rank_by_name)}}))

    # "$in" does not return documents in the order of the supplied names; restore
    # the search order (assumes the vector store returns best matches first).
    profiles.sort(key=lambda profile: rank_by_name.get(profile.get('name'), len(rank_by_name)))
    return profiles

def get_matches_for_profile_with_name(name: str) -> Optional[pd.DataFrame]:
    """
    Retrieves matching profiles for a given profile name.

    The profile is looked up by name, turned into an expanded query, matched
    against the stored profiles, and every match other than the profile itself
    gets a generated 'summary_explanation'.

    Args:
        name (str): The name of the profile to find matches for.

    Returns:
        Optional[pd.DataFrame]: One row per match, without the '_id' and
        'form_submission' columns, or None if the profile is unknown, the query
        could not be built, or no matches were found.
    """
    profile = get_profile_by_name(name)
    if not profile:
        print(f"No profile found with name: {name}")
        return None
    query_string = create_profile_matching_request(profile)
    if not query_string:
        print("Failed to create profile matching request.")
        return None
    profile_matches = get_match_profiles(query_string)
    # A profile can match itself; drop it from its own results.
    profile_matches = [match for match in profile_matches if match.get('name') != name]
    if not profile_matches:
        # An empty frame has no '_id'/'form_submission' columns to drop, so bail out
        # here as the docstring promises instead of failing further down.
        print(f"No matches found for profile: {name}")
        return None
    df = pd.DataFrame(profile_matches)
    df['summary_explanation'] = df.apply(lambda row: get_match_summary_explanation(expanded_request=query_string, profile=row), axis=1)
    return df.drop(columns=['_id', 'form_submission'])
    

def handler(pd: "pipedream"):
    """Workflow step entry point: print the name of the profile saved upstream.

    Args:
        pd: Step context; the name is read from
            pd.steps["save_to_db"]["$return_value"]["inserted_profile"]["name"].

    Returns:
        None. The matching call is still commented out below.
    """
    # Database handles are opened here but not used yet by this step.
    mongo_client = MongoClient(os.getenv('MONGO_CONNECTION_STRING'))
    database = mongo_client['kinconnect']
    profiles_collection = database['profiles']
    profile_chunks_collection = database['profile_chunks']

    # Reference data from the previous step.
    inserted_profile = pd.steps["save_to_db"]["$return_value"]["inserted_profile"]
    profile_name = inserted_profile["name"]
    print(profile_name)
    # matches_df = get_matches_for_profile_with_name("Nehil")
    # Return data for use in future steps
    # return matches_df.to_dict(orient='records')


In [None]:
from typing import List
from langchain_core.messages import HumanMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_fireworks import ChatFireworks, FireworksEmbeddings

# Prompt template used by extract_portfolio (filled with str.format(bio=...)); asks the
# model to collect the projects described in the form submission.
portfolio_prompt_string='''
You are a data generation app which extracts the relevant details about a engineering/design/product management profile. 
We have a form submission about the profile of the participant. They have answered few questions about themselves.
Find all the relevant details about their portfolio from their answers. Make sure to look at the details of each project. You want to collect a list of projects they have worked on.
                        
# BIO
{bio}
'''

# Prompt template used by extract_profile_details (filled with str.format(bio=...));
# asks the model for skills, accolades, honors and current interests.
profile_prompt_string='''You are a data generation app which extracts the relevant details about a engineering/design/product management profile. 
We have a form submission about the profile of the participant. They have answered few questions about themselves.
Given their form submission which asks them a few questions. Find all the attributes about the profile so you can call the right function.
Find all the relevant details about their profile. Find their skills, accolades, honors and current interests.
                  
# BIO
{bio}
'''

# Prompt template used by extract_career_history (filled with str.format(bio=...));
# asks the model to list every company and date range, using the current date as the
# end date for open-ended roles.
career_history_prompt_string='''
Use the questions answered by participants of a hackathon to call the function with the right arguments.
Find the relevant career related information in the bio to create career history. Make sure you capture all the companies and dates in order.
This information is very important so missing any career event will be expensive mistake.

If date is not mentioned or it says present/current, then the end date should be current date which is in timestamp.

# BIO
{bio}

'''


# Structured-output schema for the basic profile attributes; passed to call_api by
# extract_profile_details.
class ProfileModel(BaseModel):
    # Required: the person's name.
    name: str = Field(..., title="Name of the person")
    # The only field with a default (None); the others are declared required with `...`.
    honors: list[str] = Field(None, title="Honors, Awards and recognition they have received in life")
    # Required: interests / current focus.
    interests: list[str] = Field(..., title="Interests and current focus of theirs the work or the event")
    # Required: skills.
    skills: list[str] = Field(..., title="Skills they have")


# One role in a career history; used as the element type of CareerHistory.history.
# All fields are required strings, including the dates (no date parsing happens here).
class CareerEntry(BaseModel):
    company: str = Field(..., description="Company they worked at")
    title: str = Field(..., description="Title of the role they held")
    description: str = Field(..., description="Description of the role they held")
    start_date: str = Field(..., description="Start date of the role")
    end_date: str = Field(..., description="End date of the role")

# Structured-output schema for the whole career history; passed to call_api by
# extract_career_history.
class CareerHistory(BaseModel):
    # Required list of CareerEntry items.
    history: List[CareerEntry] = Field(..., description="All the companies you have been at as part of your career")


# One project in a portfolio; used as the element type of Portfolio.projects.
# Both fields are required strings.
class ProjectEntry(BaseModel):
    title: str = Field(..., title="Title of the project")
    description: str = Field(..., title="Description of the project")

# Structured-output schema for the list of projects; passed to call_api by
# extract_portfolio.
class Portfolio(BaseModel):
    # Required list of ProjectEntry items.
    projects: List[ProjectEntry] = Field(..., description="All the projects you have worked on")

def call_api(prompt, structed_class, model):
    """Ask a Fireworks chat model for output shaped like ``structed_class``.

    Args:
        prompt: Text sent as the only human message.
        structed_class: Schema class handed to ``with_structured_output``.
        model: Fireworks model identifier.

    Returns:
        dict: {"output": <result of .dict() on the reply>, "error": None} on success,
        or {"output": None, "error": <exception>} when the call raises.
    """
    structured_llm = ChatFireworks(model=model).with_structured_output(structed_class)

    result = {"output": None, "error": None}
    try:
        message = HumanMessage(content=prompt)
        result["output"] = structured_llm.invoke([message]).dict()
    except Exception as exc:
        # Failures are reported through the "error" slot rather than raised.
        result["error"] = exc
    return result

def convert_question_answer_pair_to_markdown(question_answer_pair):
    """Render a question -> answer mapping as markdown.

    Each pair becomes a "## <question>" heading, the answer on the next line,
    and a blank line after it; pairs keep the mapping's iteration order.

    Args:
        question_answer_pair: Mapping of question text to answer.

    Returns:
        str: The concatenated markdown ("" for an empty mapping).
    """
    sections = [
        f"## {question}\n{answer}\n\n"
        for question, answer in question_answer_pair.items()
    ]
    return "".join(sections)

def extract_career_history(question_answer_pair_markdown):
    """Extract a CareerHistory from the markdown form submission.

    Fills career_history_prompt_string with the submission and sends it through
    call_api with the firefunction-v2 model.

    Returns:
        dict: The {"output", "error"} result of call_api.
    """
    return call_api(
        career_history_prompt_string.format(bio=question_answer_pair_markdown),
        CareerHistory,
        "accounts/fireworks/models/firefunction-v2",
    )


def extract_profile_details(question_answer_pair_markdown):
    """Extract a ProfileModel from the markdown form submission.

    Fills profile_prompt_string with the submission and sends it through
    call_api with the mistral-7b-instruct-v3 model.

    Returns:
        dict: The {"output", "error"} result of call_api.
    """
    return call_api(
        profile_prompt_string.format(bio=question_answer_pair_markdown),
        ProfileModel,
        "accounts/fireworks/models/mistral-7b-instruct-v3",
    )

def extract_portfolio(question_answer_pair_markdown):
    """Extract a Portfolio from the markdown form submission.

    Fills portfolio_prompt_string with the submission and sends it through
    call_api with the mistral-7b-instruct-v3 model.

    Returns:
        dict: The {"output", "error"} result of call_api.
    """
    return call_api(
        portfolio_prompt_string.format(bio=question_answer_pair_markdown),
        Portfolio,
        "accounts/fireworks/models/mistral-7b-instruct-v3",
    )


def handler(pd: "pipedream"):
    """Workflow step entry point: build a structured profile from a form submission.

    The question/answer pairs returned by the "get_formated_question_answer_pairs"
    step are rendered to markdown, three extractions are run on that text
    (career history, profile details, portfolio), and the markdown is embedded.

    Args:
        pd: Step context; input is read from
            pd.steps["get_formated_question_answer_pairs"]["$return_value"].

    Returns:
        dict: The extracted profile details plus 'career_history', 'portfolio',
        'form_submission' (the markdown) and 'embeddings'.

    Raises:
        RuntimeError: If the profile-details extraction produced no output; the
            rest of the profile is built on top of that dict.
    """
    embedding_model = FireworksEmbeddings(model="nomic-ai/nomic-embed-text-v1.5")
    question_answer_pair = pd.steps["get_formated_question_answer_pairs"]["$return_value"]
    markdown_profile = convert_question_answer_pair_to_markdown(question_answer_pair)
    career_history = extract_career_history(markdown_profile)
    profile_details = extract_profile_details(markdown_profile)
    portfolio = extract_portfolio(markdown_profile)

    profile = profile_details['output']
    if profile is None:
        # call_api reports failures as output=None; fail with the real cause instead
        # of an opaque TypeError on the item assignments below.
        cause = profile_details.get('error')
        raise RuntimeError(
            f"Profile details extraction failed: {cause}"
        ) from (cause if isinstance(cause, BaseException) else None)

    # The optional sections are still stored as None when their extraction failed,
    # but the failure is surfaced rather than passing silently.
    for section_name, section in (('career_history', career_history), ('portfolio', portfolio)):
        if section['output'] is None:
            print(f"Warning: {section_name} extraction failed: {section['error']}")
        profile[section_name] = section['output']

    profile['form_submission'] = markdown_profile
    print(profile)
    profile['embeddings'] = embedding_model.embed_query(markdown_profile)
    return profile
    

In [6]:
import pandas as pd
# fp = '/Users/nehiljain/code/kinconnect/kinconnect_api/data/processed/matches_Nehil_Jain__TEST__20240703_160619.parquet'
fp = '/Users/nehiljain/code/kinconnect/kinconnect_api/data/processed/processed_profiles_20240705124308.parquet'
# Load the processed profiles once and reuse the frame for the dump below, rather
# than reading the same parquet file a second time.
df = pd.read_parquet(fp)

print(df[['name', 'career_history', 'portfolio']].to_dict(orient='records'))

[{'name': 'Benjamin Lee (Fake Profile)', 'career_history': {'history': array([{'company': 'Airbnb', 'description': 'Built and optimized frontend applications for Airbnb, focusing on enhancing user experience and improving performance.', 'end_date': '2018', 'start_date': '2016', 'title': 'Frontend Developer'},
       {'company': 'Stripe', 'description': "Worked on developing and maintaining Stripe's frontend applications, ensuring a seamless and user-friendly experience for customers.", 'end_date': '2024-07-05 19:42:38', 'start_date': '2018', 'title': 'Senior Frontend Developer'}],
      dtype=object)}, 'portfolio': {'projects': array([{'description': 'Developed a frontend application for an e-commerce platform that provided personalized product recommendations based on user preferences and browsing history. The application improved the user experience and increased customer engagement, leading to a significant increase in sales.', 'title': 'Personalized Product Recommendations for E-co

In [11]:
from kinconnect_api.db import delete_profile_and_chunks
import logging
# Raise the root logger to INFO for this cleanup run.
logging.getLogger().setLevel(logging.INFO)
# Example usage: remove each leftover test profile, in this order.
for profile_name_to_delete in (
    "Nehil Jain",
    "Nehil Jain (Test)",
    "Nehil Jain (Test for recording)",
    "Nehil Jain (test for recording)",
):
    delete_profile_and_chunks(profile_name_to_delete)

2024-07-05 12:53:43,944 - INFO - Deleted profile with name: Nehil Jain
2024-07-05 12:53:43,981 - INFO - Deleted chunks associated with profile name: Nehil Jain
2024-07-05 12:53:44,509 - INFO - Deleted profile with name: Nehil Jain (Test)
2024-07-05 12:53:44,551 - INFO - Deleted chunks associated with profile name: Nehil Jain (Test)
2024-07-05 12:53:45,171 - INFO - Deleted profile with name: Nehil Jain (Test for recording)
2024-07-05 12:53:45,209 - INFO - Deleted chunks associated with profile name: Nehil Jain (Test for recording)
2024-07-05 12:53:45,785 - INFO - Deleted profile with name: Nehil Jain (test for recording)
2024-07-05 12:53:45,826 - INFO - Deleted chunks associated with profile name: Nehil Jain (test for recording)
