## ChromaDB

In [1]:
import chromadb 

In [2]:
# Create a client on Chroma's default in-memory backend.
client = chromadb.Client()

# Reuse "my_collection" if it already exists, otherwise create it.
collection = client.get_or_create_collection(name="my_collection")

# Seed the collection with two short documents under fixed ids.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "This document is about China",
        "This document is about America",
    ],
)


In [3]:
# Fetch every record in the collection (no ids or filters given) and display it.
all_docs = collection.get()
all_docs

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'documents': ['This document is about China',
  'This document is about America'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [None, None]}

In [4]:
# Fetch only the record whose id is "id1" and display it.
documents = collection.get(ids=["id1"])
documents

{'ids': ['id1'],
 'embeddings': None,
 'documents': ['This document is about China'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [None]}

In [5]:
# Ask for the 2 stored documents most similar to the query text "Jiaozi" and display the result.
results = collection.query(query_texts=["Jiaozi"], n_results=2)
results

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['This document is about China',
   'This document is about America']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.1991695165634155, 1.8454222679138184]]}

In [6]:
# Remove the record with id "id1", then list the collection; only "id2" should remain.
collection.delete(ids=["id1"])
collection.get()

{'ids': ['id2'],
 'embeddings': None,
 'documents': ['This document is about America'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [None]}

In [7]:
# Delete everything that is currently stored. The ids are looked up at call time
# instead of being taken from the `all_docs` snapshot of an earlier cell, which
# still lists the already-deleted "id1" and would miss anything added since.
remaining_ids = collection.get()['ids']
if remaining_ids:  # skip the call entirely when there is nothing left to delete
    collection.delete(ids=remaining_ids)
collection.get()    # the collection should now be empty

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}

In [8]:
# Re-insert both documents, this time attaching a source URL to each as metadata.
# NOTE(review): the second document string ends with a comma inside the quotes,
# unlike the first insert earlier in the notebook; it is kept verbatim here.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "This document is about China",
        "This document is about America,",
    ],
    metadatas=[
        {"url": "https://en.wikipedia.org/wiki/China"},
        {"url": "https://en.wikipedia.org/wiki/United_States"},
    ],
)

In [9]:
# List the whole collection again; "id1" and "id2" should be back.
collection.get()

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'documents': ['This document is about China',
  'This document is about America,'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'url': 'https://en.wikipedia.org/wiki/China'},
  {'url': 'https://en.wikipedia.org/wiki/United_States'}]}

In [10]:
# Repeat the "Jiaozi" similarity query against the re-added documents and display the result.
results = collection.query(query_texts=["Jiaozi"], n_results=2)
results

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['This document is about China',
   'This document is about America,']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/China'},
   {'url': 'https://en.wikipedia.org/wiki/United_States'}]],
 'distances': [[1.1991695165634155, 1.8767603635787964]]}

## ChatGroq

In [11]:
from langchain_groq import ChatGroq
from secret_key import groq_api_key

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Set up the Groq chat model; the API key comes from the local secret_key module.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    groq_api_key=groq_api_key,
    temperature=0,
)

# Send one prompt and print the whole response object (content plus metadata).
# NOTE(review): "firstt" is misspelled in the prompt; it is reproduced unchanged here.
response = llm.invoke("The firstt president of the USA is")
print(response)
    


content='The first president of the United States of America is George Washington. He served from 1789 to 1797.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 43, 'total_tokens': 68, 'completion_time': 0.066136524, 'prompt_time': 0.002184844, 'queue_time': 0.091735793, 'total_time': 0.068321368}, 'model_name': 'llama-3.3-70b-versatile', 'system_fingerprint': 'fp_155ab82e98', 'service_tier': 'on_demand', 'finish_reason': 'stop', 'logprobs': None} id='run--ff71215b-682c-4da4-849f-b3e4f2451c59-0' usage_metadata={'input_tokens': 43, 'output_tokens': 25, 'total_tokens': 68}


## WebBaseLoader

In [13]:
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [14]:
# Load the job-posting page and keep the text of the last document the loader returns.
loader = WebBaseLoader("https://lifeattiktok.com/search/7533507587720726791")
data = loader.load()[-1].page_content
print(data)

Multimodal Algo Researcher Intern (AI Innovation Center) - 2026 Start (PhD)#LifeAtTikTok#LifeAtTikTokDiversity & InclusionOur PhilosophyTeamsAdvertising & SalesCorporate FunctionsDesignE-CommerceEngineering & TechnologyGlobal OperationsMarketing & CommunicationsProductTikTok  U.S. Data SecurityHow we hireApplying to TikTokInterview tipsFAQLocationsEarly CareersBlogJobsApplyCompanyAbout TikTokNewsroomContactProgramsTikTok for GoodTikTok for DevelopersEffect HouseAdvertise on TikTokTikTok RewardsTikTok BrowseTikTok EmbedsResourcesHelp centerSafety CenterCreator PortalCommunity GuidelinesTransparencyAccessibilityLegalPrivacy PolicyCandidate Privacy Policy Terms of ServiceEnglish日本語Join us as we inspire creativity and bring joy to millions of users worldwide.Ready for a career at TikTok?Discover a career that energizes and excites you every day.Search nowCompanyAbout TikTokNewsroomContactProgramsTikTok for GoodTikTok for DevelopersEffect HouseAdvertise on TikTokTikTok RewardsTikTok BrowseT

## PromptTemplate

In [15]:
from langchain_core.prompts import PromptTemplate

In [16]:
# Prompt that embeds the scraped page text ({page_data}) and instructs the model to
# return the job postings as JSON with the keys role, experience, skills and description.
prompt_extract = PromptTemplate.from_template(
    """
        ### SCRAPED TEXT FROM WEBSITE:
        {page_data}
        ### INSTRUCTION:
        The scraped text is from the career's page of a website.
        Your job is to extract the job postings and return them in JSON format containing the 
        following keys: `role`, `experience`, `skills` and `description`.
        Only return the valid JSON.
        ### VALID JSON (NO PREAMBLE):    
        """
)

# Pipe the prompt into the LLM, run it on the scraped text, and print the reply text.
chain_extract = prompt_extract | llm
res = chain_extract.invoke(input={'page_data': data})
print(res.content)

```json
{
  "role": "Multimodal Algo Researcher Intern (AI Innovation Center)",
  "experience": "PhD",
  "skills": [
    "Machine Learning",
    "Large Language Models (LLMs)",
    "Generative AI",
    "C/C++",
    "Python",
    "Data structures",
    "Fundamental algorithm skills"
  ],
  "description": "Drive core technology development for large language model code direction, continuously optimizing code comprehension, reasoning, and generation capabilities. Focus on improving code comprehension, reasoning, and generation capabilities in real-world production codebases, improving TikTok service code performance and privacy compliance. Explore Code Agent capabilities suitable for actual business production environments, and improve TikTok R&D efficiency."
}
```


In [17]:
# Check the Python type of the reply text before it is parsed.
type(res.content)

str

## JsonOutputParser

In [18]:
from langchain_core.output_parsers import JsonOutputParser

In [19]:
# Parse the model's reply text into a Python object with LangChain's JSON output parser.
json_parser = JsonOutputParser()
json_res = json_parser.parse(res.content)
json_res

{'role': 'Multimodal Algo Researcher Intern (AI Innovation Center)',
 'experience': 'PhD',
 'skills': ['Machine Learning',
  'Large Language Models (LLMs)',
  'Generative AI',
  'C/C++',
  'Python',
  'Data structures',
  'Fundamental algorithm skills'],
 'description': 'Drive core technology development for large language model code direction, continuously optimizing code comprehension, reasoning, and generation capabilities. Focus on improving code comprehension, reasoning, and generation capabilities in real-world production codebases, improving TikTok service code performance and privacy compliance. Explore Code Agent capabilities suitable for actual business production environments, and improve TikTok R&D efficiency.'}

In [20]:
# Check the Python type of the parsed result.
type(json_res)

dict

In [21]:
import pandas as pd

In [None]:
# Read the portfolio table (tech stacks and their showcase links) from the CSV
# next to the notebook and display it.
df = pd.read_csv('my_portfolio.csv')
df

Unnamed: 0,Techstack,Links
0,"React, Node.js, MongoDB",https://example.com/react-portfolio
1,"Angular,.NET, SQL Server",https://example.com/angular-portfolio
2,"Vue.js, Ruby on Rails, PostgreSQL",https://example.com/vue-portfolio
3,"Python, Django, MySQL",https://example.com/python-portfolio
4,"Java, Spring Boot, Oracle",https://example.com/java-portfolio
5,"Flutter, Firebase, GraphQL",https://example.com/flutter-portfolio
6,"WordPress, PHP, MySQL",https://example.com/wordpress-portfolio
7,"Magento, PHP, MySQL",https://example.com/magento-portfolio
8,"React Native, Node.js, MongoDB",https://example.com/react-native-portfolio
9,"iOS, Swift, Core Data",https://example.com/ios-portfolio


In [23]:
import chromadb
import uuid #for generating unique IDs

In [24]:
# Open (or create) the on-disk Chroma store at 'vectorstore' and the portfolio collection in it.
client = chromadb.PersistentClient('vectorstore')
collection = client.get_or_create_collection(name="portfolio_collection")

# Populate the collection only when it is empty, so re-running the cell does not
# insert the portfolio a second time under fresh random ids.
# NOTE(review): because the store persists between sessions, rows written from an
# earlier CSV stay in place; delete the collection (or the folder) to rebuild it from `df`.
if not collection.count() and len(df):
    # One batched add: every row goes into a single call instead of one call per row,
    # with documents, metadatas and ids all passed as equally long lists.
    collection.add(
        documents=df['Techstack'].tolist(),
        metadatas=[{"links": link} for link in df['Links']],
        ids=[str(uuid.uuid4()) for _ in range(len(df))],
    )

In [25]:
# Look up the 3 portfolio entries closest to the query text and display the full result.
links = collection.query(query_texts=["Python, Machine Learning"], n_results=3)
links

{'ids': [['ca10dabc-646a-4de9-aec5-591f574543b0',
   '25d024f5-4125-4f6a-a4d5-4afc2aed414f',
   '745fecc4-ca88-4bd6-8ddb-754b69b559cf']],
 'embeddings': None,
 'documents': [['Machine Learning, Python, TensorFlow',
   'Python, Django, MySQL',
   'Magento, PHP, MySQL']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'links': 'https://example.com/ml-python-portfolio'},
   {'links': 'https://example.com/python-portfolio'},
   {'links': 'https://example.com/magento-portfolio'}]],
 'distances': [[0.43525269627571106, 1.2631328105926514, 1.6880728006362915]]}

In [26]:
# Same query, but keep only the 'metadatas' part of the result (falling back to an empty list).
links = collection.query(query_texts=["Python, Machine Learning"], n_results=3).get('metadatas', [])
links

[[{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/python-portfolio'},
  {'links': 'https://example.com/magento-portfolio'}]]

In [27]:
# Keep the full query result in `links` and display only its 'metadatas' entry.
links = collection.query(query_texts=["Python, Machine Learning"], n_results=3)
links.get('metadatas', [])

[[{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/python-portfolio'},
  {'links': 'https://example.com/magento-portfolio'}]]

In [28]:
# Use the parsed posting as `job` and show its skills list.
job = json_res
job['skills']

['Machine Learning',
 'Large Language Models (LLMs)',
 'Generative AI',
 'C/C++',
 'Python',
 'Data structures',
 'Fundamental algorithm skills']

In [29]:
# Query the portfolio once per skill string (3 matches each) and keep only the
# metadata: the result is a list with one inner list of {'links': ...} dicts per skill.
links = collection.query(query_texts=job['skills'], n_results=3).get('metadatas', [])
links

[[{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/wordpress-portfolio'},
  {'links': 'https://example.com/magento-portfolio'}],
 [{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/android-portfolio'},
  {'links': 'https://example.com/java-portfolio'}],
 [{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/android-portfolio'},
  {'links': 'https://example.com/ios-ar-portfolio'}],
 [{'links': 'https://example.com/magento-portfolio'},
  {'links': 'https://example.com/wordpress-portfolio'},
  {'links': 'https://example.com/ml-python-portfolio'}],
 [{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/python-portfolio'},
  {'links': 'https://example.com/magento-portfolio'}],
 [{'links': 'https://example.com/ios-portfolio'},
  {'links': 'https://example.com/magento-portfolio'},
  {'links': 'https://example.com/wordpress-portfolio'}],
 [{'links': 'ht

In [30]:
# Display the full parsed job posting.
job

{'role': 'Multimodal Algo Researcher Intern (AI Innovation Center)',
 'experience': 'PhD',
 'skills': ['Machine Learning',
  'Large Language Models (LLMs)',
  'Generative AI',
  'C/C++',
  'Python',
  'Data structures',
  'Fundamental algorithm skills'],
 'description': 'Drive core technology development for large language model code direction, continuously optimizing code comprehension, reasoning, and generation capabilities. Focus on improving code comprehension, reasoning, and generation capabilities in real-world production codebases, improving TikTok service code performance and privacy compliance. Explore Code Agent capabilities suitable for actual business production environments, and improve TikTok R&D efficiency.'}

In [31]:
from langchain_core.prompts import PromptTemplate
# Prompt for the cold email: {job_description} receives the parsed job posting and
# {link_list} the portfolio links found for its skills; the instructions fix the
# sender's persona and ask for the email text only.
prompt_email = PromptTemplate.from_template(
    """
    ### JOB DESCRIPTION:
    {job_description}

        ### INSTRUCTION:
        You are **Mengqing Hu**, a **Master’s student in CMS(Visual Computing)** at **TU Dresden**, with hands-on experience in **AI, Machine Learning, NLP, and Computer Vision**.  
        You have worked on projects involving **semantic search (LangChain + Elasticsearch)**, **Autoencoder-based feature extraction**, **Django web development**, and **AI-driven document retrieval pipelines**.  
        Drawing from your experience at **Fraunhofer IWU**, **FSD Fahrzeugsystemdaten GmbH**, and the **Institute of Mechatronic Engineering at TU Dresden**, you are skilled in developing and deploying intelligent systems that automate analysis, optimize performance, and integrate advanced algorithms into production-ready solutions.  
        Your task is to write a cold email to the client regarding the job mentioned above, describing your background and capability in fulfilling their needs.  
        Also add the most relevant ones from the following links to showcase your previous projects or portfolio: {link_list}  
        Remember you are **Mengqing Hu** from **TU Dresden**.  
        Do not provide a preamble.  
        ### EMAIL (NO PREAMBLE):
        
        """
)

# Pipe the prompt into the LLM, fill in the job and the links, and print the generated email.
# NOTE(review): this reassigns `res`, which previously held the extraction reply
# that the JSON-parsing cell reads; re-running that cell afterwards would parse the email instead.
chain_email = prompt_email | llm
res = chain_email.invoke(input={'job_description': job, 'link_list': links})
print(res.content)

Subject: Application for Multimodal Algo Researcher Intern (AI Innovation Center)

Dear Hiring Manager,

I am Mengqing Hu, a Master's student in Visual Computing at TU Dresden, with a strong background in AI, Machine Learning, NLP, and Computer Vision. I am excited to apply for the Multimodal Algo Researcher Intern position at the AI Innovation Center, where I can leverage my skills to drive core technology development and improve code comprehension, reasoning, and generation capabilities.

With hands-on experience in developing and deploying intelligent systems, I am confident in my ability to automate analysis, optimize performance, and integrate advanced algorithms into production-ready solutions. My experience working on projects involving semantic search (LangChain + Elasticsearch), Autoencoder-based feature extraction, Django web development, and AI-driven document retrieval pipelines has equipped me with the skills to tackle complex problems and deliver innovative solutions.

As