In [2]:
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_core.runnables import RunnableParallel,RunnablePassthrough
import os,asyncio,warnings
warnings.filterwarnings("ignore")

data = [file for file in os.listdir('data') if file.endswith(".pdf")]

In [10]:
from langchain.document_loaders import PyPDFLoader
from langchain_core.documents import Document
pages = PyPDFLoader(file_path=f'/home/faizan/Downloads/Get_Started_With_Smallpdf.pdf').load()


In [11]:
def flatteningdocs(data):
    return [item for sublist in data for item in sublist]

from langchain.document_loaders import PyPDFLoader
def get_data(file_path):
    pages = PyPDFLoader(file_path=f'data/{file_path}').load()
    if len(pages) > 1:
        pdfstring = ""
        metadata = {}
        for page in pages:
            pdfstring += page.page_content
            metadata.update(page.metadata)

        return [Document(
            page_content=pdfstring,
            metadata=metadata)]
        
    else:
        return pages

content_chain = RunnablePassthrough() | get_data
content = flatteningdocs(await content_chain.abatch(data))

In [12]:
len(content)

27

In [5]:
import pandas as pd
df = pd.read_csv('data/UpdatedResumeDataSet.csv')
# resumes = df['Resume'].tolist()
resumes = df[(df['Category'] == "Database") | (df['Category'] == "Data Science") | (df['Category'] == 'Python Developer')]

resumes.drop_duplicates(inplace=True)
resumes.shape

(27, 2)

In [6]:
from fpdf import FPDF

data_tuples = [tuple(x) for x in resumes.to_records(index=False)]

def save_to_pdf(data):
    for index,(category, resume_text) in enumerate(data):
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font('Arial', size=10)
        pdf.set_title(f"{category}.pdf")
        
        # Encode resume_text to 'latin-1' to handle Unicode characters
        try:
            encoded_text = resume_text.encode('latin-1', 'replace').decode('latin-1')
        except Exception as e:
            print(f"Error encoding text for {category}.pdf: {e}")
            continue
        
        pdf.multi_cell(0, 10, encoded_text, border=1)
        
        # Save the PDF file
        pdf_file_name = f"data/{category}{index + 1}.pdf"
        pdf.output(pdf_file_name)

save_to_pdf(data_tuples)

In [7]:
des = """Company Description
 
Evergreen Future Tech (EFT) is a software solutions provider located in Islamabad, Pakistan. We offer cutting-edge software solutions and specialize in artificial intelligence technologies.

 Role Description
 
This is a contract role for an Artificial Intelligence Engineer. The AI Engineer will be responsible for developing and implementing AI technologies, such as pattern recognition, neural networks, and natural language processing. The engineer will also be involved in software development and computer science-related tasks. This role is an on-site position located in Islamabad, Pakistan.

 Qualifications
 
Strong background in computer science and software development
Proficiency in pattern recognition and neural networks
Experience in natural language processing (NLP)
Problem-solving and analytical skills
Experience with AI frameworks and tools
Knowledge of machine learning algorithms
Excellent coding skills in languages such as Python or Java
Strong communication and collaboration skills
Bachelor's or master's degree in computer science or a related field"""

In [8]:
des2 = "we are looking for a doctor who is best in his feild mostly in the animle and he should be experinced."

In [29]:
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
embedding = HuggingFaceEmbeddings(model_name=model_name)
# vector_store = Chroma.from_texts(embedding=embedding,texts=resumes)
persist_directory = "database"
vector_store = Chroma.from_documents(embedding=embedding,documents=content,persist_directory=persist_directory)
vector_store

In [16]:

retriever = vector_store.as_retriever(
        search_kwargs={'k':20})

# retriever = vector_store.as_retriever(search_type="similarity_score_threshold",
        # search_kwargs={'score_threshold': 0.1,'k':30})
# print(len(getunique(retriever.invoke(des))))
len(retriever.invoke(des))

20

In [18]:
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers import ContextualCompressionRetriever
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq
import os

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "CV"

Groq = ChatGroq(
    temperature=0,
    model="llama3-70b-8192").with_fallbacks([ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key=os.getenv("google_api_key"))])

# llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key=os.getenv("google_api_key")).with_fallbacks([ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key=os.getenv("dgoogle_api_key"))]).with_fallbacks([Groq])

from langchain_core.prompts import PromptTemplate

from langchain.output_parsers.boolean import BooleanOutputParser

prompt_template = """You are a powerfull assistant your task is to check weather the given CV match the given job requirements.Return only Yes if it match else return No.
<job description>{question}</job description>
# <cv>{context}</cv>
> Relevant (YES / NO):"""

def _get_default_chain_prompt() -> PromptTemplate:
    return PromptTemplate(
        template=prompt_template,
        input_variables=["question", "context"],
        output_parser=BooleanOutputParser())

_filter = LLMChainFilter.from_llm(Groq,prompt=_get_default_chain_prompt())

compression_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=retriever)

compressed_docs = compression_retriever.invoke(des)

compressed_docs

[Document(metadata={'page': 4, 'source': 'data/Data Science8.pdf'}, page_content='Education Details \n B.Tech   Rayat and Bahra Institute of Engineering and Biotechnology\nData Science \nData Science\nSkill Details \nNumpy- Exprience - Less than 1 year months\nMachine Learning- Exprience - Less than 1 year months\nTensorflow- Exprience - Less than 1 year months\nScikit- Exprience - Less than 1 year months\nPython- Exprience - Less than 1 year months\nGCP- Exprience - Less than 1 year months\nPandas- Exprience - Less than 1 year months\nNeural Network- Exprience - Less than 1 year monthsCompany Details \ncompany - Wipro\ndescription - Bhawana Aggarwal\nE-Mail:bhawana.chd@gmail.com\nPhone: 09876971076\nVVersatile, high-energy professional targeting challenging assignments in Machine\nPROFILE SUMMARY\nâ–ª An IT professional with knowledge and experience of 2 years in Wipro Technologies in Machine\nLearning, Deep Learning, Data Science, Python, Software Development.\nâ–ª Skilled in managin

In [19]:
# # len(getunique(compressed_docs))
# from langchain.load import dumps,loads
# def getunique(docs):
#     # return [loads(_doc) for _doc in set([dumps(doc) for doc in docs])]
#     return [loads(_doc) for _doc in (set(dumps(doc) for doc in docs))]

# for doc in getunique(compressed_docs):
#     print(doc.metadata)
    
# # for doc in compressed_docs:
# # compressed_docs

import shutil
shortlisted_cvs = [cv.metadata['source'] for cv in compressed_docs]
for filename in shortlisted_cvs:
    shutil.copy(filename, 'shortlisted_cvs')

In [None]:
# llm = ChatGoogleGenerativeAI(max_retries=0,model="gemini-1.5-flash",google_api_key=os.getenv("google_api_key")).with_fallbacks([ChatGoogleGenerativeAI(max_retries=0,model="gemini-1.5-flash",google_api_key=os.getenv("dgoogle_api_key"))]).with_fallbacks([Groq])

In [21]:
[os.remove(f'shortlisted_cvs/{file}') for file in os.listdir('shortlisted_cvs')]


[None, None, None, None, None]

In [9]:
# # Install package
# %pip install --upgrade --quiet "unstructured[all-docs]" --break-system-packages

Note: you may need to restart the kernel to use updated packages.


In [28]:
def find_percent(number):
    return int(number * 0.70)

# Example usage:
num = 27
result = find_percent(num)
print(f"70% of {num} is {result}")


70% of 27 is 18


In [22]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
import os,asyncio,warnings
from langchain_chroma import Chroma
warnings.filterwarnings("ignore")
embedding = HuggingFaceEmbeddings()

persist_directory = "db2"

vector_store = Chroma.from_documents(persist_directory=persist_directory,collection_name="faizan1",documents=[Document(page_content="Hello, I'm a document!"),Document(page_content="This is a second document!")],embedding=embedding)

In [43]:
def main():
    return Chroma.from_documents(persist_directory=persist_directory,collection_name="faizan3",documents=[Document(page_content="we are pakistan"),Document(page_content="we are muslim"),Document(page_content="I am faizan mumtaz"),Document(page_content="we will fight"),Document(page_content="Muhammad Faizan Mumtaz is ML engineer."),Document(page_content="")],embedding=embedding)

r = main()

In [47]:
# r.delete_collection()
r.as_retriever(search_kwargs={"k":10}).invoke('we are pakis')

[Document(page_content='we are pakistan'),
 Document(page_content='we are pakistan'),
 Document(page_content='we are muslim'),
 Document(page_content='we are muslim'),
 Document(page_content='I am faizan mumtaz'),
 Document(page_content='I am faizan mumtaz'),
 Document(page_content='I am faizan mumtaz'),
 Document(page_content='we will fight'),
 Document(page_content='we will fight'),
 Document(page_content='Muhammad Faizan Mumtaz is ML engineer.')]

In [33]:
vector_store = Chroma(persist_directory=persist_directory,collection_name="faizan1",embedding_function=embedding)
# vector_store.invoke("second document")

In [39]:
vector_store

Collection(name=faizan1)

In [21]:
import chromadb
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings()

langchain_chroma = Chroma(
    collection_name="collection_name",
    embedding_function=embedding,
)

print("There are", langchain_chroma._collection.count(), "in the collection")

  from tqdm.autonotebook import tqdm, trange


There are 0 in the collection


In [1]:
import uuid
zip_filename = str(uuid.uuid4()) + ".zip"
zip_filename

'f10c37bf-f232-4d1e-911b-6afd3b5015a3.zip'

In [18]:
import os
os.makedirs('test',exist_ok=True)
os.makedirs("test/faizan",exist_ok=True)

In [19]:
import shutil
shutil.rmtree("test/faizan")