In [1]:
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import json
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
import os

In [2]:
# Load environment variables via python-dotenv so that later cells can read
# API credentials with os.getenv instead of hardcoding them.
load_dotenv()

True

In [3]:
# Read the freelancer records from the JSON asset into a DataFrame.
df = pd.read_json("assets/freelancers.json")

In [4]:
# Document loader over the frame; the 'description' column is designated as
# the page content of each document.
loader = DataFrameLoader(df, page_content_column='description')
# Show the column index to see which fields each record provides.
df.columns

Index(['id', 'firstName', 'lastName', 'profileRating', 'role',
       'profilePicture', 'description', 'lastEmploymentString',
       'freelancerDomain', 'freelancerExpertise', 'timeEmployment', 'email',
       'phone', 'instagram', 'twitter', 'website', 'gender', 'birthDate',
       'birthPlace', 'languages', 'address', 'experience', 'academicCareer',
       'skills'],
      dtype='object')

In [5]:
# Columns serialized into each freelancer's JSON text, in output order.
features = [
    # identity and headline
    "id", "firstName", "lastName", "profileRating", "role",
    "profilePicture", "description",
    # employment
    "lastEmploymentString", "freelancerDomain", "freelancerExpertise",
    "timeEmployment",
    # contact
    "email", "phone", "instagram", "twitter", "website",
    # personal details
    "gender", "birthDate", "birthPlace", "languages", "address",
    # background
    "experience", "academicCareer", "skills",
]

In [6]:
# Cast the selected columns to text, then serialize every row as one JSON
# object keyed by column name (one string per freelancer).
df_str = df[features].astype(str)
texts = [json.dumps(record) for record in df_str.to_dict(orient="records")]

> Create a list in which each element is a JSON string describing one freelancer.

In [7]:
# Preview a single record instead of echoing the whole list: every entry
# carries personal fields (email, phone, address, ...) that would otherwise be
# written into the saved notebook output.
texts[:1]

['{"id": "1", "firstName": "Aymen", "lastName": "Ben Ali", "profileRating": "4.5", "role": "Software Engineer", "profilePicture": "/profilePictures/m1.jpg", "description": "Lorem ipsum dolor sit amet consectetur, adipisicing elit. Aut quis nobis reiciendis tempore ullam, reprehenderit, officiis natus dignissimos aliquid facere deleniti in a odio ea voluptate harum dolorum. Earum, repellat error. Voluptatum hic, eveniet, odio omnis rerum repudiandae voluptates nobis reprehenderit, totam consectetur rem fuga. Quaerat dolorem aperiam molestiae facilis in. Consequatur magni harum quod vel molestiae nemo impedit sunt alias ad animi eos quisquam mollitia voluptatum recusandae, adipisci et cumque minima nesciunt natus repellat asperiores explicabo fugit voluptate. Quod repellendus voluptatum impedit asperiores unde quisquam adipisci esse tempore facere", "lastEmploymentString": "il ya 2 jours", "freelancerDomain": "Product Development", "freelancerExpertise": "marketing", "timeEmployment": "P

In [8]:
# Materialize the documents from the DataFrame loader.
# NOTE(review): `documents` is not referenced by any later cell visible here;
# the index below is built from `texts` instead - confirm whether it is needed.
documents = loader.load()

In [9]:
# Token for the Hugging Face Inference API, taken from the environment so that
# it is never hardcoded in the notebook.
inference_api_key = os.getenv('HUGGING_FACE_INFERENCE')
if not inference_api_key:
    # os.getenv returns None for a missing variable; stop here with a clear
    # message instead of failing later inside the embeddings client.
    raise RuntimeError(
        "Environment variable HUGGING_FACE_INFERENCE is not set; "
        "add it to your .env file or export it before running this notebook."
    )

In [12]:
# Google Generative AI embedding client.
# NOTE(review): `model` is not used by the later cells visible here - confirm.
model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Hugging Face Inference API embedding client; this is the one used to embed
# the freelancer texts and to back the FAISS store below.
embeddings_test = HuggingFaceInferenceAPIEmbeddings(
    model_name="Salesforce/SFR-Embedding-2_R",
    api_key=inference_api_key,
)

In [None]:
# Embed every freelancer JSON string with the Hugging Face inference client.
# NOTE(review): presumably a remote call on every run - consider caching the
# result to keep Restart & Run All cheap; confirm cost.
vectors = embeddings_test.embed_documents(texts)

> Pair each text with its embedding vector so that they can be stored together in FAISS.

In [None]:
# Summarize rather than printing every float of every embedding:
# (number of vectors, dimension of the first vector).
(len(vectors), len(vectors[0]) if vectors else 0)

In [None]:
# zip() silently stops at the shorter input, so a short or malformed response
# from the embedding API would otherwise produce a truncated or mis-paired
# index without any error. Check the counts before pairing.
if len(vectors) != len(texts):
    raise ValueError(
        f"Expected {len(texts)} embeddings but received {len(vectors)}; "
        "inspect the response of the embedding API."
    )
# One (text, vector) pair per freelancer, in the original order.
text_embeddings = list(zip(texts, vectors))

In [None]:
# Build the FAISS store from the precomputed (text, vector) pairs; the
# embedding client is passed along with them.
vectorstore = FAISS.from_embeddings(
    text_embeddings=text_embeddings,
    embedding=embeddings_test,
)

### Creating the vector store so the LLM can use persistent data (RAG) instead of fine-tuning

In [None]:
# Display the vector store object to confirm that it was created.
vectorstore

> Contents of the index directory before the vector store is saved

In [None]:
# List the index directory with the standard library: this works on any OS
# (the `dir` shell command does not) and yields an empty list instead of an
# error when the directory has not been created yet.
sorted(os.listdir("rag_data")) if os.path.isdir("rag_data") else []

In [None]:
# Persist the vector store to the local "rag_data" directory.
vectorstore.save_local("rag_data")

> Contents of the index directory after saving the personal data for RAG operations

In [None]:
# List the index directory with the standard library: this works on any OS
# (the `dir` shell command does not) and yields an empty list instead of an
# error if the directory is missing.
sorted(os.listdir("rag_data")) if os.path.isdir("rag_data") else []

In [None]:
# Retrieve the single closest freelancer record for a natural-language request.
reply = vectorstore.similarity_search(
    query="I want a freelancer that is proficient in Docker",
    k=1,
)
# Show the retrieved document(s).
reply

In [None]:
# Gemini chat model used to summarize the retrieved record; temperature=0 is
# passed to request the least random output.
# NOTE(review): confirm that "gemini-pro" is still an available model id for
# the installed langchain_google_genai / API version.
LLM = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0)

In [None]:
# `reply` is the list of documents returned by similarity_search. Formatting
# the list itself into the prompt would insert its Python repr (class name,
# metadata, escaped quotes) rather than the JSON the instructions refer to, so
# extract the stored text of each document first.
if not reply:
    raise ValueError("similarity_search returned no documents to summarize")
freelancer_json = "\n".join(doc.page_content for doc in reply)

# Prompt sent to the chat model: instructions followed by the raw JSON record.
temporary_template = f"""
Create a clear and concise summary of the freelancer described in the following json. 
Avoid any markdown responses or formatting, and focus on presenting the key details in a straightforward, easy-to-understand manner:
{freelancer_json}
"""
result = LLM.invoke(temporary_template)
# Display only the text of the model's answer.
result.content