In [1]:
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import json
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

In [2]:
# Load environment variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials used by
# GoogleGenerativeAIEmbeddings further down — confirm against the .env file.
load_dotenv()

True

In [3]:
# Read the freelancer records into a DataFrame. The path is relative, so it
# resolves against the notebook's current working directory.
df = pd.read_json("assets/freelancers.json")

In [4]:
# Wrap the frame in a LangChain loader that uses each row's 'description'
# column as the document's page content.
loader = DataFrameLoader(df, page_content_column='description')
# Show the column names available in the frame.
df.columns

Index(['id', 'firstName', 'lastName', 'profileRating', 'role',
       'profilePicture', 'description', 'lastEmploymentString',
       'freelancerDomain', 'freelancerExpertise', 'timeEmployment', 'email',
       'phone', 'instagram', 'twitter', 'website', 'gender', 'birthDate',
       'birthPlace', 'languages', 'address', 'experience', 'academicCareer',
       'skills'],
      dtype='object')

In [5]:
# Columns serialised into each freelancer's JSON text, in output order.
# This currently mirrors every column reported by df.columns.
features = [
    # identity / headline
    'id',
    'firstName',
    'lastName',
    'profileRating',
    'role',
    'profilePicture',
    'description',
    # employment summary
    'lastEmploymentString',
    'freelancerDomain',
    'freelancerExpertise',
    'timeEmployment',
    # contact details
    'email',
    'phone',
    'instagram',
    'twitter',
    'website',
    # personal details
    'gender',
    'birthDate',
    'birthPlace',
    'languages',
    'address',
    # background
    'experience',
    'academicCareer',
    'skills',
]

In [6]:
# Stringify every selected column so each value is emitted as a JSON string
# (this is why the output shows "id": "1" rather than 1).
df_str = df[features].astype(str)
# Build one JSON object per freelancer, keyed by column name.
# ensure_ascii=False keeps any non-ASCII characters as readable text instead
# of \uXXXX escapes, so the embedding model and the LLM receive the same words
# a human would read. Records that are pure ASCII serialise exactly as before.
texts = [
    json.dumps(record, ensure_ascii=False)
    for record in df_str.to_dict(orient="records")
]

> Create a list in which each element is a JSON string describing one freelancer.

In [7]:
# Peek at the first record only: echoing the whole list would write every
# freelancer's e-mail and phone fields into the saved notebook output.
texts[:1]

['{"id": "1", "firstName": "Aymen", "lastName": "Ben Ali", "profileRating": "4.5", "role": "Software Engineer", "profilePicture": "/profilePictures/m1.jpg", "description": "Lorem ipsum dolor sit amet consectetur, adipisicing elit. Aut quis nobis reiciendis tempore ullam, reprehenderit, officiis natus dignissimos aliquid facere deleniti in a odio ea voluptate harum dolorum. Earum, repellat error. Voluptatum hic, eveniet, odio omnis rerum repudiandae voluptates nobis reprehenderit, totam consectetur rem fuga. Quaerat dolorem aperiam molestiae facilis in. Consequatur magni harum quod vel molestiae nemo impedit sunt alias ad animi eos quisquam mollitia voluptatum recusandae, adipisci et cumque minima nesciunt natus repellat asperiores explicabo fugit voluptate. Quod repellendus voluptatum impedit asperiores unde quisquam adipisci esse tempore facere", "lastEmploymentString": "il ya 2 jours", "freelancerDomain": "Product Development", "freelancerExpertise": "marketing", "timeEmployment": "P

In [8]:
# Load the DataFrame rows through the LangChain loader.
# NOTE(review): `documents` is not referenced by any later cell visible here;
# the vector store below is built from `texts` instead — confirm it is needed.
documents = loader.load()

In [9]:
# NOTE(review): this name is not referenced by any cell visible here, and the
# embeddings actually used below come from GoogleGenerativeAIEmbeddings
# ("models/embedding-001"), not this model — confirm whether it is still needed.
embeddings_model_name = "all-MiniLM-L6-v2"

In [10]:
# Embedding client for the "models/embedding-001" Google model; it is used
# both to embed the texts and as the embedding function handed to FAISS.
model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [11]:
# Embed every freelancer JSON string; the result is a list of float vectors
# that is later zipped with `texts`.
vectors = model.embed_documents(texts)

In [12]:
# Show just the first embedding: echoing every vector floods the notebook
# with a long column of floats for each record.
vectors[:1]

[[0.05082392692565918,
  -0.04934182018041611,
  -0.024165434762835503,
  -0.012344611808657646,
  0.09535566717386246,
  -0.01274576224386692,
  0.011533556506037712,
  0.0012846230529248714,
  0.00743699399754405,
  0.057147491723299026,
  0.014497225172817707,
  0.00606469577178359,
  -0.027645166963338852,
  -0.011728320270776749,
  0.010607227683067322,
  -0.024335848167538643,
  0.011618019081652164,
  0.0217024777084589,
  0.013146873563528061,
  -0.026963505893945694,
  0.02162831462919712,
  -0.01127319224178791,
  -0.04122424125671387,
  -0.016896095126867294,
  -0.0038043283857405186,
  0.004717886447906494,
  0.0658600851893425,
  -0.03606115281581879,
  -0.03416164219379425,
  0.05467039346694946,
  -0.04234292358160019,
  0.051991287618875504,
  -0.047701645642519,
  0.04147335886955261,
  -0.021129269152879715,
  -0.05243844911456108,
  -9.472780220676214e-05,
  0.0346071831882,
  0.04211737588047981,
  0.04644463583827019,
  -0.029682621359825134,
  -0.01559818163514137

> Pair each text with its embedding vector, as a list of `(text, vector)` tuples, so that both can be stored in FAISS

In [13]:
# Pair each JSON text with its embedding as (text, vector) tuples; this list
# is what gets passed to FAISS.from_embeddings in the next cell.
text_embeddings = list(zip(texts, vectors))

In [14]:
# Build the FAISS store from the precomputed (text, vector) pairs; `model` is
# supplied as the embedding function attached to the store.
vectorstore = FAISS.from_embeddings(
    text_embeddings=text_embeddings,
    embedding=model,
)

### Creating the vector store: persistent data the LLM can retrieve from, instead of fine-tuning

In [15]:
# Display the store object's repr to confirm it was created.
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x16e85f2aba0>

> Contents of `rag_data` (the index files) before saving the new vector store

In [16]:
# List the rag_data folder before saving, via the shell's `dir` command
# (the saved output below is from a Windows shell).
!dir rag_data

 Volume in drive C is Windows-SSD
 Volume Serial Number is 6CD0-8496

 Directory of C:\Users\moham_2xem6b3\freelancer-recommendation-platform\llm-service\notebooks\rag_data

05/08/2024  10:27 PM    <DIR>          .
06/08/2024  11:46 AM    <DIR>          ..
06/08/2024  11:34 AM             9 261 index.faiss
06/08/2024  11:34 AM             9 709 index.pkl
               2 File(s)         18 970 bytes
               2 Dir(s)  396 109 000 704 bytes free


In [17]:
# Write the FAISS store to the rag_data folder; the directory listings around
# this cell show the index.faiss and index.pkl files it contains.
vectorstore.save_local("rag_data")

> Contents of `rag_data` (the index files) after saving the freelancer data for RAG operations

In [18]:
# List the rag_data folder again after saving, to compare file sizes and
# timestamps with the earlier listing.
!dir rag_data

 Volume in drive C is Windows-SSD
 Volume Serial Number is 6CD0-8496

 Directory of C:\Users\moham_2xem6b3\freelancer-recommendation-platform\llm-service\notebooks\rag_data

05/08/2024  10:27 PM    <DIR>          .
06/08/2024  11:46 AM    <DIR>          ..
06/08/2024  11:46 AM            18 477 index.faiss
06/08/2024  11:46 AM             9 709 index.pkl
               2 File(s)         28 186 bytes
               2 Dir(s)  396 108 992 512 bytes free


In [25]:
# Ask the store for the single closest stored text (k=1) to a natural-language
# request.
reply = vectorstore.similarity_search("I want a freelancer that is proficient in Docker", k=1)
# Display the returned list of Document objects.
reply

[Document(page_content='{"id": "5", "firstName": "Nour", "lastName": "Maaloul", "profileRating": "4.4", "role": "DevOps Engineer", "profilePicture": "/profilePictures/f3.jpg", "description": "Lorem ipsum dolor sit amet consectetur, adipisicing elit. Aut quis nobis reiciendis tempore ullam, reprehenderit, officiis natus dignissimos aliquid facere deleniti in a odio ea voluptate harum dolorum. Earum, repellat error. Voluptatum hic, eveniet, odio omnis rerum repudiandae voluptates nobis reprehenderit, totam consectetur rem fuga. Quaerat dolorem aperiam molestiae facilis in. Consequatur magni harum quod vel molestiae nemo impedit sunt alias ad animi eos quisquam mollitia voluptatum recusandae, adipisci et cumque minima nesciunt natus repellat asperiores explicabo fugit voluptate. Quod repellendus voluptatum impedit asperiores unde quisquam adipisci esse tempore facere", "lastEmploymentString": "il y a 1 mois", "freelancerDomain": "DevOps", "freelancerExpertise": "cloud infrastructure", "ti