In [1]:
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import json

In [2]:
df = pd.read_json("assets/freelancers.json")

In [3]:
loader = DataFrameLoader(df, page_content_column='description')
df.columns

Index(['id', 'firstName', 'lastName', 'profileRating', 'role',
       'profilePicture', 'description', 'lastEmploymentString',
       'freelancerDomain', 'freelancerExpertise', 'timeEmployment', 'email',
       'phone', 'instagram', 'twitter', 'website', 'gender', 'birthDate',
       'birthPlace', 'languages', 'address', 'experience', 'academicCareer',
       'skills'],
      dtype='object')

In [4]:
features = ['id', 'firstName', 'lastName', 'profileRating', 'role', 'profilePicture', 'description', 'lastEmploymentString', 'freelancerDomain', 'freelancerExpertise', 'timeEmployment', 'email', 'phone', 'instagram', 'twitter', 'website', 'gender', 'birthDate', 'birthPlace', 'languages', 'address', 'experience', 'academicCareer', 'skills']

In [5]:
df_str = df[features].astype(str)
texts = df_str.apply(lambda row: json.dumps(dict(zip(features, row))), axis=1).tolist()

> Creating a list, each element in the list represent a json describing the freelancer

In [6]:
texts

['{"id": "1", "firstName": "Aymen", "lastName": "Ben Ali", "profileRating": "4.5", "role": "Software Engineer", "profilePicture": "/profilePictures/m1.jpg", "description": "Lorem ipsum dolor sit amet consectetur, adipisicing elit. Aut quis nobis reiciendis tempore ullam, reprehenderit, officiis natus dignissimos aliquid facere deleniti in a odio ea voluptate harum dolorum. Earum, repellat error. Voluptatum hic, eveniet, odio omnis rerum repudiandae voluptates nobis reprehenderit, totam consectetur rem fuga. Quaerat dolorem aperiam molestiae facilis in. Consequatur magni harum quod vel molestiae nemo impedit sunt alias ad animi eos quisquam mollitia voluptatum recusandae, adipisci et cumque minima nesciunt natus repellat asperiores explicabo fugit voluptate. Quod repellendus voluptatum impedit asperiores unde quisquam adipisci esse tempore facere", "lastEmploymentString": "il ya 2 jours", "freelancerDomain": "Product Development", "freelancerExpertise": "marketing", "timeEmployment": "P

In [7]:
documents = loader.load()

In [8]:
embeddings_model_name = "all-MiniLM-L6-v2"

In [9]:
model = HuggingFaceEmbeddings(model_name=embeddings_model_name)

  from tqdm.autonotebook import tqdm, trange


In [10]:
vectors = model.embed_documents(texts)

In [11]:
vectors

[[-0.049947984516620636,
  0.08893857151269913,
  -0.044400785118341446,
  -0.02897726744413376,
  -0.04426559433341026,
  -0.04716157913208008,
  0.08161693811416626,
  0.06335072964429855,
  0.01898817904293537,
  -0.009297428652644157,
  0.08822046965360641,
  -0.12649479508399963,
  0.037973057478666306,
  -0.04942050203680992,
  -0.07018156349658966,
  -0.03187515214085579,
  -0.05222097039222717,
  0.052433740347623825,
  -0.03776174411177635,
  -0.06109045818448067,
  0.05171530693769455,
  0.07502777129411697,
  0.0456610769033432,
  -0.041027605533599854,
  -0.02127649076282978,
  0.0380084291100502,
  -0.07511080801486969,
  0.046056490391492844,
  -0.05333027243614197,
  -0.08768860250711441,
  0.016574004665017128,
  0.08735572546720505,
  0.051500629633665085,
  -0.016097893938422203,
  0.08225832879543304,
  0.06554406881332397,
  -0.07011149078607559,
  -0.017745204269886017,
  0.07602154463529587,
  0.012481060810387135,
  -0.02947523258626461,
  -0.0090323556214571,
  

> Creating a matrix with the texts and vectors, in goal to store them in FAISS

In [12]:
text_embeddings = list(zip(texts, vectors))

In [13]:
vectorstore = FAISS.from_embeddings(text_embeddings, model)

### Creating the vectorstore for LLM use and permanent data instead of fine tuning

In [14]:
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x267977fcb00>

> the indexes before storing them

In [15]:
!dir rag_data

 Volume in drive C is Windows-SSD
 Volume Serial Number is 6CD0-8496

 Directory of C:\Users\moham_2xem6b3\freelancer-recommendation-platform\llm-service\notebooks\rag_data

05/08/2024  10:22 PM    <DIR>          .
05/08/2024  10:21 PM    <DIR>          ..
               0 File(s)              0 bytes
               2 Dir(s)  396ÿ043ÿ390ÿ976 bytes free


In [16]:
vectorstore.save_local("rag_data")

> the indexes after storing the personal data for RAG operations

In [17]:
!dir rag_data

 Volume in drive C is Windows-SSD
 Volume Serial Number is 6CD0-8496

 Directory of C:\Users\moham_2xem6b3\freelancer-recommendation-platform\llm-service\notebooks\rag_data

05/08/2024  10:23 PM    <DIR>          .
05/08/2024  10:21 PM    <DIR>          ..
05/08/2024  10:23 PM             9ÿ261 index.faiss
05/08/2024  10:23 PM             9ÿ709 index.pkl
               2 File(s)         18ÿ970 bytes
               2 Dir(s)  396ÿ043ÿ259ÿ904 bytes free
