In [29]:
import pandas as pd
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction
import spacy
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import re
import string
from nltk.tokenize import RegexpTokenizer
from langdetect import detect
from gensim.models import Word2Vec
import chromadb
import numpy as np
import uuid

In [30]:
french_stopwords = set(stopwords.words('french'))

In [31]:
article = pd.read_csv("fstt-articles.csv")
clubs = pd.read_csv("fstt-clubs-info.csv")
dep = pd.read_csv("fstt-departements-info.csv")
formation = pd.read_csv("fstt-formation-initial.csv")

In [32]:
article.head()

Unnamed: 0,No,post_title,publish_date,post_content,post_link
0,1,CYBERSEC360° : PERSPECTIVES EN CYBERSÉCURITÉ A...,"mars 26, 2024",Un séminaire sera organisé le le samedi 30 Mar...,https://fstt.ac.ma/Portail2023/cybersec360-per...
1,2,PRÉSENTATION CONCOURS D’INNOVATION TDC,"mars 26, 2024",À la suite de l’appel à candidature au concour...,https://fstt.ac.ma/Portail2023/presentation-co...
2,3,SÉMINAIRE : WATER RESOURCES MANAGEMENT FOR SUS...,"mars 26, 2024","À l’occasion de la Journée mondiale de l’eau, ...",https://fstt.ac.ma/Portail2023/seminaire-water...
3,4,ORACLE MOROCCO R&D CENTER : PROGRAMME DE STAGE...,"mars 23, 2024",Oracle Morocco R&D Center lance l’appel aux ca...,https://fstt.ac.ma/Portail2023/oracle-morocco-...
4,5,APPEL À PROJET : «TERRITORY DEVELOPMENT CHALLE...,"mars 22, 2024",Le Centre régional d’investissement (CRI-TTA) ...,https://fstt.ac.ma/Portail2023/appel-a-projet-...


In [33]:
def is_french(text):
    try:
        return detect(text) == 'fr'
    except:
        return False
def preprocessing(text):
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    
    # Remove specific unwanted characters
    text = re.sub(r'«|»|“|”|’|‘', '', text)
    
        # Tokenization
    """tokens = sent_tokenize(text, language="french")
    print("Tokenization :" ,tokens)"""
    # Check if text is in French
    if not is_french(text):
        return ''
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Tokenization using RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tokens = [word for word in tokens if word not in french_stopwords]
    
    
    # Stemming
    stemmer =nltk.stem.snowball.FrenchStemmer()
    tokens_stemmed = [stemmer.stem(word) for word in tokens]
    
    # Join tokens back into a single string
    return ' '.join(tokens_stemmed)

In [34]:
article.drop(columns=["No"], inplace=True)
clubs.drop(columns=["No"], inplace=True)
formation.drop(columns=["No"], inplace=True)

In [35]:
article["post_title"] = article["post_title"].apply(preprocessing)
article["post_content"] = article["post_content"].apply(preprocessing)
clubs["departement_info"] = clubs["departement_info"].apply(preprocessing)
formation["mst_name"] = formation["mst_name"].apply(preprocessing)
formation["mst_objectif"] = formation["mst_objectif"].apply(preprocessing)
formation["mst_program"] = formation["mst_program"].apply(preprocessing)
formation["mst_skills"] = formation["mst_skills"].apply(preprocessing)
formation["mst_Coord"] = formation["mst_Coord"].apply(preprocessing)

In [36]:
# create EF with custom endpoint
ef = OllamaEmbeddingFunction(
    model_name="nomic-embed-text",
    url="http://localhost:11434/api/embeddings",
)

In [37]:
client = chromadb.Client()

In [41]:
client.delete_collection(name="text_embeddings")

In [42]:
# Create a collection in ChromaDB with the OllamaEmbeddingFunction
collection = client.create_collection(name="text_embeddings", embedding_function=ef)

In [43]:
import uuid

def process_and_store_embeddings(dataframe, column_names, collection):
    ids = []
    metadatas = []
    documents = []

    for idx, row in dataframe.iterrows():
        doc_metadata = {}
        for key, value in row.items():
            if key not in column_names:
                doc_metadata[key] = value  # Leave non-string fields as is
        
        for column in column_names:
            sentence = row[column]
            if sentence is not None and sentence != '':
                ids.append(str(uuid.uuid1()))
                metadatas.append(doc_metadata)
                documents.append(sentence)
    collection.add(
        ids=ids,
        #embeddings=embeddings,
        metadatas=metadatas,
        documents=documents
    )

# Assuming article, clubs, formation DataFrames are already preprocessed
# Store embeddings, metadata, and documents from each DataFrame
process_and_store_embeddings(article, ["post_title", "post_content"], collection)
process_and_store_embeddings(clubs, ["departement_info"], collection)
process_and_store_embeddings(formation, ["mst_name", "mst_objectif", "mst_program", "mst_skills", "mst_Coord"], collection)

print("Embeddings, metadata, and documents stored in ChromaDB successfully.")


Embeddings, metadata, and documents stored in ChromaDB successfully.


In [44]:
collection.peek()

{'ids': ['b0842470-1756-11ef-a627-803049a3c985',
  'b0842600-1756-11ef-a627-803049a3c985',
  'b08429e8-1756-11ef-a627-803049a3c985',
  'b0842a92-1756-11ef-a627-803049a3c985',
  'b0842de4-1756-11ef-a627-803049a3c985',
  'b0842e84-1756-11ef-a627-803049a3c985',
  'b08431a4-1756-11ef-a627-803049a3c985',
  'b084323a-1756-11ef-a627-803049a3c985',
  'b0843546-1756-11ef-a627-803049a3c985',
  'b08435d2-1756-11ef-a627-803049a3c985'],
 'embeddings': [[0.8312462568283081,
   -0.39553454518318176,
   -2.517561674118042,
   -1.0600979328155518,
   1.1688108444213867,
   0.05425160378217697,
   0.3003782629966736,
   1.1378575563430786,
   0.013865257613360882,
   0.6300860643386841,
   -1.334061861038208,
   -0.6684838533401489,
   1.4093351364135742,
   -0.36715438961982727,
   1.7389932870864868,
   -0.37037375569343567,
   -0.6911161541938782,
   -0.4262985587120056,
   0.2172817587852478,
   -1.4302397966384888,
   -0.4119826555252075,
   -1.0122567415237427,
   -0.5350307822227478,
   0.5283504