In [1]:
import pandas as pd
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction
import spacy
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import re
import string
from nltk.tokenize import RegexpTokenizer
from langdetect import detect
from gensim.models import Word2Vec
import chromadb
import numpy as np
import uuid

In [2]:
french_stopwords = set(stopwords.words('french'))

In [3]:
article = pd.read_csv("fstt-articles.csv")
clubs = pd.read_csv("fstt-clubs-info.csv")
dep = pd.read_csv("fstt-departements-info.csv")
formation = pd.read_csv("fstt-formation-initial.csv")

In [4]:
clubs.head()

Unnamed: 0,No,club_name,departement_info,club_link
0,1,GREENOLOGY,Le Club de l’environnement de la FST de Tanger...,https://fstt.ac.ma/Portail2023/greenology/
1,2,CADAC,https://www.facebook.com/CADAC.FSTT,https://fstt.ac.ma/Portail2023/club-d-art-dram...
2,3,CLUB GÉNIE CIVIL,Le Club Génie civil est un club scientifique d...,https://fstt.ac.ma/Portail2023/club-genie-civil/
3,4,CLUB LES SOPHISTES,https://www.facebook.com/SophistesFSTT,https://fstt.ac.ma/Portail2023/club-les-sophis...
4,5,Design & Photography,"Le « Club Design & Photography », est une fili...",https://fstt.ac.ma/Portail2023/club-design-pho...


In [5]:
def is_french(text):
    try:
        return detect(text) == 'fr'
    except:
        return False
def preprocessing(text):
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    
    # Remove specific unwanted characters
    text = re.sub(r'«|»|“|”|’|‘', '', text)
    
        # Tokenization
    """tokens = sent_tokenize(text, language="french")
    print("Tokenization :" ,tokens)"""
    # Check if text is in French
    if not is_french(text):
        return ''
    # Remove punctuation
    text = text.translate(str.maketrans(" ", " ", string.punctuation))
    
    # Tokenization using RegexpTokenizer
    pattern = r"[dnl]['´`]|\w+|\$[\d\.]+|\S+"
    tokenizer = RegexpTokenizer(pattern)
    tokens = tokenizer.tokenize(text)
    tokens = [word for word in tokens if word not in french_stopwords]
    
    
    # Stemming
    stemmer =nltk.stem.snowball.FrenchStemmer()
    tokens_stemmed = [stemmer.stem(word) for word in tokens]
    
    # Join tokens back into a single string
    return ' '.join(tokens_stemmed)

In [6]:
article.drop(columns=["No"], inplace=True)
clubs.drop(columns=["No"], inplace=True)
formation.drop(columns=["No"], inplace=True)
dep.drop(columns=["No"], inplace=True)

In [7]:
article["post_title"] = article["post_title"].apply(preprocessing)
article["post_content"] = article["post_content"].apply(preprocessing)
clubs["departement_info"] = clubs["departement_info"].apply(preprocessing)
formation["mst_name"] = formation["mst_name"].apply(preprocessing)
formation["mst_objectif"] = formation["mst_objectif"].apply(preprocessing)
formation["mst_program"] = formation["mst_program"].apply(preprocessing)
formation["mst_skills"] = formation["mst_skills"].apply(preprocessing)
formation["mst_Coord"] = formation["mst_Coord"].apply(preprocessing)

In [9]:
client = chromadb.HttpClient(host='localhost', port=8000)

In [10]:
client.create_collection(name="articles")
client.create_collection(name="clubs")
client.create_collection(name="formation")
client.create_collection(name="departement")

Exception: {"error":"UniqueConstraintError('Collection articles already exists')"}

In [11]:
articles_collection = client.get_collection(name="articles")
clubs_collection = client.get_collection(name="clubs")
formation_collection = client.get_collection(name="formation")
departement_collection = client.get_collection(name="departement")

In [18]:
import uuid

def process_and_store_embeddings(dataframe, column_names, collection):
    ids = []
    metadatas = []
    documents = []

    for idx, row in dataframe.iterrows():
        doc_metadata = { 'row_index': idx }
        for key, value in row.items():
            if key not in column_names:
                doc_metadata[key] = value  # Leave non-string fields as is
        for column in column_names:
            sentence = row[column]
            if sentence is not None and sentence != '':
                ids.append(str(uuid.uuid1()))
                metadata_with_column = doc_metadata.copy()
                metadata_with_column['column_name'] = column  # Add column name to metadata
                metadatas.append(metadata_with_column)
                documents.append(sentence) 

    collection.add(
        ids=ids,
        # embeddings=embeddings,  # Assuming embeddings are being handled elsewhere
        metadatas=metadatas,
        documents=documents
    )

# Assuming article, clubs, formation DataFrames are already preprocessed
# Store embeddings, metadata, and documents from each DataFrame
process_and_store_embeddings(article, ["post_title", "post_content"], articles_collection)
process_and_store_embeddings(clubs, ["departement_name","departement_info"], departement_collection)
process_and_store_embeddings(formation, ["mst_name", "mst_objectif", "mst_program", "mst_skills", "mst_Coord"], formation_collection)
process_and_store_embeddings(dep, ["departement_info","departement_name"], departement_collection)
print("Embeddings, metadata, and documents stored in ChromaDB successfully.")



Embeddings, metadata, and documents stored in ChromaDB successfully.


In [18]:
dep.head()

Unnamed: 0,No,departement_name,departement_info
0,1,GÉNIE INFORMATIQUE,Chef : Pr.EL BRAK Mohamed \nEmail : melbr...
1,2,GÉNIE CHIMIQUE,Chef : Pr.CHABBI Mohamed\nEmail : mchabbi@uae....
2,3,SCIENCES DE LA TERRE,Chef : Pr.BOULAASSAL Hakim\nEmail : h.boulaass...
3,4,GÉNIE MÉCANIQUE,Chef : Pr.ELAYACHI Ilham\n \nEmail : ...
4,5,SCIENCES DE LA VIE,Chef : Pr.HASSANI ZERROUK Mounir\nEmail : mhas...


In [19]:
departement_collection.query(query_texts=["infromatique"], n_results=3)

{'ids': [['5f22f31e-2001-11ef-b8be-803049a3c985',
   '55d355f6-2001-11ef-b8be-803049a3c985',
   '5f22e270-2001-11ef-b8be-803049a3c985']],
 'distances': [[1.081623868357096, 1.2692459013549697, 1.320685818058943]],
 'embeddings': None,
 'metadatas': [[{'column_name': 'departement_name', 'row_index': 8},
   {'club_link': 'https://fstt.ac.ma/Portail2023/club-enactus/',
    'column_name': 'club_name',
    'row_index': 8},
   {'column_name': 'departement_name', 'row_index': 4}]],
 'documents': [['MATHÉMATIQUES', 'Club Enactus', 'SCIENCES DE LA VIE']],
 'uris': None,
 'data': None}

In [1]:
client.delete_collection(name="text_embeddings")

NameError: name 'client' is not defined