In [1]:
from opsci_toolbox.helpers.common import load_pickle, write_pickle, write_json
from opsci_toolbox.helpers.dataviz import generate_hexadecimal_color_palette
from tqdm import tqdm
%load_ext cudf.pandas
import pandas as pd
import pytz
from datetime import datetime
import re
from opsci_toolbox.helpers.nlp import *

## Fonction intermédiaire pour générer les clés de jointure
def generate_index(df, col_author_id ='author_id', col_date='created_time'):
    res=[]
    for i, row in tqdm(df.iterrows(), total=df.shape[0], desc="generation des index"): 
        new_index=".".join([ str(i) for i in [ row[col_author_id], row[col_date].year, row[col_date].month, row[col_date].day]])
        res.append(new_index)
    df["index"]=res
    
    return df
                     
def avg_performance(df, 
                    col_date='created_time', 
                    col_author_id='author_id', 
                    col_engagement=['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha', 
                                    'sad', 'angry','total_engagement', 'replies', 'percentage_replies'], 
                    rolling_period='7D'):
                     
    # Nettoyage au cas où
    df[col_date] = pd.to_datetime(df[col_date]) 
    df = df.sort_values([col_author_id, col_date]) 

    # Le point central c'est la colone created_time, on la met en index.
    # Ensuite on groupe par author_id en gardant les colonnes de valeurs.
    # On applique la moyenne mean sur un rolling tous les 2 jours. Automatiquement il va prendre l'index, ici created_time comme pivot. 
    # On met tout à plat
    average = df.set_index(col_date).groupby(col_author_id)[col_engagement].rolling(rolling_period).mean(numeric_only=True).reset_index()
                     
    # Sur les résultats précédent, on simplifie pour récupérer une liste avec juste la liste jour / author_id
    average = average.set_index(col_date).groupby([col_author_id]).resample('1D').last(numeric_only=True).reset_index()

    # On génère nos supers index
    df=generate_index(df, col_author_id =col_author_id, col_date=col_date)    
    
    average = generate_index(average, col_author_id = col_author_id, col_date=col_date)

    # On fusionne 
    df = pd.merge(df, average[['index']+col_engagement], how='left', on=['index'], suffixes=('', '_avg'))
    
    return df

def kpi_reaction(df, cols):
    """
    Cette fonction prend un dataframe et une liste de colonnes en entrée.
    Pour chaque colonne, on va calculer le taux de sur-réaction.
    """
    for col in cols:
        df['tx_'+col]=(df[col]-df[col+'_avg'])/(df[col]+df[col+'_avg'])
    return df

def get_reactions_type(df, cols, col_dest):
    all_val=[]
    
    for i,row in tqdm(df.iterrows(), total=df.shape[0], desc="qualification des posts"):
        
        str_val=''
        count=0
        for col in cols:
            if row[col]>0:
                str_val=str_val+' '+col.replace('tx_', 'sur-')
                count=count+1
        if count==0:
            str_val="sous reaction"
        if count==len(cols):
            str_val="sur reaction totale"
            
        all_val.append(str_val.strip())
            
    df[col_dest]=all_val       
    return df

def remove_brackets(text, replacement=""):
    # pattern = r'\b(?:train|ter|eurostar|t[h]?al[iy]s|tgv|intercité[s]?|transilien[s]?)*\s*?(n[°]?|num[ée]ro[s]?|num)?\s*?\d+\b'
    pattern = r'\[.*?\]'
    result = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return result


2024-05-15 10:50:42.938159: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-15 10:50:42.970931: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# col_user_id = "origin_id1"
# col_post_id = "uniq_id"
col_date = "message_date"


cols_sureaction_metrics = ['views', 'engagements', 'total_reactions', 'replies_count', 'forwards']

cols_typologie_sureaction=['views', "total_reactions", "forwards"]

rolling_period_sureaction = '7D'



spacy_lang = "en"                                       #language of the stopwords
spacy_model = "en_core_web_lg"                         # spacy model to import : ru_core_news_lg, en_core_web_lg, fr_core_news_lg
pos_to_keep = ["VERB","NOUN","ADJ", "ADV", "PROPN"] 

In [1]:
from datetime import datetime
import pytz
from opsci_toolbox.helpers.common import load_csv
import pandas as pd


start_date = datetime(2024, 1, 1, tzinfo=pytz.UTC) 
end_date = datetime(2024, 4, 1, tzinfo=pytz.UTC)
cols_to_keep_telegram = ['uniq_id', 'channel_id', "channel", 'message_date', 'text', 'translated_text', "views", "forwards", "replies_count", "total_reactions", "engagements"]


df_telegram = pd.read_pickle("data/df.pickle")
df_telegram = df_telegram.drop_duplicates(subset="uniq_id")


df_telegram = df_telegram[(df_telegram['datetime'] >= start_date) & (df_telegram['datetime'] <= end_date)]

df_telegram = df_telegram[cols_to_keep_telegram]
df_telegram['plateforme'] = "Telegram"

df_telegram=df_telegram.rename(columns={
    'uniq_id' : "message_id", 
    'channel_id' : "user_id", 
    "channel" : "user_name", 
    'message_date' : "date", 
    "forwards" : "share", 
    "replies_count" : "comments", 
    "total_reactions" : "likes"
})

df_telegram['date'] = df_telegram['date'].dt.strftime('%Y-%m-%d %H:%M:%S')

##############################################

cols_to_keep_twitter = ["tweet_id", "user_id", "user_username", "creation_date", "text", "views",  "retweet_count", "reply_count", "favorite_count", "engagements"]

df_twitter = load_csv('/home/erwan/scripts/rapidapi/PR_army_patriotes/timelines.csv')
df_twitter = df_twitter.drop_duplicates(subset="tweet_id")
df_twitter['engagements'] = df_twitter["retweet_count"].fillna(0) + df_twitter["reply_count"].fillna(0)+ df_twitter["favorite_count"].fillna(0) + df_twitter["quote_count"].fillna(0)
df_twitter=df_twitter[cols_to_keep_twitter]
df_twitter['plateforme'] = "Twitter"


start_date = datetime(2024, 1, 1, tzinfo=pytz.UTC) 
end_date = datetime(2024, 4, 1, tzinfo=pytz.UTC)

df_twitter["datetime"]= pd.to_datetime(df_twitter["creation_date"])
df_twitter = df_twitter[(df_twitter['datetime'] >= start_date) & (df_twitter['datetime'] <= end_date)]
df_twitter['date'] = df_twitter['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

df_twitter['views'] = df_twitter['views'].fillna('0').str.replace(',0','').astype(int)
df_twitter=df_twitter.rename(columns={
    'tweet_id' : "message_id", 
    "user_username" : "user_name", 
    "retweet_count" : "share", 
    "reply_count" : "comments", 
    "favorite_count" : "likes"
})

df_twitter['translated_text']=""
df_twitter.drop(columns=['datetime', 'creation_date'], inplace = True)

df = pd.concat([df_telegram, df_twitter])
df = df[df['text'].str.len()>0]
df = df.reset_index(drop=True)

  df = pd.read_csv(path, delimiter=delimiter, encoding="utf-8", decimal=decimal)


In [None]:
import requests

def get_tweet_html(username, tweet_id):
    url = f'https://publish.twitter.com/oembed?url=https://twitter.com/{username}/status/{tweet_id}'
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        html = data.get('html')
        return html
    else:
        print(tweet_id, "Failed to fetch data from Twitter.")
        return None
    

df_twitter['tweet_html'] = df_twitter.apply(lambda row: get_tweet_html(row['user_name'], row['message_id']), axis=1)


In [10]:
df = pd.concat([df_telegram, df_twitter])
df = df[df['text'].str.len()>0]
df = df.reset_index(drop=True)

In [8]:
df_twitter_2 = df_twitter.copy()

In [9]:
df_twitter_2['tweet_html'] = df_twitter_2['tweet_html'].fillna(df_twitter_2['text'])
df_twitter_2

Unnamed: 0,message_id,user_id,user_name,text,views,share,comments,likes,engagements,plateforme,date,translated_text,tweet_html
857,1774369358208213478,1419254154976014336,AvocatduPeuple,"[Théologie] Une question me taraude, Christ me...",6529,2,39,52,94,Twitter,2024-03-31 09:33:15,,"<blockquote class=""twitter-tweet""><p lang=""fr""..."
858,1773634480751694056,1419254154976014336,AvocatduPeuple,Ah bin c’est une bonne question ça ! Pourquoi ...,16382,274,87,511,879,Twitter,2024-03-29 08:53:07,,"<blockquote class=""twitter-tweet""><p lang=""fr""..."
859,1773621153866678588,1419254154976014336,AvocatduPeuple,Vendredi Saint 9h Christ est crucifié 🕯✝️🕯,1443,7,8,68,84,Twitter,2024-03-29 08:00:10,,"<blockquote class=""twitter-tweet""><p lang=""fr""..."
860,1773452412369457400,1419254154976014336,AvocatduPeuple,J’assistais ce jour aux obsèques du père d’un ...,1567,3,10,147,160,Twitter,2024-03-28 20:49:38,,"<blockquote class=""twitter-tweet""><p lang=""fr""..."
861,1773258706639098358,1419254154976014336,AvocatduPeuple,"En ce jeudi saint, Même le #CielDeMerde fait l...",1340,10,5,59,75,Twitter,2024-03-28 07:59:55,,"<blockquote class=""twitter-tweet""><p lang=""fr""..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58551,1746568105797578850,1406587578791387136,laziza75715398,🌷 Be happy 🌷🤍💋 https://t.co/Ps4oz3xbQG,6027,17,20,262,299,Twitter,2024-01-14 16:21:00,,"<blockquote class=""twitter-tweet""><p lang=""en""..."
58552,1746496495870681155,1406587578791387136,laziza75715398,Bon après-midi ❤️💋 https://t.co/UaQPH3TqzK,5429,19,25,282,326,Twitter,2024-01-14 11:36:27,,"<blockquote class=""twitter-tweet""><p lang=""fr""..."
58553,1746455889052152066,1406587578791387136,laziza75715398,Bonjour ☕️ 💋 https://t.co/EJtlkuXzpl,4764,5,29,119,153,Twitter,2024-01-14 08:55:06,,"<blockquote class=""twitter-tweet""><p lang=""fr""..."
58554,1746263823743430685,1406587578791387136,laziza75715398,Bonne soirée 🤍💋 https://t.co/bptMqfOXtA,9683,42,35,526,604,Twitter,2024-01-13 20:11:54,,Bonne soirée 🤍💋 https://t.co/bptMqfOXtA


In [13]:
from FlagEmbedding import BGEM3FlagModel

model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True) 

embeddings = model.encode(list(df["text"]), 
                            batch_size=12, 
                            max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
                            )['dense_vecs']

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Inference Embeddings: 100%|██████████| 2651/2651 [30:05<00:00,  1.47it/s]  


In [1]:
from opsci_toolbox.helpers.common import write_pickle, load_pickle

In [14]:
df['embeddings']= embeddings.tolist()
write_pickle(df, "data", "df_prod_chroma_v2")

'data/df_prod_chroma_v2.pickle'

: 

In [2]:
d = load_pickle("data/df_prod_chroma_v2.pickle")

In [12]:
d_france = d[(d["translated_text"].str.lower().str.contains("france")) & (d["plateforme"]=="Telegram")]



from opsci_toolbox.helpers.common import save_dataframe_excel
save_dataframe_excel(d_france, "/home/erwan/scripts/st_pr", "france_telegram", "messages")

/home/erwan/scripts/st_pr/france_telegram.xlsx - File created


'/home/erwan/scripts/st_pr/france_telegram.xlsx'

In [18]:
df_twitter.head().to_clipboard()

In [5]:
df["date"]=df["datetime"].dt.date

df= avg_performance(
    df, 
    col_date=col_date, 
    col_author_id="channel", 
    col_engagement= cols_sureaction_metrics, 
    rolling_period=rolling_period_sureaction
    ) 

# on calcule les taux de sur-réaction pour notre liste de métriques
df=kpi_reaction(df, cols_sureaction_metrics)
cols_tx_engagement=['tx_'+c for c in cols_sureaction_metrics]
df[cols_tx_engagement]=df[cols_tx_engagement].fillna(-1)


# on supprime nos colonnes contenant la performance moyenne (on ne devrait plus en avoir besoin)
cols_to_drop = [c for c in df.columns if c.lower()[-4:] == '_avg']
df.drop(columns=cols_to_drop, inplace=True)

# on catégorise les formes de réaction
cols_typologie = ["tx_"+ col for col in cols_typologie_sureaction]
df=get_reactions_type(df, cols_typologie, 'type_engagement')


generation des index: 100%|██████████| 25111/25111 [00:01<00:00, 18482.28it/s]
generation des index: 100%|██████████| 1538/1538 [00:00<00:00, 18530.08it/s]
qualification des posts: 100%|██████████| 25111/25111 [00:01<00:00, 19922.11it/s]


In [6]:
df_stopwords = load_stopwords_df(spacy_lang)
stopwords = df_stopwords['word'].to_list()

nlp = load_spacy_model(spacy_model,  disable_components=["transformer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect=False, emoji=True)

# basic precleaning of text 
print("TEXT PRECLEANING")
df = TM_clean_text(df, "translated_text", "clean_text")

df["clean_text"] = df["clean_text"].apply(remove_brackets)
df["clean_text"] = df["clean_text"].apply(remove_extra_spaces)

# lemmatize text, remove stop words and keep only some PoS
print("NLP PROCESS")
df = TM_nlp_process(nlp, df, "clean_text", "lemmatized_text", pos_to_keep, stopwords, batch_size=100, n_process=1, stats=False, join_list = True) 

TEXT PRECLEANING
NLP PROCESS


NLP Process: 100%|██████████| 25111/25111 [03:14<00:00, 128.92it/s]


In [7]:
reaction_color_palette = generate_hexadecimal_color_palette(df["type_engagement"].unique())
channel_color_palette = generate_hexadecimal_color_palette(df["channel"].unique())

df["channel_color"]=df["channel"].map(channel_color_palette)
df["surreaction_color"]=df["type_engagement"].map(reaction_color_palette)
write_json(channel_color_palette, "data", "channel_color_palette")
write_json(reaction_color_palette, "data", "reaction_color_palette")

'data/reaction_color_palette.json'

In [8]:
from FlagEmbedding import BGEM3FlagModel

model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

col_text = "text"

lst_text = list(df[col_text])

embeddings = model.encode(lst_text, 
                            batch_size=12, 
                            max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
                            )['dense_vecs']

df['embeddings'] = embeddings.tolist()
df["plateforme"] = "Telegram"
write_pickle(df, "data", "df_prod_chroma")

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()
Inference Embeddings:   8%|▊         | 168/2093 [01:09<14:43,  2.18it/s]

: 

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = "BAAI/bge-m3"                 #"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "BAAI/bge-m3",  "DeepPavlov/rubert-base-cased"
model_kwargs = {'device': 'cuda:0'}
encode_kwargs = {"batch_size":2}

HF_encoder = load_HF_embeddings(embedding_model, encode_kwargs, model_kwargs)

# HF_encoder = HuggingFaceEmbeddings(model_name='BAAI/bge-m3')

embeddings = HF_encoder.embed_documents(['ukraine'])



In [11]:
len(embeddings[0])

1024

: 

In [18]:
embedding_model = "BAAI/bge-m3"                 #"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "BAAI/bge-m3",  "DeepPavlov/rubert-base-cased"
model_kwargs = {'device': 'cuda:0'}
encode_kwargs = {"batch_size":2}

HF_encoder = load_HF_embeddings(embedding_model, encode_kwargs, model_kwargs)

embeddings = HF_vectorize(HF_encoder, df["translated_text"])

df['embeddings'] = embeddings

cols_to_keep = ['origin_id1', 'channel_id', "channel", "channel_description", 'message_id', "uniq_id", "message_date", "date", "datetime", "text", "translated_text", 
                "is_reply", 'views', 'forwards', 'replies_count', 'total_reactions', "engagements", "sentiment", 'tx_views',
                'tx_engagements', 'tx_total_reactions', 'tx_replies_count',
                'tx_forwards', 'type_engagement', "channel_color", "surreaction_color", "lemmatized_text", "embeddings"]


df=df[cols_to_keep]

write_pickle(df, "data", "df_prod_chroma")

'data/df_prod_chroma.pickle'

In [3]:
from opsci_toolbox.helpers.nlp import spacy_NER
spacy_lang = "ru"                                       #language of the stopwords
spacy_model = "ru_core_news_lg"                         # spacy model to import : ru_core_news_lg, en_core_web_lg, fr_core_news_lg
pos_to_keep = ["VERB","NOUN","ADJ", "ADV", "PROPN"] 

nlp = load_spacy_model(spacy_model,  disable_components=["transformer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect=False, emoji=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['NER_type'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['NER_text'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['NER_start_char'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [5]:

df_ner = spacy_NER(nlp, df, "text", entities_to_keep=['PER','ORG', 'LOC'], explode= True)

In [4]:
df_ner

Unnamed: 0,origin_id1,origin_id2,channel_id,message_id,message_date,text,raw_text,is_reply,mentioned,post,...,domain,sentiment,sentiment_score,datetime,channel_description,channel,NER_type,NER_text,NER_start_char,NER_end_char
0,-1001645380944,1645380944,1645380944,15492,2024-04-24 12:08:01+00:00,🇷🇺**Защита планирует обжаловать арест замминис...,🇷🇺Защита планирует обжаловать арест замминистр...,True,False,True,...,[],neutral,-0.108559,2024-04-24 12:08:01+00:00,The whole truth about Western “democracy” from...,Fox News Russia,PER,Иванова,58,65
0,-1001645380944,1645380944,1645380944,15492,2024-04-24 12:08:01+00:00,🇷🇺**Защита планирует обжаловать арест замминис...,🇷🇺Защита планирует обжаловать арест замминистр...,True,False,True,...,[],neutral,-0.108559,2024-04-24 12:08:01+00:00,The whole truth about Western “democracy” from...,Fox News Russia,PER,Денис Балуев,190,202
0,-1001645380944,1645380944,1645380944,15492,2024-04-24 12:08:01+00:00,🇷🇺**Защита планирует обжаловать арест замминис...,🇷🇺Защита планирует обжаловать арест замминистр...,True,False,True,...,[],neutral,-0.108559,2024-04-24 12:08:01+00:00,The whole truth about Western “democracy” from...,Fox News Russia,PER,Иванова,231,238
0,-1001645380944,1645380944,1645380944,15492,2024-04-24 12:08:01+00:00,🇷🇺**Защита планирует обжаловать арест замминис...,🇷🇺Защита планирует обжаловать арест замминистр...,True,False,True,...,[],neutral,-0.108559,2024-04-24 12:08:01+00:00,The whole truth about Western “democracy” from...,Fox News Russia,PER,Тимур Иванов,327,339
1,-1001645380944,1645380944,1645380944,15491,2024-04-24 11:30:01+00:00,🇫🇮**Президент Финляндии призвал готовиться к в...,🇫🇮Президент Финляндии призвал готовиться к вой...,False,False,True,...,[],positive,0.436274,2024-04-24 11:30:01+00:00,The whole truth about Western “democracy” from...,Fox News Russia,LOC,Финляндии,14,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,-1001645380944,1645380944,1645380944,15385,2024-04-20 11:30:01+00:00,**🇺🇦Зеленский запретил солдатам ВСУ играть в ...,🇺🇦Зеленский запретил солдатам ВСУ играть в аз...,False,False,True,...,[],negative,-0.856492,2024-04-20 11:30:01+00:00,The whole truth about Western “democracy” from...,Fox News Russia,ORG,Рады,579,583
98,-1001645380944,1645380944,1645380944,15385,2024-04-20 11:30:01+00:00,**🇺🇦Зеленский запретил солдатам ВСУ играть в ...,🇺🇦Зеленский запретил солдатам ВСУ играть в аз...,False,False,True,...,[],negative,-0.856492,2024-04-20 11:30:01+00:00,The whole truth about Western “democracy” from...,Fox News Russia,PER,Гончаренко,584,594
98,-1001645380944,1645380944,1645380944,15385,2024-04-20 11:30:01+00:00,**🇺🇦Зеленский запретил солдатам ВСУ играть в ...,🇺🇦Зеленский запретил солдатам ВСУ играть в аз...,False,False,True,...,[],negative,-0.856492,2024-04-20 11:30:01+00:00,The whole truth about Western “democracy” from...,Fox News Russia,ORG,ВСУ,631,634
98,-1001645380944,1645380944,1645380944,15385,2024-04-20 11:30:01+00:00,**🇺🇦Зеленский запретил солдатам ВСУ играть в ...,🇺🇦Зеленский запретил солдатам ВСУ играть в аз...,False,False,True,...,[],negative,-0.856492,2024-04-20 11:30:01+00:00,The whole truth about Western “democracy” from...,Fox News Russia,PER,Зеленскому,742,752


In [5]:
import chromadb

chroma_client = chromadb.PersistentClient(path="/home/erwan/scripts/bertopic/chroma")



embedding_model = "BAAI/bge-m3"                 #"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "BAAI/bge-m3",  "DeepPavlov/rubert-base-cased"
model_kwargs = {'device': 'cuda:0'}
encode_kwargs = {'batch_size':32}

df["date"] = df["datetime"].dt.strftime("%Y-%m-%d")

cols_metadata = ['origin_id1', 'channel_id', "channel", 'message_id', "date", "is_reply", 'views', 'forwards', 'replies_count', 'total_reactions', "engagements", "sentiment", 'tx_views', 'tx_engagements', 'tx_total_reactions', 'tx_replies_count', 'tx_forwards', 'type_engagement', "translated_text"]
col_text = "text"
col_id = "uniq_id"

collection = chroma_client.create_collection(name="my_collection")

df_filter = df.sample(10)
lst_text = list(df_filter[col_text])
lst_ids = list(df_filter[col_id])
metadatas =  df_filter[cols_metadata].to_dict(orient="records")


collection.add(
    embeddings=embeddings,
    documents=lst_text,
    metadatas=metadatas,
    ids=lst_ids
)

: 